{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6220839813374806, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006220839813374805, "grad_norm": 75.29773712158203, "learning_rate": 5.0000000000000004e-08, "logits/chosen": -177.85220336914062, "logits/rejected": 61.4637451171875, "logps/chosen": -355.3916015625, "logps/rejected": -539.6513671875, "loss": 1.2554, "rewards/accuracies": 0.375, "rewards/chosen": -1.0287162065505981, "rewards/margins": -0.3372078239917755, "rewards/rejected": -0.6915084719657898, "step": 1 }, { "epoch": 0.001244167962674961, "grad_norm": 80.6917953491211, "learning_rate": 1.0000000000000001e-07, "logits/chosen": -23.33196258544922, "logits/rejected": 71.71320343017578, "logps/chosen": -398.1117858886719, "logps/rejected": -580.71630859375, "loss": 1.6137, "rewards/accuracies": 0.375, "rewards/chosen": -0.7210732102394104, "rewards/margins": -0.8189607858657837, "rewards/rejected": 0.09788760542869568, "step": 2 }, { "epoch": 0.0018662519440124418, "grad_norm": 229.31642150878906, "learning_rate": 1.5000000000000002e-07, "logits/chosen": -70.74639129638672, "logits/rejected": 93.88347625732422, "logps/chosen": -465.1885986328125, "logps/rejected": -692.7140502929688, "loss": 2.0668, "rewards/accuracies": 0.375, "rewards/chosen": -0.14922478795051575, "rewards/margins": -1.333322525024414, "rewards/rejected": 1.1840977668762207, "step": 3 }, { "epoch": 0.002488335925349922, "grad_norm": 43.34788513183594, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -137.68182373046875, "logits/rejected": 18.784770965576172, "logps/chosen": -248.25372314453125, "logps/rejected": -433.6686706542969, "loss": 1.0693, "rewards/accuracies": 0.5, "rewards/chosen": -0.5562701225280762, "rewards/margins": -0.02385607361793518, "rewards/rejected": -0.5324140787124634, "step": 4 }, { "epoch": 0.003110419906687403, "grad_norm": 69.38472747802734, "learning_rate": 2.5000000000000004e-07, "logits/chosen": 21.578540802001953, "logits/rejected": -12.377201080322266, "logps/chosen": -562.13623046875, "logps/rejected": -494.3175048828125, "loss": 2.0564, "rewards/accuracies": 0.5, "rewards/chosen": -0.15131518244743347, "rewards/margins": -1.121817946434021, "rewards/rejected": 0.9705027341842651, "step": 5 }, { "epoch": 0.0037325038880248835, "grad_norm": 75.96995544433594, "learning_rate": 3.0000000000000004e-07, "logits/chosen": -116.01171112060547, "logits/rejected": 14.119502067565918, "logps/chosen": -1148.569091796875, "logps/rejected": -2129.08203125, "loss": 1.3099, "rewards/accuracies": 0.75, "rewards/chosen": 2.4665775299072266, "rewards/margins": 1.6689155101776123, "rewards/rejected": 0.7976619005203247, "step": 6 }, { "epoch": 0.004354587869362364, "grad_norm": 922.5938110351562, "learning_rate": 3.5000000000000004e-07, "logits/chosen": -136.36256408691406, "logits/rejected": 26.24632453918457, "logps/chosen": -497.417724609375, "logps/rejected": -865.60546875, "loss": 2.779, "rewards/accuracies": 0.5, "rewards/chosen": 0.6914694905281067, "rewards/margins": -1.1793031692504883, "rewards/rejected": 1.8707724809646606, "step": 7 }, { "epoch": 0.004976671850699844, "grad_norm": 44.46845245361328, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -11.92451286315918, "logits/rejected": -3.7937088012695312, "logps/chosen": -444.5755615234375, "logps/rejected": -395.83831787109375, "loss": 0.9223, "rewards/accuracies": 0.5, "rewards/chosen": -0.04211881756782532, "rewards/margins": 0.26514172554016113, "rewards/rejected": -0.30726051330566406, "step": 8 }, { "epoch": 0.005598755832037325, "grad_norm": 43.450775146484375, "learning_rate": 4.5000000000000003e-07, "logits/chosen": -139.44090270996094, "logits/rejected": 32.01319122314453, "logps/chosen": -318.8926086425781, "logps/rejected": -431.93017578125, "loss": 0.6561, "rewards/accuracies": 0.5, "rewards/chosen": 0.07980266213417053, "rewards/margins": 0.22137239575386047, "rewards/rejected": -0.14156973361968994, "step": 9 }, { "epoch": 0.006220839813374806, "grad_norm": 39.409461975097656, "learning_rate": 5.000000000000001e-07, "logits/chosen": -54.87480163574219, "logits/rejected": 13.133830070495605, "logps/chosen": -403.47540283203125, "logps/rejected": -510.01226806640625, "loss": 0.5046, "rewards/accuracies": 0.875, "rewards/chosen": 1.2105703353881836, "rewards/margins": 0.9501632452011108, "rewards/rejected": 0.260407030582428, "step": 10 }, { "epoch": 0.006842923794712286, "grad_norm": 64.99250793457031, "learning_rate": 5.5e-07, "logits/chosen": -136.03424072265625, "logits/rejected": 39.76721954345703, "logps/chosen": -422.19256591796875, "logps/rejected": -524.5418701171875, "loss": 0.8112, "rewards/accuracies": 0.5, "rewards/chosen": -1.3280785083770752, "rewards/margins": 0.3193666338920593, "rewards/rejected": -1.6474450826644897, "step": 11 }, { "epoch": 0.007465007776049767, "grad_norm": 41.89508819580078, "learning_rate": 6.000000000000001e-07, "logits/chosen": -75.12355041503906, "logits/rejected": 50.2412109375, "logps/chosen": -318.134521484375, "logps/rejected": -429.7762145996094, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": -0.010226830840110779, "rewards/margins": 1.1151671409606934, "rewards/rejected": -1.1253938674926758, "step": 12 }, { "epoch": 0.008087091757387248, "grad_norm": 59.16816711425781, "learning_rate": 6.5e-07, "logits/chosen": -79.24116516113281, "logits/rejected": 132.34188842773438, "logps/chosen": -288.0714111328125, "logps/rejected": -442.1520690917969, "loss": 1.0391, "rewards/accuracies": 0.375, "rewards/chosen": -0.3003309369087219, "rewards/margins": -0.26002201437950134, "rewards/rejected": -0.04030896723270416, "step": 13 }, { "epoch": 0.008709175738724729, "grad_norm": 163.5299072265625, "learning_rate": 7.000000000000001e-07, "logits/chosen": -162.06820678710938, "logits/rejected": -0.0832672119140625, "logps/chosen": -276.1287536621094, "logps/rejected": -581.9349365234375, "loss": 1.6963, "rewards/accuracies": 0.375, "rewards/chosen": 0.7750052809715271, "rewards/margins": -0.8691689372062683, "rewards/rejected": 1.6441740989685059, "step": 14 }, { "epoch": 0.00933125972006221, "grad_norm": 48.502872467041016, "learning_rate": 7.5e-07, "logits/chosen": -84.56794738769531, "logits/rejected": 87.51420593261719, "logps/chosen": -283.99798583984375, "logps/rejected": -397.23065185546875, "loss": 0.8379, "rewards/accuracies": 0.625, "rewards/chosen": 0.023177146911621094, "rewards/margins": 0.1964801400899887, "rewards/rejected": -0.1733030378818512, "step": 15 }, { "epoch": 0.009953343701399688, "grad_norm": 813.075439453125, "learning_rate": 8.000000000000001e-07, "logits/chosen": -104.10598754882812, "logits/rejected": -27.064477920532227, "logps/chosen": -420.6732177734375, "logps/rejected": -1304.482177734375, "loss": 3.3858, "rewards/accuracies": 0.375, "rewards/chosen": 0.16808362305164337, "rewards/margins": -2.892519235610962, "rewards/rejected": 3.060602903366089, "step": 16 }, { "epoch": 0.010575427682737169, "grad_norm": 30.26862907409668, "learning_rate": 8.500000000000001e-07, "logits/chosen": -159.76206970214844, "logits/rejected": -38.212928771972656, "logps/chosen": -299.5554504394531, "logps/rejected": -416.158203125, "loss": 0.8113, "rewards/accuracies": 0.5, "rewards/chosen": 0.44327592849731445, "rewards/margins": 0.563764214515686, "rewards/rejected": -0.12048837542533875, "step": 17 }, { "epoch": 0.01119751166407465, "grad_norm": 51.042213439941406, "learning_rate": 9.000000000000001e-07, "logits/chosen": -127.75035095214844, "logits/rejected": -25.308212280273438, "logps/chosen": -650.95751953125, "logps/rejected": -1018.9414672851562, "loss": 1.553, "rewards/accuracies": 0.375, "rewards/chosen": -1.552620530128479, "rewards/margins": 0.4592360854148865, "rewards/rejected": -2.0118565559387207, "step": 18 }, { "epoch": 0.01181959564541213, "grad_norm": 56.81817626953125, "learning_rate": 9.500000000000001e-07, "logits/chosen": -94.1162109375, "logits/rejected": 47.61082458496094, "logps/chosen": -401.20465087890625, "logps/rejected": -496.0693359375, "loss": 1.1, "rewards/accuracies": 0.75, "rewards/chosen": 0.3011024296283722, "rewards/margins": 0.1680152714252472, "rewards/rejected": 0.133087158203125, "step": 19 }, { "epoch": 0.012441679626749611, "grad_norm": 56.192413330078125, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -117.42288208007812, "logits/rejected": 47.06532287597656, "logps/chosen": -428.161376953125, "logps/rejected": -588.315673828125, "loss": 0.7544, "rewards/accuracies": 0.625, "rewards/chosen": 1.139656901359558, "rewards/margins": 0.22241297364234924, "rewards/rejected": 0.9172439575195312, "step": 20 }, { "epoch": 0.013063763608087092, "grad_norm": 68.3932876586914, "learning_rate": 1.0500000000000001e-06, "logits/chosen": -96.40730285644531, "logits/rejected": 0.16573667526245117, "logps/chosen": -445.1815185546875, "logps/rejected": -571.5523681640625, "loss": 2.2811, "rewards/accuracies": 0.125, "rewards/chosen": 0.4818965196609497, "rewards/margins": -1.7519872188568115, "rewards/rejected": 2.2338836193084717, "step": 21 }, { "epoch": 0.013685847589424573, "grad_norm": 80.41062927246094, "learning_rate": 1.1e-06, "logits/chosen": -100.3754653930664, "logits/rejected": 13.138660430908203, "logps/chosen": -286.0962829589844, "logps/rejected": -401.8005676269531, "loss": 1.6501, "rewards/accuracies": 0.25, "rewards/chosen": -0.6160720586776733, "rewards/margins": -0.9672492742538452, "rewards/rejected": 0.3511772155761719, "step": 22 }, { "epoch": 0.014307931570762053, "grad_norm": 63.30597686767578, "learning_rate": 1.1500000000000002e-06, "logits/chosen": -84.35729217529297, "logits/rejected": 10.734075546264648, "logps/chosen": -1314.2762451171875, "logps/rejected": -1399.5050048828125, "loss": 2.1339, "rewards/accuracies": 0.5, "rewards/chosen": -4.486008644104004, "rewards/margins": 0.26387083530426025, "rewards/rejected": -4.749879837036133, "step": 23 }, { "epoch": 0.014930015552099534, "grad_norm": 125.71233367919922, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -104.85993957519531, "logits/rejected": 18.75521469116211, "logps/chosen": -1213.993896484375, "logps/rejected": -1906.4573974609375, "loss": 0.3398, "rewards/accuracies": 0.875, "rewards/chosen": -10.290596008300781, "rewards/margins": 9.480679512023926, "rewards/rejected": -19.77127456665039, "step": 24 }, { "epoch": 0.015552099533437015, "grad_norm": 40.78514862060547, "learning_rate": 1.25e-06, "logits/chosen": -163.934814453125, "logits/rejected": -40.20934295654297, "logps/chosen": -199.8103790283203, "logps/rejected": -298.68975830078125, "loss": 0.9414, "rewards/accuracies": 0.75, "rewards/chosen": 0.24528798460960388, "rewards/margins": 0.40989506244659424, "rewards/rejected": -0.16460704803466797, "step": 25 }, { "epoch": 0.016174183514774496, "grad_norm": 18.37883949279785, "learning_rate": 1.3e-06, "logits/chosen": -16.484825134277344, "logits/rejected": 12.064239501953125, "logps/chosen": -272.5722961425781, "logps/rejected": -1154.9356689453125, "loss": 0.3293, "rewards/accuracies": 0.875, "rewards/chosen": 0.4440985321998596, "rewards/margins": 10.712531089782715, "rewards/rejected": -10.2684326171875, "step": 26 }, { "epoch": 0.016796267496111975, "grad_norm": 40.317657470703125, "learning_rate": 1.3500000000000002e-06, "logits/chosen": -38.42012405395508, "logits/rejected": -63.222984313964844, "logps/chosen": -451.6551513671875, "logps/rejected": -429.0662841796875, "loss": 1.032, "rewards/accuracies": 0.625, "rewards/chosen": 1.1528089046478271, "rewards/margins": 0.10592526197433472, "rewards/rejected": 1.0468835830688477, "step": 27 }, { "epoch": 0.017418351477449457, "grad_norm": 478.78131103515625, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -128.89784240722656, "logits/rejected": -21.452865600585938, "logps/chosen": -407.65185546875, "logps/rejected": -1335.2532958984375, "loss": 0.7879, "rewards/accuracies": 0.5, "rewards/chosen": 0.9539302587509155, "rewards/margins": 0.13599133491516113, "rewards/rejected": 0.817939043045044, "step": 28 }, { "epoch": 0.018040435458786936, "grad_norm": 581.1535034179688, "learning_rate": 1.45e-06, "logits/chosen": -103.2979507446289, "logits/rejected": -32.59198760986328, "logps/chosen": -916.250244140625, "logps/rejected": -1380.39404296875, "loss": 0.4149, "rewards/accuracies": 0.75, "rewards/chosen": -2.780393600463867, "rewards/margins": 1.7255855798721313, "rewards/rejected": -4.505979537963867, "step": 29 }, { "epoch": 0.01866251944012442, "grad_norm": 1151.39501953125, "learning_rate": 1.5e-06, "logits/chosen": -12.620857238769531, "logits/rejected": 144.8692169189453, "logps/chosen": -1457.410888671875, "logps/rejected": -1462.5665283203125, "loss": 4.2792, "rewards/accuracies": 0.375, "rewards/chosen": -11.171064376831055, "rewards/margins": -3.0617105960845947, "rewards/rejected": -8.109354019165039, "step": 30 }, { "epoch": 0.019284603421461897, "grad_norm": 426.7294921875, "learning_rate": 1.5500000000000002e-06, "logits/chosen": -75.78508758544922, "logits/rejected": -13.577807426452637, "logps/chosen": -1253.0411376953125, "logps/rejected": -1222.6817626953125, "loss": 3.9401, "rewards/accuracies": 0.5, "rewards/chosen": -6.0736188888549805, "rewards/margins": -3.1913158893585205, "rewards/rejected": -2.882302761077881, "step": 31 }, { "epoch": 0.019906687402799376, "grad_norm": 19.135032653808594, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -158.91610717773438, "logits/rejected": 59.26982116699219, "logps/chosen": -213.5504913330078, "logps/rejected": -401.37286376953125, "loss": 0.3081, "rewards/accuracies": 0.875, "rewards/chosen": 1.2317920923233032, "rewards/margins": 1.5202338695526123, "rewards/rejected": -0.28844189643859863, "step": 32 }, { "epoch": 0.02052877138413686, "grad_norm": 17.474164962768555, "learning_rate": 1.6500000000000003e-06, "logits/chosen": -87.76173400878906, "logits/rejected": 51.22368621826172, "logps/chosen": -312.44091796875, "logps/rejected": -457.1952209472656, "loss": 0.1888, "rewards/accuracies": 0.875, "rewards/chosen": 0.9675540328025818, "rewards/margins": 2.397036075592041, "rewards/rejected": -1.4294819831848145, "step": 33 }, { "epoch": 0.021150855365474338, "grad_norm": 20.996366500854492, "learning_rate": 1.7000000000000002e-06, "logits/chosen": -95.53240966796875, "logits/rejected": 45.29603576660156, "logps/chosen": -694.0944213867188, "logps/rejected": -1572.3394775390625, "loss": 0.2843, "rewards/accuracies": 0.875, "rewards/chosen": 1.9569804668426514, "rewards/margins": 14.476667404174805, "rewards/rejected": -12.51968765258789, "step": 34 }, { "epoch": 0.02177293934681182, "grad_norm": 194.36459350585938, "learning_rate": 1.75e-06, "logits/chosen": 20.440092086791992, "logits/rejected": 20.325443267822266, "logps/chosen": -884.8192749023438, "logps/rejected": -840.1112060546875, "loss": 0.6413, "rewards/accuracies": 0.5, "rewards/chosen": -7.4156084060668945, "rewards/margins": 0.7475752830505371, "rewards/rejected": -8.163183212280273, "step": 35 }, { "epoch": 0.0223950233281493, "grad_norm": 51.03216552734375, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -88.91024780273438, "logits/rejected": 43.20893859863281, "logps/chosen": -238.1575927734375, "logps/rejected": -401.048828125, "loss": 1.1353, "rewards/accuracies": 0.375, "rewards/chosen": 0.2638757824897766, "rewards/margins": -0.15922489762306213, "rewards/rejected": 0.4231005907058716, "step": 36 }, { "epoch": 0.023017107309486782, "grad_norm": 33.27362060546875, "learning_rate": 1.85e-06, "logits/chosen": -122.2191162109375, "logits/rejected": 38.30187225341797, "logps/chosen": -431.4573669433594, "logps/rejected": -568.153076171875, "loss": 0.4443, "rewards/accuracies": 0.75, "rewards/chosen": 2.218548059463501, "rewards/margins": 1.3690085411071777, "rewards/rejected": 0.849539577960968, "step": 37 }, { "epoch": 0.02363919129082426, "grad_norm": 63.82041549682617, "learning_rate": 1.9000000000000002e-06, "logits/chosen": -91.47080993652344, "logits/rejected": -87.55026245117188, "logps/chosen": -604.67822265625, "logps/rejected": -603.2916870117188, "loss": 1.0862, "rewards/accuracies": 0.625, "rewards/chosen": -1.2701797485351562, "rewards/margins": 0.6540302634239197, "rewards/rejected": -1.9242099523544312, "step": 38 }, { "epoch": 0.024261275272161743, "grad_norm": 28.931421279907227, "learning_rate": 1.9500000000000004e-06, "logits/chosen": -26.016319274902344, "logits/rejected": 80.02193450927734, "logps/chosen": -312.6510314941406, "logps/rejected": -370.9493713378906, "loss": 0.5454, "rewards/accuracies": 0.875, "rewards/chosen": 0.05695924162864685, "rewards/margins": 1.1090497970581055, "rewards/rejected": -1.0520905256271362, "step": 39 }, { "epoch": 0.024883359253499222, "grad_norm": 43.46025848388672, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -128.47161865234375, "logits/rejected": 36.54724884033203, "logps/chosen": -323.2990417480469, "logps/rejected": -517.6661376953125, "loss": 0.7731, "rewards/accuracies": 0.375, "rewards/chosen": 0.3716725707054138, "rewards/margins": 0.6612736582756042, "rewards/rejected": -0.2896011173725128, "step": 40 }, { "epoch": 0.0255054432348367, "grad_norm": 37.826210021972656, "learning_rate": 2.05e-06, "logits/chosen": -167.75421142578125, "logits/rejected": 71.50552368164062, "logps/chosen": -203.67337036132812, "logps/rejected": -419.05316162109375, "loss": 0.627, "rewards/accuracies": 0.625, "rewards/chosen": 0.8959906697273254, "rewards/margins": 0.7690800428390503, "rewards/rejected": 0.12691058218479156, "step": 41 }, { "epoch": 0.026127527216174184, "grad_norm": 45.87517166137695, "learning_rate": 2.1000000000000002e-06, "logits/chosen": -46.27946853637695, "logits/rejected": -76.20664978027344, "logps/chosen": -549.3441162109375, "logps/rejected": -469.05596923828125, "loss": 0.6102, "rewards/accuracies": 0.75, "rewards/chosen": 1.7958881855010986, "rewards/margins": 1.2705662250518799, "rewards/rejected": 0.5253219604492188, "step": 42 }, { "epoch": 0.026749611197511663, "grad_norm": 899.3736572265625, "learning_rate": 2.15e-06, "logits/chosen": -63.47185516357422, "logits/rejected": -16.3970947265625, "logps/chosen": -1258.77490234375, "logps/rejected": -792.7114868164062, "loss": 5.5398, "rewards/accuracies": 0.125, "rewards/chosen": -7.652675151824951, "rewards/margins": -5.116842269897461, "rewards/rejected": -2.5358333587646484, "step": 43 }, { "epoch": 0.027371695178849145, "grad_norm": 63.07781219482422, "learning_rate": 2.2e-06, "logits/chosen": -150.1295623779297, "logits/rejected": 66.1733169555664, "logps/chosen": -347.10931396484375, "logps/rejected": -1547.52099609375, "loss": 0.5592, "rewards/accuracies": 0.75, "rewards/chosen": 0.3661939203739166, "rewards/margins": 12.713611602783203, "rewards/rejected": -12.347416877746582, "step": 44 }, { "epoch": 0.027993779160186624, "grad_norm": 42.032386779785156, "learning_rate": 2.25e-06, "logits/chosen": -122.23858642578125, "logits/rejected": 12.047426223754883, "logps/chosen": -754.6688232421875, "logps/rejected": -1134.136962890625, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -5.956652641296387, "rewards/margins": 4.037847995758057, "rewards/rejected": -9.994501113891602, "step": 45 }, { "epoch": 0.028615863141524107, "grad_norm": 20.641681671142578, "learning_rate": 2.3000000000000004e-06, "logits/chosen": -49.58843994140625, "logits/rejected": 81.55226135253906, "logps/chosen": -248.12454223632812, "logps/rejected": -323.6038818359375, "loss": 0.3802, "rewards/accuracies": 0.875, "rewards/chosen": 0.7325179576873779, "rewards/margins": 1.378382921218872, "rewards/rejected": -0.6458648443222046, "step": 46 }, { "epoch": 0.029237947122861586, "grad_norm": 28.368398666381836, "learning_rate": 2.35e-06, "logits/chosen": -157.29957580566406, "logits/rejected": -1.7885704040527344, "logps/chosen": -314.315673828125, "logps/rejected": -465.5535583496094, "loss": 0.7913, "rewards/accuracies": 0.75, "rewards/chosen": 1.0418667793273926, "rewards/margins": 1.2694858312606812, "rewards/rejected": -0.2276192307472229, "step": 47 }, { "epoch": 0.029860031104199068, "grad_norm": 37.04475784301758, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -127.27993774414062, "logits/rejected": 103.44401550292969, "logps/chosen": -256.5416564941406, "logps/rejected": -388.2500915527344, "loss": 0.9014, "rewards/accuracies": 0.625, "rewards/chosen": 0.3541111350059509, "rewards/margins": 0.7713264226913452, "rewards/rejected": -0.41721534729003906, "step": 48 }, { "epoch": 0.030482115085536547, "grad_norm": 36.0450439453125, "learning_rate": 2.4500000000000003e-06, "logits/chosen": -173.40975952148438, "logits/rejected": -10.112944602966309, "logps/chosen": -164.98733520507812, "logps/rejected": -355.285400390625, "loss": 0.7949, "rewards/accuracies": 0.75, "rewards/chosen": 0.6446701288223267, "rewards/margins": 0.7658143639564514, "rewards/rejected": -0.12114429473876953, "step": 49 }, { "epoch": 0.03110419906687403, "grad_norm": 30.178518295288086, "learning_rate": 2.5e-06, "logits/chosen": -190.67752075195312, "logits/rejected": 30.741165161132812, "logps/chosen": -1336.4576416015625, "logps/rejected": -2448.550048828125, "loss": 0.6615, "rewards/accuracies": 0.625, "rewards/chosen": -3.1199212074279785, "rewards/margins": 5.6113362312316895, "rewards/rejected": -8.731256484985352, "step": 50 }, { "epoch": 0.031726283048211505, "grad_norm": 30.916231155395508, "learning_rate": 2.55e-06, "logits/chosen": -121.64002990722656, "logits/rejected": 42.10102081298828, "logps/chosen": -260.619873046875, "logps/rejected": -438.66802978515625, "loss": 0.542, "rewards/accuracies": 0.875, "rewards/chosen": 1.049727201461792, "rewards/margins": 1.4939913749694824, "rewards/rejected": -0.44426411390304565, "step": 51 }, { "epoch": 0.03234836702954899, "grad_norm": 1188.6829833984375, "learning_rate": 2.6e-06, "logits/chosen": -16.052227020263672, "logits/rejected": -38.67804718017578, "logps/chosen": -1465.586181640625, "logps/rejected": -778.7018432617188, "loss": 10.9155, "rewards/accuracies": 0.125, "rewards/chosen": -15.774911880493164, "rewards/margins": -9.96524429321289, "rewards/rejected": -5.809668064117432, "step": 52 }, { "epoch": 0.03297045101088647, "grad_norm": 43.6611442565918, "learning_rate": 2.6500000000000005e-06, "logits/chosen": 28.41799545288086, "logits/rejected": 62.08867645263672, "logps/chosen": -350.9700012207031, "logps/rejected": -360.8543701171875, "loss": 0.9881, "rewards/accuracies": 0.5, "rewards/chosen": -0.11333408951759338, "rewards/margins": 0.3725128471851349, "rewards/rejected": -0.4858469069004059, "step": 53 }, { "epoch": 0.03359253499222395, "grad_norm": 39.64183807373047, "learning_rate": 2.7000000000000004e-06, "logits/chosen": -190.10409545898438, "logits/rejected": 18.80118179321289, "logps/chosen": -574.2435302734375, "logps/rejected": -2505.159423828125, "loss": 0.972, "rewards/accuracies": 0.5, "rewards/chosen": -2.048830509185791, "rewards/margins": 18.88075065612793, "rewards/rejected": -20.929582595825195, "step": 54 }, { "epoch": 0.03421461897356143, "grad_norm": 30.02960205078125, "learning_rate": 2.7500000000000004e-06, "logits/chosen": -92.54093933105469, "logits/rejected": -56.713348388671875, "logps/chosen": -2031.30224609375, "logps/rejected": -2650.76171875, "loss": 0.3571, "rewards/accuracies": 0.875, "rewards/chosen": -8.098992347717285, "rewards/margins": 10.489635467529297, "rewards/rejected": -18.588626861572266, "step": 55 }, { "epoch": 0.034836702954898914, "grad_norm": 62.470542907714844, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -59.0765380859375, "logits/rejected": 94.99482727050781, "logps/chosen": -590.1900634765625, "logps/rejected": -709.4454345703125, "loss": 1.81, "rewards/accuracies": 0.5, "rewards/chosen": 11.080465316772461, "rewards/margins": -0.3011820316314697, "rewards/rejected": 11.381646156311035, "step": 56 }, { "epoch": 0.03545878693623639, "grad_norm": 64.42080688476562, "learning_rate": 2.85e-06, "logits/chosen": -62.355316162109375, "logits/rejected": 44.94255828857422, "logps/chosen": -379.73193359375, "logps/rejected": -517.2540283203125, "loss": 1.6482, "rewards/accuracies": 0.5, "rewards/chosen": 1.2409422397613525, "rewards/margins": 0.2005990743637085, "rewards/rejected": 1.0403432846069336, "step": 57 }, { "epoch": 0.03608087091757387, "grad_norm": 48.77953338623047, "learning_rate": 2.9e-06, "logits/chosen": -77.36001586914062, "logits/rejected": 67.45735931396484, "logps/chosen": -299.3946838378906, "logps/rejected": -1086.472412109375, "loss": 0.9033, "rewards/accuracies": 0.625, "rewards/chosen": -0.5822135806083679, "rewards/margins": 11.542000770568848, "rewards/rejected": -12.124213218688965, "step": 58 }, { "epoch": 0.03670295489891135, "grad_norm": 70.42303466796875, "learning_rate": 2.95e-06, "logits/chosen": -137.59715270996094, "logits/rejected": -64.14185333251953, "logps/chosen": -475.1390686035156, "logps/rejected": -535.9818115234375, "loss": 1.7311, "rewards/accuracies": 0.25, "rewards/chosen": 0.11842460930347443, "rewards/margins": -1.0808355808258057, "rewards/rejected": 1.1992602348327637, "step": 59 }, { "epoch": 0.03732503888024884, "grad_norm": 749.0438842773438, "learning_rate": 3e-06, "logits/chosen": -44.86164093017578, "logits/rejected": 74.69981384277344, "logps/chosen": -730.4176025390625, "logps/rejected": -1221.76171875, "loss": 1.0804, "rewards/accuracies": 0.75, "rewards/chosen": -1.3353334665298462, "rewards/margins": 0.6859627962112427, "rewards/rejected": -2.021296262741089, "step": 60 }, { "epoch": 0.037947122861586316, "grad_norm": 39.6771125793457, "learning_rate": 3.05e-06, "logits/chosen": -179.0650634765625, "logits/rejected": -3.5082168579101562, "logps/chosen": -234.839599609375, "logps/rejected": -471.9064636230469, "loss": 0.8542, "rewards/accuracies": 0.75, "rewards/chosen": 1.3170084953308105, "rewards/margins": 0.9839060306549072, "rewards/rejected": 0.33310243487358093, "step": 61 }, { "epoch": 0.038569206842923795, "grad_norm": 63.63577651977539, "learning_rate": 3.1000000000000004e-06, "logits/chosen": -83.8985824584961, "logits/rejected": -73.05562591552734, "logps/chosen": -441.20672607421875, "logps/rejected": -426.09649658203125, "loss": 1.5998, "rewards/accuracies": 0.5, "rewards/chosen": 1.0796689987182617, "rewards/margins": -0.9385915994644165, "rewards/rejected": 2.0182604789733887, "step": 62 }, { "epoch": 0.039191290824261274, "grad_norm": 122.24563598632812, "learning_rate": 3.1500000000000003e-06, "logits/chosen": -138.49444580078125, "logits/rejected": 55.99853515625, "logps/chosen": -451.4609375, "logps/rejected": -607.027099609375, "loss": 1.1702, "rewards/accuracies": 0.875, "rewards/chosen": 1.9362308979034424, "rewards/margins": 1.6428611278533936, "rewards/rejected": 0.29336971044540405, "step": 63 }, { "epoch": 0.03981337480559875, "grad_norm": 638.4260864257812, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -163.58985900878906, "logits/rejected": 17.45030975341797, "logps/chosen": -1477.1134033203125, "logps/rejected": -1320.5074462890625, "loss": 7.4452, "rewards/accuracies": 0.75, "rewards/chosen": -15.663278579711914, "rewards/margins": -4.383941650390625, "rewards/rejected": -11.279337882995605, "step": 64 }, { "epoch": 0.04043545878693624, "grad_norm": 44.93674087524414, "learning_rate": 3.2500000000000002e-06, "logits/chosen": -83.40707397460938, "logits/rejected": -9.785736083984375, "logps/chosen": -1263.858154296875, "logps/rejected": -1371.3720703125, "loss": 0.72, "rewards/accuracies": 0.375, "rewards/chosen": -12.643072128295898, "rewards/margins": 1.6235361099243164, "rewards/rejected": -14.266608238220215, "step": 65 }, { "epoch": 0.04105754276827372, "grad_norm": 29.014402389526367, "learning_rate": 3.3000000000000006e-06, "logits/chosen": -133.93663024902344, "logits/rejected": 81.33216094970703, "logps/chosen": -493.4540710449219, "logps/rejected": -1325.054443359375, "loss": 0.3811, "rewards/accuracies": 0.875, "rewards/chosen": -2.942257881164551, "rewards/margins": 16.843399047851562, "rewards/rejected": -19.78565788269043, "step": 66 }, { "epoch": 0.0416796267496112, "grad_norm": 558.5728759765625, "learning_rate": 3.3500000000000005e-06, "logits/chosen": -129.9456024169922, "logits/rejected": 18.055818557739258, "logps/chosen": -1181.14990234375, "logps/rejected": -1351.4603271484375, "loss": 0.3736, "rewards/accuracies": 0.875, "rewards/chosen": -1.0527009963989258, "rewards/margins": 7.643436431884766, "rewards/rejected": -8.696137428283691, "step": 67 }, { "epoch": 0.042301710730948676, "grad_norm": 30.672258377075195, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -64.8094711303711, "logits/rejected": 34.24016571044922, "logps/chosen": -301.7353210449219, "logps/rejected": -361.22149658203125, "loss": 0.5399, "rewards/accuracies": 0.75, "rewards/chosen": 0.5628699660301208, "rewards/margins": 1.7594395875930786, "rewards/rejected": -1.1965696811676025, "step": 68 }, { "epoch": 0.04292379471228616, "grad_norm": 636.44384765625, "learning_rate": 3.45e-06, "logits/chosen": -73.6044921875, "logits/rejected": -50.587223052978516, "logps/chosen": -513.036865234375, "logps/rejected": -874.9848022460938, "loss": 1.7913, "rewards/accuracies": 0.5, "rewards/chosen": -0.0651128888130188, "rewards/margins": -0.5921887755393982, "rewards/rejected": 0.5270757079124451, "step": 69 }, { "epoch": 0.04354587869362364, "grad_norm": 20.517812728881836, "learning_rate": 3.5e-06, "logits/chosen": -82.50524139404297, "logits/rejected": 50.30849838256836, "logps/chosen": -158.05738830566406, "logps/rejected": -229.08792114257812, "loss": 0.6332, "rewards/accuracies": 0.75, "rewards/chosen": 0.4551521837711334, "rewards/margins": 0.8408031463623047, "rewards/rejected": -0.38565099239349365, "step": 70 }, { "epoch": 0.04416796267496112, "grad_norm": 8.484793663024902, "learning_rate": 3.5500000000000003e-06, "logits/chosen": -132.15859985351562, "logits/rejected": 57.076377868652344, "logps/chosen": -227.03680419921875, "logps/rejected": -407.4863586425781, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 1.3141671419143677, "rewards/margins": 3.704324245452881, "rewards/rejected": -2.3901572227478027, "step": 71 }, { "epoch": 0.0447900466562986, "grad_norm": 13.282901763916016, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -34.069034576416016, "logits/rejected": 73.45081329345703, "logps/chosen": -519.9937744140625, "logps/rejected": -1308.231689453125, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": -1.322060465812683, "rewards/margins": 20.501802444458008, "rewards/rejected": -21.823863983154297, "step": 72 }, { "epoch": 0.04541213063763608, "grad_norm": 18.102375030517578, "learning_rate": 3.65e-06, "logits/chosen": -139.56646728515625, "logits/rejected": 37.196990966796875, "logps/chosen": -583.7896118164062, "logps/rejected": -1530.263427734375, "loss": 0.1821, "rewards/accuracies": 0.875, "rewards/chosen": 1.0941334962844849, "rewards/margins": 10.677034378051758, "rewards/rejected": -9.582901000976562, "step": 73 }, { "epoch": 0.046034214618973564, "grad_norm": 20.324291229248047, "learning_rate": 3.7e-06, "logits/chosen": -109.41703033447266, "logits/rejected": -48.66609191894531, "logps/chosen": -1321.24267578125, "logps/rejected": -653.778076171875, "loss": 0.2195, "rewards/accuracies": 0.875, "rewards/chosen": 3.1100475788116455, "rewards/margins": 5.045758247375488, "rewards/rejected": -1.9357101917266846, "step": 74 }, { "epoch": 0.04665629860031104, "grad_norm": 252.70919799804688, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 31.687286376953125, "logits/rejected": 4.830270767211914, "logps/chosen": -579.0111083984375, "logps/rejected": -509.2896423339844, "loss": 1.2229, "rewards/accuracies": 0.75, "rewards/chosen": -0.17332875728607178, "rewards/margins": 0.8899763822555542, "rewards/rejected": -1.063305139541626, "step": 75 }, { "epoch": 0.04727838258164852, "grad_norm": 13.11046314239502, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -130.0702667236328, "logits/rejected": 68.48661804199219, "logps/chosen": -380.2161560058594, "logps/rejected": -531.8563232421875, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 0.9826292395591736, "rewards/margins": 2.7443556785583496, "rewards/rejected": -1.7617263793945312, "step": 76 }, { "epoch": 0.047900466562986, "grad_norm": 10.009026527404785, "learning_rate": 3.85e-06, "logits/chosen": -176.3349609375, "logits/rejected": 24.265670776367188, "logps/chosen": -1046.247802734375, "logps/rejected": -2130.19140625, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": -8.486373901367188, "rewards/margins": 8.784834861755371, "rewards/rejected": -17.271209716796875, "step": 77 }, { "epoch": 0.04852255054432349, "grad_norm": 30.496267318725586, "learning_rate": 3.900000000000001e-06, "logits/chosen": -11.876935958862305, "logits/rejected": 98.8614730834961, "logps/chosen": -2258.26318359375, "logps/rejected": -3007.66455078125, "loss": 0.5565, "rewards/accuracies": 0.75, "rewards/chosen": -6.414497375488281, "rewards/margins": 9.659784317016602, "rewards/rejected": -16.074281692504883, "step": 78 }, { "epoch": 0.049144634525660966, "grad_norm": 31.76437759399414, "learning_rate": 3.95e-06, "logits/chosen": -70.17195129394531, "logits/rejected": 110.49501037597656, "logps/chosen": -231.7307586669922, "logps/rejected": -411.88763427734375, "loss": 0.5099, "rewards/accuracies": 0.875, "rewards/chosen": 0.24470536410808563, "rewards/margins": 1.2261040210723877, "rewards/rejected": -0.9813985824584961, "step": 79 }, { "epoch": 0.049766718506998445, "grad_norm": 35.35044860839844, "learning_rate": 4.000000000000001e-06, "logits/chosen": -89.04490661621094, "logits/rejected": 46.24753952026367, "logps/chosen": -455.7387390136719, "logps/rejected": -581.5548706054688, "loss": 0.3675, "rewards/accuracies": 0.875, "rewards/chosen": 2.755155563354492, "rewards/margins": 2.0568268299102783, "rewards/rejected": 0.6983289122581482, "step": 80 }, { "epoch": 0.050388802488335924, "grad_norm": 44.07930374145508, "learning_rate": 4.05e-06, "logits/chosen": -137.97994995117188, "logits/rejected": 53.932735443115234, "logps/chosen": -294.3134765625, "logps/rejected": -446.0760498046875, "loss": 0.3624, "rewards/accuracies": 0.875, "rewards/chosen": 1.1429147720336914, "rewards/margins": 2.3229827880859375, "rewards/rejected": -1.1800682544708252, "step": 81 }, { "epoch": 0.0510108864696734, "grad_norm": 26.889339447021484, "learning_rate": 4.1e-06, "logits/chosen": -75.96332550048828, "logits/rejected": 72.12476348876953, "logps/chosen": -483.62969970703125, "logps/rejected": -1503.28857421875, "loss": 0.2444, "rewards/accuracies": 0.875, "rewards/chosen": 0.06015336513519287, "rewards/margins": 5.909359931945801, "rewards/rejected": -5.849206924438477, "step": 82 }, { "epoch": 0.05163297045101089, "grad_norm": 31.431421279907227, "learning_rate": 4.15e-06, "logits/chosen": -19.129051208496094, "logits/rejected": 128.35189819335938, "logps/chosen": -422.5089111328125, "logps/rejected": -529.9282836914062, "loss": 0.3569, "rewards/accuracies": 0.75, "rewards/chosen": 1.3847264051437378, "rewards/margins": 1.6580657958984375, "rewards/rejected": -0.27333948016166687, "step": 83 }, { "epoch": 0.05225505443234837, "grad_norm": 27.511486053466797, "learning_rate": 4.2000000000000004e-06, "logits/chosen": -198.4405517578125, "logits/rejected": -0.6430850028991699, "logps/chosen": -323.93341064453125, "logps/rejected": -1238.6708984375, "loss": 0.4899, "rewards/accuracies": 0.625, "rewards/chosen": -1.0391197204589844, "rewards/margins": 15.887272834777832, "rewards/rejected": -16.926393508911133, "step": 84 }, { "epoch": 0.05287713841368585, "grad_norm": 540.4782104492188, "learning_rate": 4.25e-06, "logits/chosen": -157.5902557373047, "logits/rejected": -21.795957565307617, "logps/chosen": -792.9541015625, "logps/rejected": -1439.711669921875, "loss": 2.019, "rewards/accuracies": 0.875, "rewards/chosen": -7.619131565093994, "rewards/margins": 0.14130693674087524, "rewards/rejected": -7.760438919067383, "step": 85 }, { "epoch": 0.053499222395023326, "grad_norm": 13.117233276367188, "learning_rate": 4.3e-06, "logits/chosen": -8.542896270751953, "logits/rejected": 118.21063232421875, "logps/chosen": -372.491455078125, "logps/rejected": -469.3487243652344, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": 2.6040077209472656, "rewards/margins": 3.2366881370544434, "rewards/rejected": -0.6326805353164673, "step": 86 }, { "epoch": 0.05412130637636081, "grad_norm": 36.41606521606445, "learning_rate": 4.350000000000001e-06, "logits/chosen": -87.39544677734375, "logits/rejected": 80.2397232055664, "logps/chosen": -210.7410125732422, "logps/rejected": -334.9414367675781, "loss": 0.4893, "rewards/accuracies": 0.75, "rewards/chosen": 1.0292229652404785, "rewards/margins": 2.4309725761413574, "rewards/rejected": -1.401749610900879, "step": 87 }, { "epoch": 0.05474339035769829, "grad_norm": 36.3905143737793, "learning_rate": 4.4e-06, "logits/chosen": -90.26077270507812, "logits/rejected": -3.273578643798828, "logps/chosen": -405.2956848144531, "logps/rejected": -465.50115966796875, "loss": 0.5513, "rewards/accuracies": 0.625, "rewards/chosen": 0.6066957712173462, "rewards/margins": 2.2321441173553467, "rewards/rejected": -1.62544846534729, "step": 88 }, { "epoch": 0.05536547433903577, "grad_norm": 31.837020874023438, "learning_rate": 4.450000000000001e-06, "logits/chosen": -137.7227783203125, "logits/rejected": 40.328426361083984, "logps/chosen": -493.3175048828125, "logps/rejected": -984.7322387695312, "loss": 0.4894, "rewards/accuracies": 0.875, "rewards/chosen": -1.0812036991119385, "rewards/margins": 2.8331120014190674, "rewards/rejected": -3.9143154621124268, "step": 89 }, { "epoch": 0.05598755832037325, "grad_norm": 41.56230163574219, "learning_rate": 4.5e-06, "logits/chosen": -64.95536804199219, "logits/rejected": 92.50924682617188, "logps/chosen": -349.561279296875, "logps/rejected": -502.58575439453125, "loss": 0.4685, "rewards/accuracies": 0.875, "rewards/chosen": 0.26271653175354004, "rewards/margins": 1.5777441263198853, "rewards/rejected": -1.3150277137756348, "step": 90 }, { "epoch": 0.05660964230171073, "grad_norm": 48.96053695678711, "learning_rate": 4.5500000000000005e-06, "logits/chosen": -93.32193756103516, "logits/rejected": -22.064577102661133, "logps/chosen": -329.4854736328125, "logps/rejected": -380.0439147949219, "loss": 0.5001, "rewards/accuracies": 0.75, "rewards/chosen": 0.8870952725410461, "rewards/margins": 1.7570526599884033, "rewards/rejected": -0.869957447052002, "step": 91 }, { "epoch": 0.05723172628304821, "grad_norm": 656.781494140625, "learning_rate": 4.600000000000001e-06, "logits/chosen": -145.7423095703125, "logits/rejected": 41.493865966796875, "logps/chosen": -1423.809326171875, "logps/rejected": -2698.568359375, "loss": 0.7183, "rewards/accuracies": 0.75, "rewards/chosen": -0.01483917236328125, "rewards/margins": 4.9940667152404785, "rewards/rejected": -5.00890588760376, "step": 92 }, { "epoch": 0.05785381026438569, "grad_norm": 195.16009521484375, "learning_rate": 4.65e-06, "logits/chosen": -38.475616455078125, "logits/rejected": 14.098729133605957, "logps/chosen": -1465.155029296875, "logps/rejected": -1159.148681640625, "loss": 1.491, "rewards/accuracies": 0.75, "rewards/chosen": -0.05608868598937988, "rewards/margins": 3.2402305603027344, "rewards/rejected": -3.296319007873535, "step": 93 }, { "epoch": 0.05847589424572317, "grad_norm": 45.73384475708008, "learning_rate": 4.7e-06, "logits/chosen": -126.1684341430664, "logits/rejected": 35.370182037353516, "logps/chosen": -398.15966796875, "logps/rejected": -568.623291015625, "loss": 0.7825, "rewards/accuracies": 0.875, "rewards/chosen": 0.49736112356185913, "rewards/margins": 2.747555732727051, "rewards/rejected": -2.250194549560547, "step": 94 }, { "epoch": 0.05909797822706065, "grad_norm": 8.26572322845459, "learning_rate": 4.75e-06, "logits/chosen": -203.39146423339844, "logits/rejected": 15.635889053344727, "logps/chosen": -344.43292236328125, "logps/rejected": -629.9688720703125, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 0.5564532279968262, "rewards/margins": 3.88051438331604, "rewards/rejected": -3.324061393737793, "step": 95 }, { "epoch": 0.059720062208398136, "grad_norm": 15.101452827453613, "learning_rate": 4.800000000000001e-06, "logits/chosen": -49.0048828125, "logits/rejected": 65.80522155761719, "logps/chosen": -494.21405029296875, "logps/rejected": -569.2127685546875, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 0.0874580442905426, "rewards/margins": 4.408256530761719, "rewards/rejected": -4.320797920227051, "step": 96 }, { "epoch": 0.060342146189735615, "grad_norm": 9.323420524597168, "learning_rate": 4.85e-06, "logits/chosen": -43.885765075683594, "logits/rejected": 54.64805221557617, "logps/chosen": -338.53265380859375, "logps/rejected": -482.4219970703125, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 0.26558881998062134, "rewards/margins": 2.888289213180542, "rewards/rejected": -2.6227002143859863, "step": 97 }, { "epoch": 0.060964230171073094, "grad_norm": 8.815876007080078, "learning_rate": 4.9000000000000005e-06, "logits/chosen": -76.39271545410156, "logits/rejected": -0.7074289321899414, "logps/chosen": -247.3833770751953, "logps/rejected": -373.7536926269531, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": 0.3204587697982788, "rewards/margins": 2.7152743339538574, "rewards/rejected": -2.394815444946289, "step": 98 }, { "epoch": 0.06158631415241057, "grad_norm": 40.7353401184082, "learning_rate": 4.95e-06, "logits/chosen": -16.285900115966797, "logits/rejected": -7.603992462158203, "logps/chosen": -538.6366577148438, "logps/rejected": -453.6404724121094, "loss": 1.0995, "rewards/accuracies": 0.75, "rewards/chosen": -0.8419248461723328, "rewards/margins": 1.5059051513671875, "rewards/rejected": -2.347830057144165, "step": 99 }, { "epoch": 0.06220839813374806, "grad_norm": 26.501802444458008, "learning_rate": 5e-06, "logits/chosen": -56.957916259765625, "logits/rejected": 35.83625793457031, "logps/chosen": -506.07818603515625, "logps/rejected": -583.6228637695312, "loss": 0.3257, "rewards/accuracies": 0.875, "rewards/chosen": 0.07243508100509644, "rewards/margins": 1.629852533340454, "rewards/rejected": -1.557417631149292, "step": 100 }, { "epoch": 0.06283048211508553, "grad_norm": 10.708247184753418, "learning_rate": 4.994444444444445e-06, "logits/chosen": -136.39981079101562, "logits/rejected": -51.86599349975586, "logps/chosen": -1346.95263671875, "logps/rejected": -1899.767822265625, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": -5.631861209869385, "rewards/margins": 4.889688968658447, "rewards/rejected": -10.521549224853516, "step": 101 }, { "epoch": 0.06345256609642301, "grad_norm": 15.042457580566406, "learning_rate": 4.988888888888889e-06, "logits/chosen": -231.4070281982422, "logits/rejected": 69.22805786132812, "logps/chosen": -485.0896301269531, "logps/rejected": -2191.261474609375, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": -0.5175511837005615, "rewards/margins": 7.381418704986572, "rewards/rejected": -7.898970603942871, "step": 102 }, { "epoch": 0.0640746500777605, "grad_norm": 27.382177352905273, "learning_rate": 4.983333333333334e-06, "logits/chosen": -111.2535400390625, "logits/rejected": 37.88441848754883, "logps/chosen": -509.2220764160156, "logps/rejected": -849.4989013671875, "loss": 0.2718, "rewards/accuracies": 0.875, "rewards/chosen": -1.3222905397415161, "rewards/margins": 5.8231658935546875, "rewards/rejected": -7.145456790924072, "step": 103 }, { "epoch": 0.06469673405909798, "grad_norm": 18.49082374572754, "learning_rate": 4.977777777777778e-06, "logits/chosen": -32.00193405151367, "logits/rejected": 81.44490051269531, "logps/chosen": -659.6163330078125, "logps/rejected": -1448.41552734375, "loss": 0.2106, "rewards/accuracies": 0.875, "rewards/chosen": -5.421839237213135, "rewards/margins": 13.191776275634766, "rewards/rejected": -18.613616943359375, "step": 104 }, { "epoch": 0.06531881804043546, "grad_norm": 22.473134994506836, "learning_rate": 4.9722222222222224e-06, "logits/chosen": -47.05355453491211, "logits/rejected": -50.598838806152344, "logps/chosen": -406.9202880859375, "logps/rejected": -456.0419921875, "loss": 0.3507, "rewards/accuracies": 0.75, "rewards/chosen": -0.9130598902702332, "rewards/margins": 1.4639744758605957, "rewards/rejected": -2.3770344257354736, "step": 105 }, { "epoch": 0.06594090202177294, "grad_norm": 8.185256958007812, "learning_rate": 4.966666666666667e-06, "logits/chosen": -96.94132232666016, "logits/rejected": 16.857337951660156, "logps/chosen": -178.7850341796875, "logps/rejected": -339.53802490234375, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 0.26321107149124146, "rewards/margins": 3.3856606483459473, "rewards/rejected": -3.1224493980407715, "step": 106 }, { "epoch": 0.06656298600311042, "grad_norm": 17.88851547241211, "learning_rate": 4.961111111111111e-06, "logits/chosen": -115.84870147705078, "logits/rejected": -16.648435592651367, "logps/chosen": -563.9876708984375, "logps/rejected": -615.9740600585938, "loss": 0.1409, "rewards/accuracies": 0.875, "rewards/chosen": 1.3874342441558838, "rewards/margins": 3.9797005653381348, "rewards/rejected": -2.592266082763672, "step": 107 }, { "epoch": 0.0671850699844479, "grad_norm": 36.711334228515625, "learning_rate": 4.9555555555555565e-06, "logits/chosen": -49.58351135253906, "logits/rejected": 18.118301391601562, "logps/chosen": -958.4739379882812, "logps/rejected": -1472.90771484375, "loss": 0.5068, "rewards/accuracies": 0.875, "rewards/chosen": -7.285399913787842, "rewards/margins": 16.346147537231445, "rewards/rejected": -23.631547927856445, "step": 108 }, { "epoch": 0.06780715396578538, "grad_norm": 55.09579849243164, "learning_rate": 4.95e-06, "logits/chosen": -6.980716705322266, "logits/rejected": -0.8190517425537109, "logps/chosen": -556.7156982421875, "logps/rejected": -573.9609375, "loss": 1.2101, "rewards/accuracies": 0.625, "rewards/chosen": -1.9468461275100708, "rewards/margins": 0.9715324640274048, "rewards/rejected": -2.9183783531188965, "step": 109 }, { "epoch": 0.06842923794712286, "grad_norm": 11.719588279724121, "learning_rate": 4.944444444444445e-06, "logits/chosen": -11.281723022460938, "logits/rejected": 104.90423583984375, "logps/chosen": -367.8063049316406, "logps/rejected": -508.12078857421875, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": 0.7925231456756592, "rewards/margins": 4.796985149383545, "rewards/rejected": -4.004461765289307, "step": 110 }, { "epoch": 0.06905132192846034, "grad_norm": 9.733543395996094, "learning_rate": 4.938888888888889e-06, "logits/chosen": -111.30741882324219, "logits/rejected": 94.40300750732422, "logps/chosen": -410.62982177734375, "logps/rejected": -594.442138671875, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": -0.21276235580444336, "rewards/margins": 6.492918491363525, "rewards/rejected": -6.705680847167969, "step": 111 }, { "epoch": 0.06967340590979783, "grad_norm": 19.729164123535156, "learning_rate": 4.933333333333334e-06, "logits/chosen": -54.90504837036133, "logits/rejected": 58.449554443359375, "logps/chosen": -345.8414611816406, "logps/rejected": -501.15899658203125, "loss": 0.2946, "rewards/accuracies": 0.875, "rewards/chosen": -0.4412403106689453, "rewards/margins": 3.2088890075683594, "rewards/rejected": -3.6501290798187256, "step": 112 }, { "epoch": 0.07029548989113531, "grad_norm": 21.787220001220703, "learning_rate": 4.927777777777778e-06, "logits/chosen": -208.8677215576172, "logits/rejected": -7.547284126281738, "logps/chosen": -263.8484191894531, "logps/rejected": -508.4224548339844, "loss": 0.3227, "rewards/accuracies": 0.875, "rewards/chosen": -0.3869560658931732, "rewards/margins": 3.9047999382019043, "rewards/rejected": -4.291755676269531, "step": 113 }, { "epoch": 0.07091757387247279, "grad_norm": 13.62004280090332, "learning_rate": 4.922222222222223e-06, "logits/chosen": -92.05099487304688, "logits/rejected": -19.534648895263672, "logps/chosen": -208.94772338867188, "logps/rejected": -277.3564453125, "loss": 0.3163, "rewards/accuracies": 0.875, "rewards/chosen": -0.3742530941963196, "rewards/margins": 2.4637484550476074, "rewards/rejected": -2.8380017280578613, "step": 114 }, { "epoch": 0.07153965785381027, "grad_norm": 3.646902322769165, "learning_rate": 4.9166666666666665e-06, "logits/chosen": -138.9901580810547, "logits/rejected": 15.188562393188477, "logps/chosen": -500.8119812011719, "logps/rejected": -626.8800659179688, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 1.5790271759033203, "rewards/margins": 4.337211608886719, "rewards/rejected": -2.7581844329833984, "step": 115 }, { "epoch": 0.07216174183514774, "grad_norm": 31.273012161254883, "learning_rate": 4.911111111111112e-06, "logits/chosen": -88.38819122314453, "logits/rejected": 37.753326416015625, "logps/chosen": -242.85638427734375, "logps/rejected": -390.9364929199219, "loss": 0.2397, "rewards/accuracies": 0.875, "rewards/chosen": 0.5226201415061951, "rewards/margins": 4.194112300872803, "rewards/rejected": -3.671492338180542, "step": 116 }, { "epoch": 0.07278382581648522, "grad_norm": 13.638354301452637, "learning_rate": 4.905555555555556e-06, "logits/chosen": -127.58756256103516, "logits/rejected": -20.61954689025879, "logps/chosen": -888.8870239257812, "logps/rejected": -1078.2861328125, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": -8.998729705810547, "rewards/margins": 5.405122756958008, "rewards/rejected": -14.403853416442871, "step": 117 }, { "epoch": 0.0734059097978227, "grad_norm": 21.47307586669922, "learning_rate": 4.9000000000000005e-06, "logits/chosen": -207.70059204101562, "logits/rejected": -36.95095443725586, "logps/chosen": -673.3997802734375, "logps/rejected": -1544.263916015625, "loss": 0.5779, "rewards/accuracies": 0.875, "rewards/chosen": -3.125640392303467, "rewards/margins": 18.952285766601562, "rewards/rejected": -22.07792854309082, "step": 118 }, { "epoch": 0.07402799377916018, "grad_norm": 20.151708602905273, "learning_rate": 4.894444444444445e-06, "logits/chosen": -133.85650634765625, "logits/rejected": 100.17530822753906, "logps/chosen": -208.63571166992188, "logps/rejected": -488.3178405761719, "loss": 0.218, "rewards/accuracies": 0.875, "rewards/chosen": 0.161653071641922, "rewards/margins": 3.8544931411743164, "rewards/rejected": -3.692840099334717, "step": 119 }, { "epoch": 0.07465007776049767, "grad_norm": 2.343539237976074, "learning_rate": 4.888888888888889e-06, "logits/chosen": -194.25608825683594, "logits/rejected": 35.305824279785156, "logps/chosen": -218.46517944335938, "logps/rejected": -541.6307373046875, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -1.0702570676803589, "rewards/margins": 5.531541347503662, "rewards/rejected": -6.601799011230469, "step": 120 }, { "epoch": 0.07527216174183515, "grad_norm": 32.92597961425781, "learning_rate": 4.883333333333334e-06, "logits/chosen": -45.28303909301758, "logits/rejected": -19.86842918395996, "logps/chosen": -474.09234619140625, "logps/rejected": -529.525390625, "loss": 0.3638, "rewards/accuracies": 0.75, "rewards/chosen": -1.5224051475524902, "rewards/margins": 2.7598304748535156, "rewards/rejected": -4.282235145568848, "step": 121 }, { "epoch": 0.07589424572317263, "grad_norm": 14.678348541259766, "learning_rate": 4.877777777777778e-06, "logits/chosen": -90.25349426269531, "logits/rejected": -41.50727081298828, "logps/chosen": -574.9208984375, "logps/rejected": -1204.8919677734375, "loss": 0.3119, "rewards/accuracies": 0.75, "rewards/chosen": -7.2306365966796875, "rewards/margins": 10.171648025512695, "rewards/rejected": -17.402284622192383, "step": 122 }, { "epoch": 0.07651632970451011, "grad_norm": 2.9137473106384277, "learning_rate": 4.8722222222222225e-06, "logits/chosen": -46.517059326171875, "logits/rejected": 39.597869873046875, "logps/chosen": -519.730224609375, "logps/rejected": -541.9354248046875, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 0.06262329965829849, "rewards/margins": 4.325374126434326, "rewards/rejected": -4.262750625610352, "step": 123 }, { "epoch": 0.07713841368584759, "grad_norm": 2.1369285583496094, "learning_rate": 4.866666666666667e-06, "logits/chosen": -129.95697021484375, "logits/rejected": 17.274600982666016, "logps/chosen": -150.88449096679688, "logps/rejected": -382.77569580078125, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 0.20197254419326782, "rewards/margins": 5.916993141174316, "rewards/rejected": -5.715021133422852, "step": 124 }, { "epoch": 0.07776049766718507, "grad_norm": 5.022590637207031, "learning_rate": 4.861111111111111e-06, "logits/chosen": -89.13758087158203, "logits/rejected": -20.545379638671875, "logps/chosen": -278.720947265625, "logps/rejected": -357.8970642089844, "loss": 0.1334, "rewards/accuracies": 0.875, "rewards/chosen": -0.7414289712905884, "rewards/margins": 4.924288749694824, "rewards/rejected": -5.665718078613281, "step": 125 }, { "epoch": 0.07838258164852255, "grad_norm": 25.788761138916016, "learning_rate": 4.855555555555556e-06, "logits/chosen": -148.35345458984375, "logits/rejected": -10.06379508972168, "logps/chosen": -546.356689453125, "logps/rejected": -1311.2293701171875, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": -7.099493980407715, "rewards/margins": 15.407082557678223, "rewards/rejected": -22.506576538085938, "step": 126 }, { "epoch": 0.07900466562986003, "grad_norm": 816.911376953125, "learning_rate": 4.85e-06, "logits/chosen": 38.08330535888672, "logits/rejected": 19.351613998413086, "logps/chosen": -1262.032470703125, "logps/rejected": -866.334716796875, "loss": 8.2373, "rewards/accuracies": 0.75, "rewards/chosen": -17.87533950805664, "rewards/margins": -3.016317367553711, "rewards/rejected": -14.859024047851562, "step": 127 }, { "epoch": 0.0796267496111975, "grad_norm": 29.095083236694336, "learning_rate": 4.8444444444444446e-06, "logits/chosen": -96.02214050292969, "logits/rejected": 72.57325744628906, "logps/chosen": -1057.8896484375, "logps/rejected": -1604.802734375, "loss": 0.3263, "rewards/accuracies": 0.875, "rewards/chosen": -10.290213584899902, "rewards/margins": 7.77634334564209, "rewards/rejected": -18.066556930541992, "step": 128 }, { "epoch": 0.080248833592535, "grad_norm": 33.06646728515625, "learning_rate": 4.838888888888889e-06, "logits/chosen": -43.56713104248047, "logits/rejected": -118.48346710205078, "logps/chosen": -596.7957763671875, "logps/rejected": -602.6388549804688, "loss": 0.4008, "rewards/accuracies": 0.875, "rewards/chosen": -3.077329158782959, "rewards/margins": 3.8963608741760254, "rewards/rejected": -6.973690032958984, "step": 129 }, { "epoch": 0.08087091757387248, "grad_norm": 16.123010635375977, "learning_rate": 4.833333333333333e-06, "logits/chosen": -211.56814575195312, "logits/rejected": 148.60684204101562, "logps/chosen": -188.67355346679688, "logps/rejected": -574.7741088867188, "loss": 0.1261, "rewards/accuracies": 0.875, "rewards/chosen": -0.25640222430229187, "rewards/margins": 7.156912803649902, "rewards/rejected": -7.4133148193359375, "step": 130 }, { "epoch": 0.08149300155520996, "grad_norm": 368.5513610839844, "learning_rate": 4.827777777777778e-06, "logits/chosen": -45.56849670410156, "logits/rejected": -24.962718963623047, "logps/chosen": -1612.7254638671875, "logps/rejected": -1780.591064453125, "loss": 3.3497, "rewards/accuracies": 0.75, "rewards/chosen": -12.489030838012695, "rewards/margins": -0.1436161994934082, "rewards/rejected": -12.345414161682129, "step": 131 }, { "epoch": 0.08211508553654744, "grad_norm": 8.705522537231445, "learning_rate": 4.822222222222222e-06, "logits/chosen": -140.19015502929688, "logits/rejected": 44.41850280761719, "logps/chosen": -1189.0040283203125, "logps/rejected": -1719.5478515625, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": -11.379548072814941, "rewards/margins": 16.330036163330078, "rewards/rejected": -27.709583282470703, "step": 132 }, { "epoch": 0.08273716951788491, "grad_norm": 0.7435898184776306, "learning_rate": 4.816666666666667e-06, "logits/chosen": -117.56230163574219, "logits/rejected": 25.640478134155273, "logps/chosen": -227.17523193359375, "logps/rejected": -806.6884765625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.2815589904785156, "rewards/margins": 6.717327117919922, "rewards/rejected": -6.9988861083984375, "step": 133 }, { "epoch": 0.0833592534992224, "grad_norm": 0.41516929864883423, "learning_rate": 4.811111111111111e-06, "logits/chosen": -35.912174224853516, "logits/rejected": 34.06318664550781, "logps/chosen": -503.85064697265625, "logps/rejected": -717.1590576171875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.0734636783599854, "rewards/margins": 8.313762664794922, "rewards/rejected": -10.387226104736328, "step": 134 }, { "epoch": 0.08398133748055987, "grad_norm": 894.6290893554688, "learning_rate": 4.805555555555556e-06, "logits/chosen": -27.244121551513672, "logits/rejected": 54.6364860534668, "logps/chosen": -1413.940673828125, "logps/rejected": -1126.459228515625, "loss": 3.7581, "rewards/accuracies": 0.875, "rewards/chosen": -12.825390815734863, "rewards/margins": 2.7760562896728516, "rewards/rejected": -15.601447105407715, "step": 135 }, { "epoch": 0.08460342146189735, "grad_norm": 33.976715087890625, "learning_rate": 4.800000000000001e-06, "logits/chosen": -80.97942352294922, "logits/rejected": 42.69459915161133, "logps/chosen": -377.5361633300781, "logps/rejected": -507.7229919433594, "loss": 0.4478, "rewards/accuracies": 0.625, "rewards/chosen": -2.260896682739258, "rewards/margins": 4.060694694519043, "rewards/rejected": -6.321591377258301, "step": 136 }, { "epoch": 0.08522550544323483, "grad_norm": 43.320640563964844, "learning_rate": 4.794444444444445e-06, "logits/chosen": -13.470466613769531, "logits/rejected": 29.156890869140625, "logps/chosen": -843.1610107421875, "logps/rejected": -924.053955078125, "loss": 0.2034, "rewards/accuracies": 0.875, "rewards/chosen": -2.105743408203125, "rewards/margins": 5.618618965148926, "rewards/rejected": -7.724361896514893, "step": 137 }, { "epoch": 0.08584758942457232, "grad_norm": 31.211551666259766, "learning_rate": 4.7888888888888894e-06, "logits/chosen": -49.941463470458984, "logits/rejected": 18.080299377441406, "logps/chosen": -641.8109741210938, "logps/rejected": -812.7971801757812, "loss": 0.1675, "rewards/accuracies": 0.875, "rewards/chosen": 0.8376624584197998, "rewards/margins": 5.72138786315918, "rewards/rejected": -4.883725166320801, "step": 138 }, { "epoch": 0.0864696734059098, "grad_norm": 5.8645219802856445, "learning_rate": 4.783333333333334e-06, "logits/chosen": -138.4449462890625, "logits/rejected": 53.766632080078125, "logps/chosen": -330.08343505859375, "logps/rejected": -581.9744262695312, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -2.2344627380371094, "rewards/margins": 5.404593467712402, "rewards/rejected": -7.6390557289123535, "step": 139 }, { "epoch": 0.08709175738724728, "grad_norm": 8.757513999938965, "learning_rate": 4.777777777777778e-06, "logits/chosen": -70.00257110595703, "logits/rejected": 66.4307632446289, "logps/chosen": -593.6077880859375, "logps/rejected": -811.7509765625, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": -1.714566707611084, "rewards/margins": 5.989643096923828, "rewards/rejected": -7.704209327697754, "step": 140 }, { "epoch": 0.08771384136858476, "grad_norm": 15.735207557678223, "learning_rate": 4.772222222222223e-06, "logits/chosen": -84.39601135253906, "logits/rejected": 45.130043029785156, "logps/chosen": -308.6071472167969, "logps/rejected": -473.0471496582031, "loss": 0.1821, "rewards/accuracies": 0.875, "rewards/chosen": -2.3791651725769043, "rewards/margins": 5.166663646697998, "rewards/rejected": -7.545828819274902, "step": 141 }, { "epoch": 0.08833592534992224, "grad_norm": 5.770833492279053, "learning_rate": 4.766666666666667e-06, "logits/chosen": -74.12232971191406, "logits/rejected": 94.86646270751953, "logps/chosen": -304.89056396484375, "logps/rejected": -475.5865478515625, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -2.42392635345459, "rewards/margins": 5.1642746925354, "rewards/rejected": -7.588200569152832, "step": 142 }, { "epoch": 0.08895800933125972, "grad_norm": 0.4466715157032013, "learning_rate": 4.7611111111111115e-06, "logits/chosen": -25.229652404785156, "logits/rejected": 9.860782623291016, "logps/chosen": -507.16070556640625, "logps/rejected": -485.4691162109375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.13089904189109802, "rewards/margins": 6.393378257751465, "rewards/rejected": -6.262479782104492, "step": 143 }, { "epoch": 0.0895800933125972, "grad_norm": 38.91770553588867, "learning_rate": 4.755555555555556e-06, "logits/chosen": -65.49249267578125, "logits/rejected": 26.676326751708984, "logps/chosen": -385.1539001464844, "logps/rejected": -485.9601135253906, "loss": 0.4327, "rewards/accuracies": 0.75, "rewards/chosen": -2.0181236267089844, "rewards/margins": 5.359482765197754, "rewards/rejected": -7.377606391906738, "step": 144 }, { "epoch": 0.09020217729393468, "grad_norm": 10.08520793914795, "learning_rate": 4.75e-06, "logits/chosen": -102.21664428710938, "logits/rejected": 79.02078247070312, "logps/chosen": -149.50067138671875, "logps/rejected": -363.1020202636719, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": -0.9053165912628174, "rewards/margins": 4.60699462890625, "rewards/rejected": -5.5123114585876465, "step": 145 }, { "epoch": 0.09082426127527216, "grad_norm": 6.681402206420898, "learning_rate": 4.744444444444445e-06, "logits/chosen": -77.08467864990234, "logits/rejected": 31.71272850036621, "logps/chosen": -359.138916015625, "logps/rejected": -469.43060302734375, "loss": 0.106, "rewards/accuracies": 0.875, "rewards/chosen": -0.8409112095832825, "rewards/margins": 5.721317291259766, "rewards/rejected": -6.562228679656982, "step": 146 }, { "epoch": 0.09144634525660965, "grad_norm": 10.484644889831543, "learning_rate": 4.73888888888889e-06, "logits/chosen": -134.62881469726562, "logits/rejected": 5.831884384155273, "logps/chosen": -376.69793701171875, "logps/rejected": -806.8646240234375, "loss": 0.1368, "rewards/accuracies": 0.875, "rewards/chosen": -2.948415994644165, "rewards/margins": 5.414452075958252, "rewards/rejected": -8.36286735534668, "step": 147 }, { "epoch": 0.09206842923794713, "grad_norm": 759.1050415039062, "learning_rate": 4.7333333333333335e-06, "logits/chosen": -150.2550811767578, "logits/rejected": 59.547454833984375, "logps/chosen": -572.9852294921875, "logps/rejected": -1475.30712890625, "loss": 0.1846, "rewards/accuracies": 0.875, "rewards/chosen": -2.0559818744659424, "rewards/margins": 3.673940658569336, "rewards/rejected": -5.729922771453857, "step": 148 }, { "epoch": 0.0926905132192846, "grad_norm": 3.8935587406158447, "learning_rate": 4.727777777777779e-06, "logits/chosen": -128.90281677246094, "logits/rejected": 1.236379623413086, "logps/chosen": -554.9027709960938, "logps/rejected": -618.86865234375, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -0.8603519797325134, "rewards/margins": 6.9805731773376465, "rewards/rejected": -7.8409247398376465, "step": 149 }, { "epoch": 0.09331259720062209, "grad_norm": 46.44478225708008, "learning_rate": 4.722222222222222e-06, "logits/chosen": -51.02794647216797, "logits/rejected": -42.85813522338867, "logps/chosen": -475.0333557128906, "logps/rejected": -554.1500244140625, "loss": 0.3531, "rewards/accuracies": 0.75, "rewards/chosen": -1.925708532333374, "rewards/margins": 3.6755919456481934, "rewards/rejected": -5.601300239562988, "step": 150 }, { "epoch": 0.09393468118195956, "grad_norm": 5.293424606323242, "learning_rate": 4.7166666666666675e-06, "logits/chosen": -29.86174774169922, "logits/rejected": -124.77369689941406, "logps/chosen": -569.3523559570312, "logps/rejected": -667.6962890625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -1.4492998123168945, "rewards/margins": 6.492640495300293, "rewards/rejected": -7.941939830780029, "step": 151 }, { "epoch": 0.09455676516329704, "grad_norm": 524.1181030273438, "learning_rate": 4.711111111111111e-06, "logits/chosen": -118.01875305175781, "logits/rejected": 65.27468872070312, "logps/chosen": -801.3173217773438, "logps/rejected": -1773.2293701171875, "loss": 1.874, "rewards/accuracies": 0.875, "rewards/chosen": -9.552244186401367, "rewards/margins": 6.156587600708008, "rewards/rejected": -15.708831787109375, "step": 152 }, { "epoch": 0.09517884914463452, "grad_norm": 42.236045837402344, "learning_rate": 4.705555555555556e-06, "logits/chosen": -99.01790618896484, "logits/rejected": 64.84452819824219, "logps/chosen": -434.4639587402344, "logps/rejected": -579.9295654296875, "loss": 0.3077, "rewards/accuracies": 0.875, "rewards/chosen": -3.439419984817505, "rewards/margins": 5.240507125854492, "rewards/rejected": -8.679927825927734, "step": 153 }, { "epoch": 0.095800933125972, "grad_norm": 225.9073944091797, "learning_rate": 4.7e-06, "logits/chosen": -67.7724838256836, "logits/rejected": 71.39949035644531, "logps/chosen": -835.3055419921875, "logps/rejected": -936.8106079101562, "loss": 0.8127, "rewards/accuracies": 0.875, "rewards/chosen": -5.6774749755859375, "rewards/margins": 6.11433219909668, "rewards/rejected": -11.791807174682617, "step": 154 }, { "epoch": 0.09642301710730948, "grad_norm": 32.26408767700195, "learning_rate": 4.694444444444445e-06, "logits/chosen": -153.36558532714844, "logits/rejected": 34.609283447265625, "logps/chosen": -243.09588623046875, "logps/rejected": -463.4990234375, "loss": 0.4722, "rewards/accuracies": 0.875, "rewards/chosen": -2.092219352722168, "rewards/margins": 5.078300476074219, "rewards/rejected": -7.170520305633545, "step": 155 }, { "epoch": 0.09704510108864697, "grad_norm": 49.06928253173828, "learning_rate": 4.6888888888888895e-06, "logits/chosen": -42.6507568359375, "logits/rejected": 101.07369995117188, "logps/chosen": -581.34130859375, "logps/rejected": -727.2525634765625, "loss": 0.4657, "rewards/accuracies": 0.875, "rewards/chosen": -2.9420390129089355, "rewards/margins": 4.026432514190674, "rewards/rejected": -6.968471527099609, "step": 156 }, { "epoch": 0.09766718506998445, "grad_norm": 25.430435180664062, "learning_rate": 4.683333333333334e-06, "logits/chosen": -86.69388580322266, "logits/rejected": -37.751461029052734, "logps/chosen": -286.734375, "logps/rejected": -481.1144714355469, "loss": 0.1582, "rewards/accuracies": 0.875, "rewards/chosen": -0.29795393347740173, "rewards/margins": 6.4003520011901855, "rewards/rejected": -6.698306083679199, "step": 157 }, { "epoch": 0.09828926905132193, "grad_norm": 14.230854034423828, "learning_rate": 4.677777777777778e-06, "logits/chosen": -195.95489501953125, "logits/rejected": -13.855463027954102, "logps/chosen": -427.7440185546875, "logps/rejected": -1571.00244140625, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": -3.1748383045196533, "rewards/margins": 28.36606216430664, "rewards/rejected": -31.54090118408203, "step": 158 }, { "epoch": 0.09891135303265941, "grad_norm": 21.907337188720703, "learning_rate": 4.672222222222223e-06, "logits/chosen": -119.91964721679688, "logits/rejected": -4.829996109008789, "logps/chosen": -234.52195739746094, "logps/rejected": -1207.064697265625, "loss": 0.2061, "rewards/accuracies": 0.875, "rewards/chosen": -1.990342378616333, "rewards/margins": 16.696937561035156, "rewards/rejected": -18.687278747558594, "step": 159 }, { "epoch": 0.09953343701399689, "grad_norm": 2.368584632873535, "learning_rate": 4.666666666666667e-06, "logits/chosen": -60.153717041015625, "logits/rejected": 50.49053955078125, "logps/chosen": -436.2689208984375, "logps/rejected": -570.0453491210938, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -2.8190696239471436, "rewards/margins": 7.334978103637695, "rewards/rejected": -10.154047966003418, "step": 160 }, { "epoch": 0.10015552099533437, "grad_norm": 816.0321655273438, "learning_rate": 4.6611111111111116e-06, "logits/chosen": -80.4203109741211, "logits/rejected": -7.057236671447754, "logps/chosen": -1813.9412841796875, "logps/rejected": -1180.687255859375, "loss": 7.0566, "rewards/accuracies": 0.875, "rewards/chosen": -15.76583480834961, "rewards/margins": -0.5474824905395508, "rewards/rejected": -15.218353271484375, "step": 161 }, { "epoch": 0.10077760497667185, "grad_norm": 35.41217803955078, "learning_rate": 4.655555555555556e-06, "logits/chosen": -52.78041458129883, "logits/rejected": 120.70462799072266, "logps/chosen": -456.3648681640625, "logps/rejected": -698.6376342773438, "loss": 0.2869, "rewards/accuracies": 0.875, "rewards/chosen": -2.0043466091156006, "rewards/margins": 7.978643417358398, "rewards/rejected": -9.982990264892578, "step": 162 }, { "epoch": 0.10139968895800933, "grad_norm": 17.337677001953125, "learning_rate": 4.65e-06, "logits/chosen": -110.18732452392578, "logits/rejected": -37.3474006652832, "logps/chosen": -1515.3572998046875, "logps/rejected": -1715.7703857421875, "loss": 0.1318, "rewards/accuracies": 0.875, "rewards/chosen": -18.540019989013672, "rewards/margins": 10.801197052001953, "rewards/rejected": -29.341217041015625, "step": 163 }, { "epoch": 0.1020217729393468, "grad_norm": 16.449697494506836, "learning_rate": 4.644444444444445e-06, "logits/chosen": -31.45309066772461, "logits/rejected": 35.266944885253906, "logps/chosen": -405.5921630859375, "logps/rejected": -473.3642883300781, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": -2.4139010906219482, "rewards/margins": 6.1966552734375, "rewards/rejected": -8.610555648803711, "step": 164 }, { "epoch": 0.1026438569206843, "grad_norm": 9.951751708984375, "learning_rate": 4.638888888888889e-06, "logits/chosen": -92.63522338867188, "logits/rejected": 26.308361053466797, "logps/chosen": -356.34979248046875, "logps/rejected": -565.6304931640625, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": -2.23789381980896, "rewards/margins": 7.9019551277160645, "rewards/rejected": -10.139848709106445, "step": 165 }, { "epoch": 0.10326594090202178, "grad_norm": 0.2486131191253662, "learning_rate": 4.633333333333334e-06, "logits/chosen": -36.45603942871094, "logits/rejected": 130.2530975341797, "logps/chosen": -646.247314453125, "logps/rejected": -957.4547119140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.621523857116699, "rewards/margins": 8.341659545898438, "rewards/rejected": -13.963183403015137, "step": 166 }, { "epoch": 0.10388802488335926, "grad_norm": 30.210628509521484, "learning_rate": 4.627777777777778e-06, "logits/chosen": -131.19161987304688, "logits/rejected": 146.65216064453125, "logps/chosen": -477.0729675292969, "logps/rejected": -807.9055786132812, "loss": 0.1324, "rewards/accuracies": 0.875, "rewards/chosen": -2.876908779144287, "rewards/margins": 7.5301971435546875, "rewards/rejected": -10.407105445861816, "step": 167 }, { "epoch": 0.10451010886469674, "grad_norm": 2.3592190742492676, "learning_rate": 4.622222222222222e-06, "logits/chosen": -54.75543975830078, "logits/rejected": -2.629974365234375, "logps/chosen": -425.58154296875, "logps/rejected": -501.09234619140625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.056523561477661, "rewards/margins": 5.526520729064941, "rewards/rejected": -7.583044052124023, "step": 168 }, { "epoch": 0.10513219284603421, "grad_norm": 0.6539332866668701, "learning_rate": 4.616666666666667e-06, "logits/chosen": -228.02420043945312, "logits/rejected": 25.053728103637695, "logps/chosen": -297.6983337402344, "logps/rejected": -607.0726928710938, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.355224847793579, "rewards/margins": 7.341706275939941, "rewards/rejected": -8.696929931640625, "step": 169 }, { "epoch": 0.1057542768273717, "grad_norm": 23.737112045288086, "learning_rate": 4.611111111111112e-06, "logits/chosen": -61.03352355957031, "logits/rejected": -46.659637451171875, "logps/chosen": -472.5522766113281, "logps/rejected": -529.188232421875, "loss": 0.235, "rewards/accuracies": 0.875, "rewards/chosen": -2.0487112998962402, "rewards/margins": 5.43203067779541, "rewards/rejected": -7.48074197769165, "step": 170 }, { "epoch": 0.10637636080870917, "grad_norm": 10.78062629699707, "learning_rate": 4.605555555555556e-06, "logits/chosen": -219.480224609375, "logits/rejected": 16.399112701416016, "logps/chosen": -377.349365234375, "logps/rejected": -604.8502197265625, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": -0.027693569660186768, "rewards/margins": 7.55369758605957, "rewards/rejected": -7.581391334533691, "step": 171 }, { "epoch": 0.10699844479004665, "grad_norm": 48.85132598876953, "learning_rate": 4.600000000000001e-06, "logits/chosen": 10.033275604248047, "logits/rejected": 55.14773178100586, "logps/chosen": -453.04339599609375, "logps/rejected": -519.9110107421875, "loss": 0.7674, "rewards/accuracies": 0.75, "rewards/chosen": -4.891550540924072, "rewards/margins": 5.427800178527832, "rewards/rejected": -10.319350242614746, "step": 172 }, { "epoch": 0.10762052877138413, "grad_norm": 1.1792778968811035, "learning_rate": 4.594444444444444e-06, "logits/chosen": -57.03020477294922, "logits/rejected": 42.06708526611328, "logps/chosen": -499.9429626464844, "logps/rejected": -643.1578979492188, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.7095754146575928, "rewards/margins": 7.932033538818359, "rewards/rejected": -10.641609191894531, "step": 173 }, { "epoch": 0.10824261275272162, "grad_norm": 104.11849975585938, "learning_rate": 4.58888888888889e-06, "logits/chosen": -119.0609130859375, "logits/rejected": 32.29179382324219, "logps/chosen": -463.960693359375, "logps/rejected": -1497.199462890625, "loss": 0.6921, "rewards/accuracies": 0.875, "rewards/chosen": -2.7970821857452393, "rewards/margins": 30.111461639404297, "rewards/rejected": -32.90854263305664, "step": 174 }, { "epoch": 0.1088646967340591, "grad_norm": 0.41236698627471924, "learning_rate": 4.583333333333333e-06, "logits/chosen": -74.53089141845703, "logits/rejected": 65.25262451171875, "logps/chosen": -374.88031005859375, "logps/rejected": -681.6280517578125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -5.4716973304748535, "rewards/margins": 9.439125061035156, "rewards/rejected": -14.910822868347168, "step": 175 }, { "epoch": 0.10948678071539658, "grad_norm": 25.013166427612305, "learning_rate": 4.5777777777777785e-06, "logits/chosen": -39.65898132324219, "logits/rejected": -10.906882286071777, "logps/chosen": -488.16375732421875, "logps/rejected": -556.0191650390625, "loss": 0.2494, "rewards/accuracies": 0.875, "rewards/chosen": -1.7389155626296997, "rewards/margins": 5.059553146362305, "rewards/rejected": -6.798468589782715, "step": 176 }, { "epoch": 0.11010886469673406, "grad_norm": 44.22224807739258, "learning_rate": 4.572222222222222e-06, "logits/chosen": -80.19673919677734, "logits/rejected": 122.91427612304688, "logps/chosen": -436.0129699707031, "logps/rejected": -604.2403564453125, "loss": 0.6567, "rewards/accuracies": 0.875, "rewards/chosen": 0.7558496594429016, "rewards/margins": 8.272817611694336, "rewards/rejected": -7.516968727111816, "step": 177 }, { "epoch": 0.11073094867807154, "grad_norm": 2.2757623195648193, "learning_rate": 4.566666666666667e-06, "logits/chosen": -118.6231689453125, "logits/rejected": 9.284912109375, "logps/chosen": -171.06834411621094, "logps/rejected": -516.5938110351562, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -1.3611491918563843, "rewards/margins": 8.120443344116211, "rewards/rejected": -9.481593132019043, "step": 178 }, { "epoch": 0.11135303265940902, "grad_norm": 36.23075866699219, "learning_rate": 4.561111111111112e-06, "logits/chosen": -3.9724044799804688, "logits/rejected": 108.16490173339844, "logps/chosen": -585.9118041992188, "logps/rejected": -748.2664184570312, "loss": 0.9038, "rewards/accuracies": 0.875, "rewards/chosen": -7.5321125984191895, "rewards/margins": 5.860141277313232, "rewards/rejected": -13.392252922058105, "step": 179 }, { "epoch": 0.1119751166407465, "grad_norm": 44.442787170410156, "learning_rate": 4.555555555555556e-06, "logits/chosen": -120.70065307617188, "logits/rejected": 4.410699844360352, "logps/chosen": -402.6814880371094, "logps/rejected": -565.3322143554688, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": -0.7019529938697815, "rewards/margins": 3.8681015968322754, "rewards/rejected": -4.570054531097412, "step": 180 }, { "epoch": 0.11259720062208398, "grad_norm": 34.72930145263672, "learning_rate": 4.5500000000000005e-06, "logits/chosen": -112.9830322265625, "logits/rejected": 40.60749053955078, "logps/chosen": -760.9752197265625, "logps/rejected": -1223.5533447265625, "loss": 0.2344, "rewards/accuracies": 0.875, "rewards/chosen": -9.836041450500488, "rewards/margins": 13.805087089538574, "rewards/rejected": -23.641128540039062, "step": 181 }, { "epoch": 0.11321928460342146, "grad_norm": 0.49302956461906433, "learning_rate": 4.544444444444445e-06, "logits/chosen": -105.63148498535156, "logits/rejected": 89.42071533203125, "logps/chosen": -325.6039733886719, "logps/rejected": -632.583740234375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.4772449731826782, "rewards/margins": 12.680350303649902, "rewards/rejected": -14.15759563446045, "step": 182 }, { "epoch": 0.11384136858475895, "grad_norm": 1.4100648164749146, "learning_rate": 4.538888888888889e-06, "logits/chosen": -104.06034851074219, "logits/rejected": -40.527347564697266, "logps/chosen": -1607.376708984375, "logps/rejected": -1648.577392578125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -15.994466781616211, "rewards/margins": 12.004467010498047, "rewards/rejected": -27.99893569946289, "step": 183 }, { "epoch": 0.11446345256609643, "grad_norm": 0.9935240745544434, "learning_rate": 4.533333333333334e-06, "logits/chosen": -68.62060546875, "logits/rejected": -47.34330368041992, "logps/chosen": -445.2177429199219, "logps/rejected": -536.7042846679688, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.8180896043777466, "rewards/margins": 7.331015586853027, "rewards/rejected": -8.149105072021484, "step": 184 }, { "epoch": 0.1150855365474339, "grad_norm": 819.3656005859375, "learning_rate": 4.527777777777778e-06, "logits/chosen": -141.9098358154297, "logits/rejected": -60.83519744873047, "logps/chosen": -1649.9873046875, "logps/rejected": -2477.72802734375, "loss": 5.755, "rewards/accuracies": 0.875, "rewards/chosen": -15.289737701416016, "rewards/margins": 18.761110305786133, "rewards/rejected": -34.05084991455078, "step": 185 }, { "epoch": 0.11570762052877138, "grad_norm": 2.472080707550049, "learning_rate": 4.5222222222222225e-06, "logits/chosen": -100.92279052734375, "logits/rejected": 48.834712982177734, "logps/chosen": -541.3685913085938, "logps/rejected": -719.9791259765625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -3.0289676189422607, "rewards/margins": 6.541708946228027, "rewards/rejected": -9.570676803588867, "step": 186 }, { "epoch": 0.11632970451010886, "grad_norm": 4.140819549560547, "learning_rate": 4.516666666666667e-06, "logits/chosen": -53.79328155517578, "logits/rejected": 132.23597717285156, "logps/chosen": -408.4837646484375, "logps/rejected": -1409.4010009765625, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -1.4427204132080078, "rewards/margins": 8.537384986877441, "rewards/rejected": -9.98010540008545, "step": 187 }, { "epoch": 0.11695178849144634, "grad_norm": 6.938059329986572, "learning_rate": 4.511111111111111e-06, "logits/chosen": -66.68578338623047, "logits/rejected": 35.657230377197266, "logps/chosen": -515.35888671875, "logps/rejected": -1199.377685546875, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": -0.8689206838607788, "rewards/margins": 21.66095733642578, "rewards/rejected": -22.529876708984375, "step": 188 }, { "epoch": 0.11757387247278382, "grad_norm": 1.0334467887878418, "learning_rate": 4.505555555555556e-06, "logits/chosen": -29.12325668334961, "logits/rejected": 78.59813690185547, "logps/chosen": -366.9556579589844, "logps/rejected": -521.24951171875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.89388108253479, "rewards/margins": 9.782254219055176, "rewards/rejected": -12.676135063171387, "step": 189 }, { "epoch": 0.1181959564541213, "grad_norm": 562.7681884765625, "learning_rate": 4.5e-06, "logits/chosen": -19.161190032958984, "logits/rejected": -12.427164077758789, "logps/chosen": -1195.766845703125, "logps/rejected": -1653.085693359375, "loss": 6.006, "rewards/accuracies": 0.875, "rewards/chosen": -8.844135284423828, "rewards/margins": 13.617812156677246, "rewards/rejected": -22.46194839477539, "step": 190 }, { "epoch": 0.1188180404354588, "grad_norm": 9.043701171875, "learning_rate": 4.4944444444444445e-06, "logits/chosen": -123.5665283203125, "logits/rejected": 28.78691864013672, "logps/chosen": -151.1602020263672, "logps/rejected": -429.7079162597656, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": -0.21753016114234924, "rewards/margins": 9.149826049804688, "rewards/rejected": -9.367356300354004, "step": 191 }, { "epoch": 0.11944012441679627, "grad_norm": 0.20668452978134155, "learning_rate": 4.488888888888889e-06, "logits/chosen": -160.41571044921875, "logits/rejected": 35.9819221496582, "logps/chosen": -422.51019287109375, "logps/rejected": -734.3595581054688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.0518970489501953, "rewards/margins": 10.355688095092773, "rewards/rejected": -13.407585144042969, "step": 192 }, { "epoch": 0.12006220839813375, "grad_norm": 1208.0975341796875, "learning_rate": 4.483333333333333e-06, "logits/chosen": -72.91759490966797, "logits/rejected": 82.73658752441406, "logps/chosen": -360.498779296875, "logps/rejected": -1821.1766357421875, "loss": 1.7645, "rewards/accuracies": 0.875, "rewards/chosen": -2.279343366622925, "rewards/margins": 7.21323299407959, "rewards/rejected": -9.492576599121094, "step": 193 }, { "epoch": 0.12068429237947123, "grad_norm": 5.3821210861206055, "learning_rate": 4.477777777777778e-06, "logits/chosen": -102.9739990234375, "logits/rejected": 21.03948974609375, "logps/chosen": -446.7309875488281, "logps/rejected": -665.4340209960938, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -2.5288028717041016, "rewards/margins": 8.07142448425293, "rewards/rejected": -10.600227355957031, "step": 194 }, { "epoch": 0.12130637636080871, "grad_norm": 39.19600296020508, "learning_rate": 4.472222222222223e-06, "logits/chosen": -94.78016662597656, "logits/rejected": -7.096273422241211, "logps/chosen": -334.62451171875, "logps/rejected": -553.868408203125, "loss": 0.245, "rewards/accuracies": 0.875, "rewards/chosen": -1.0174301862716675, "rewards/margins": 6.497749328613281, "rewards/rejected": -7.5151801109313965, "step": 195 }, { "epoch": 0.12192846034214619, "grad_norm": 44.08115005493164, "learning_rate": 4.4666666666666665e-06, "logits/chosen": -42.81006622314453, "logits/rejected": 57.45598220825195, "logps/chosen": -537.6096801757812, "logps/rejected": -653.092529296875, "loss": 0.1469, "rewards/accuracies": 0.875, "rewards/chosen": -5.298211574554443, "rewards/margins": 6.989678859710693, "rewards/rejected": -12.287890434265137, "step": 196 }, { "epoch": 0.12255054432348367, "grad_norm": 42.237674713134766, "learning_rate": 4.461111111111112e-06, "logits/chosen": -75.44426727294922, "logits/rejected": -84.54901123046875, "logps/chosen": -476.52142333984375, "logps/rejected": -384.2850646972656, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": -1.897087812423706, "rewards/margins": 4.667360782623291, "rewards/rejected": -6.564448356628418, "step": 197 }, { "epoch": 0.12317262830482115, "grad_norm": 13.51016902923584, "learning_rate": 4.455555555555555e-06, "logits/chosen": 81.34991455078125, "logits/rejected": -6.475566864013672, "logps/chosen": -593.2236938476562, "logps/rejected": -469.53924560546875, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -1.973331332206726, "rewards/margins": 7.563385009765625, "rewards/rejected": -9.53671646118164, "step": 198 }, { "epoch": 0.12379471228615863, "grad_norm": 7.882123947143555, "learning_rate": 4.450000000000001e-06, "logits/chosen": -129.04055786132812, "logits/rejected": -115.16475677490234, "logps/chosen": -1033.7010498046875, "logps/rejected": -1351.49609375, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -4.89893913269043, "rewards/margins": 8.966411590576172, "rewards/rejected": -13.865349769592285, "step": 199 }, { "epoch": 0.12441679626749612, "grad_norm": 2.0412981510162354, "learning_rate": 4.444444444444444e-06, "logits/chosen": -83.5471420288086, "logits/rejected": 53.061832427978516, "logps/chosen": -354.264892578125, "logps/rejected": -480.20465087890625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -3.1932616233825684, "rewards/margins": 8.848127365112305, "rewards/rejected": -12.041389465332031, "step": 200 }, { "epoch": 0.12503888024883358, "grad_norm": 1271.6708984375, "learning_rate": 4.438888888888889e-06, "logits/chosen": -134.76922607421875, "logits/rejected": 53.32666778564453, "logps/chosen": -1418.3175048828125, "logps/rejected": -1816.8594970703125, "loss": 1.7386, "rewards/accuracies": 0.875, "rewards/chosen": -7.28961181640625, "rewards/margins": 10.436012268066406, "rewards/rejected": -17.725624084472656, "step": 201 }, { "epoch": 0.12566096423017106, "grad_norm": 4.561136245727539, "learning_rate": 4.433333333333334e-06, "logits/chosen": -188.95184326171875, "logits/rejected": -5.911472797393799, "logps/chosen": -438.9146728515625, "logps/rejected": -1536.612060546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.267402172088623, "rewards/margins": 14.836353302001953, "rewards/rejected": -18.103755950927734, "step": 202 }, { "epoch": 0.12628304821150854, "grad_norm": 0.6827449798583984, "learning_rate": 4.427777777777778e-06, "logits/chosen": -20.990068435668945, "logits/rejected": 17.401819229125977, "logps/chosen": -1516.034423828125, "logps/rejected": -1240.0006103515625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.42390775680542, "rewards/margins": 8.713464736938477, "rewards/rejected": -11.137372016906738, "step": 203 }, { "epoch": 0.12690513219284602, "grad_norm": 172.34103393554688, "learning_rate": 4.422222222222223e-06, "logits/chosen": -261.39410400390625, "logits/rejected": 19.426868438720703, "logps/chosen": -598.462646484375, "logps/rejected": -1468.048583984375, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": -1.3650782108306885, "rewards/margins": 11.839160919189453, "rewards/rejected": -13.204238891601562, "step": 204 }, { "epoch": 0.12752721617418353, "grad_norm": 0.18030370771884918, "learning_rate": 4.416666666666667e-06, "logits/chosen": 37.76616668701172, "logits/rejected": 159.8732452392578, "logps/chosen": -309.0323486328125, "logps/rejected": -467.54339599609375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.8101727962493896, "rewards/margins": 9.078652381896973, "rewards/rejected": -12.888824462890625, "step": 205 }, { "epoch": 0.128149300155521, "grad_norm": 3.521991491317749, "learning_rate": 4.411111111111111e-06, "logits/chosen": -99.75079345703125, "logits/rejected": -55.2625732421875, "logps/chosen": -445.90338134765625, "logps/rejected": -552.7967529296875, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": -1.2001302242279053, "rewards/margins": 9.562485694885254, "rewards/rejected": -10.762616157531738, "step": 206 }, { "epoch": 0.12877138413685849, "grad_norm": 19.420740127563477, "learning_rate": 4.405555555555556e-06, "logits/chosen": -26.671131134033203, "logits/rejected": 144.05999755859375, "logps/chosen": -497.43182373046875, "logps/rejected": -1046.5242919921875, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": -5.32816743850708, "rewards/margins": 12.222490310668945, "rewards/rejected": -17.5506591796875, "step": 207 }, { "epoch": 0.12939346811819596, "grad_norm": 0.9553231000900269, "learning_rate": 4.4e-06, "logits/chosen": -62.89033126831055, "logits/rejected": 55.89208984375, "logps/chosen": -332.24432373046875, "logps/rejected": -474.091064453125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.8135929107666016, "rewards/margins": 8.67721176147461, "rewards/rejected": -11.490804672241211, "step": 208 }, { "epoch": 0.13001555209953344, "grad_norm": 1.8421519994735718, "learning_rate": 4.3944444444444455e-06, "logits/chosen": -70.02822875976562, "logits/rejected": 24.179702758789062, "logps/chosen": -423.72821044921875, "logps/rejected": -620.8206176757812, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.760422945022583, "rewards/margins": 11.215438842773438, "rewards/rejected": -13.975862503051758, "step": 209 }, { "epoch": 0.13063763608087092, "grad_norm": 1.8452153205871582, "learning_rate": 4.388888888888889e-06, "logits/chosen": -115.5679931640625, "logits/rejected": -9.19500732421875, "logps/chosen": -544.1236572265625, "logps/rejected": -677.5909423828125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -4.625911712646484, "rewards/margins": 9.667537689208984, "rewards/rejected": -14.293450355529785, "step": 210 }, { "epoch": 0.1312597200622084, "grad_norm": 3.6674461364746094, "learning_rate": 4.383333333333334e-06, "logits/chosen": -69.38706970214844, "logits/rejected": 7.140203475952148, "logps/chosen": -485.5529479980469, "logps/rejected": -714.7760009765625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.154878616333008, "rewards/margins": 12.349056243896484, "rewards/rejected": -15.503934860229492, "step": 211 }, { "epoch": 0.13188180404354588, "grad_norm": 707.322021484375, "learning_rate": 4.377777777777778e-06, "logits/chosen": -47.84323501586914, "logits/rejected": -25.143016815185547, "logps/chosen": -1287.410400390625, "logps/rejected": -1123.915771484375, "loss": 1.3878, "rewards/accuracies": 0.875, "rewards/chosen": -8.845760345458984, "rewards/margins": 5.70574951171875, "rewards/rejected": -14.551508903503418, "step": 212 }, { "epoch": 0.13250388802488336, "grad_norm": 91.04940795898438, "learning_rate": 4.372222222222223e-06, "logits/chosen": -22.665477752685547, "logits/rejected": -78.15141296386719, "logps/chosen": -535.7206420898438, "logps/rejected": -520.3279418945312, "loss": 1.3858, "rewards/accuracies": 0.625, "rewards/chosen": -4.796276092529297, "rewards/margins": 7.679721355438232, "rewards/rejected": -12.475997924804688, "step": 213 }, { "epoch": 0.13312597200622084, "grad_norm": 950.543212890625, "learning_rate": 4.366666666666667e-06, "logits/chosen": -81.86799621582031, "logits/rejected": 55.01408004760742, "logps/chosen": -674.6660766601562, "logps/rejected": -1881.266357421875, "loss": 1.3101, "rewards/accuracies": 0.875, "rewards/chosen": -4.164730072021484, "rewards/margins": 4.798211097717285, "rewards/rejected": -8.962940216064453, "step": 214 }, { "epoch": 0.13374805598755832, "grad_norm": 13.955273628234863, "learning_rate": 4.361111111111112e-06, "logits/chosen": -88.04432678222656, "logits/rejected": 83.24031066894531, "logps/chosen": -434.697509765625, "logps/rejected": -708.8373413085938, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -4.179958343505859, "rewards/margins": 10.543561935424805, "rewards/rejected": -14.723520278930664, "step": 215 }, { "epoch": 0.1343701399688958, "grad_norm": 20.67734718322754, "learning_rate": 4.3555555555555555e-06, "logits/chosen": -200.67318725585938, "logits/rejected": -116.87725830078125, "logps/chosen": -389.1851501464844, "logps/rejected": -1283.8551025390625, "loss": 0.6108, "rewards/accuracies": 0.875, "rewards/chosen": -4.204230785369873, "rewards/margins": 13.91234302520752, "rewards/rejected": -18.116573333740234, "step": 216 }, { "epoch": 0.13499222395023328, "grad_norm": 23.113025665283203, "learning_rate": 4.350000000000001e-06, "logits/chosen": -79.14599609375, "logits/rejected": 38.11246871948242, "logps/chosen": -572.5641479492188, "logps/rejected": -1106.51806640625, "loss": 0.1143, "rewards/accuracies": 0.875, "rewards/chosen": -5.014432907104492, "rewards/margins": 8.986339569091797, "rewards/rejected": -14.000772476196289, "step": 217 }, { "epoch": 0.13561430793157075, "grad_norm": 1.0002957582473755, "learning_rate": 4.344444444444445e-06, "logits/chosen": -36.13572692871094, "logits/rejected": 35.47079086303711, "logps/chosen": -267.2012023925781, "logps/rejected": -376.1571960449219, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.531876564025879, "rewards/margins": 9.571502685546875, "rewards/rejected": -12.10338020324707, "step": 218 }, { "epoch": 0.13623639191290823, "grad_norm": 1.1016252040863037, "learning_rate": 4.3388888888888895e-06, "logits/chosen": -55.88152313232422, "logits/rejected": 32.44096374511719, "logps/chosen": -459.1560974121094, "logps/rejected": -640.2955322265625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -6.297446250915527, "rewards/margins": 7.258005142211914, "rewards/rejected": -13.555451393127441, "step": 219 }, { "epoch": 0.1368584758942457, "grad_norm": 10.77340030670166, "learning_rate": 4.333333333333334e-06, "logits/chosen": -40.64595413208008, "logits/rejected": 83.64126586914062, "logps/chosen": -470.32366943359375, "logps/rejected": -638.5794677734375, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": -5.752457141876221, "rewards/margins": 9.254434585571289, "rewards/rejected": -15.006892204284668, "step": 220 }, { "epoch": 0.1374805598755832, "grad_norm": 15.683928489685059, "learning_rate": 4.327777777777778e-06, "logits/chosen": -36.521583557128906, "logits/rejected": 107.06440734863281, "logps/chosen": -482.6795349121094, "logps/rejected": -681.322998046875, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": -2.4665985107421875, "rewards/margins": 9.241741180419922, "rewards/rejected": -11.708341598510742, "step": 221 }, { "epoch": 0.13810264385692067, "grad_norm": 0.00209957268089056, "learning_rate": 4.322222222222223e-06, "logits/chosen": -156.964599609375, "logits/rejected": 68.1474838256836, "logps/chosen": -308.63250732421875, "logps/rejected": -1433.441162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.345400810241699, "rewards/margins": 14.245375633239746, "rewards/rejected": -16.590776443481445, "step": 222 }, { "epoch": 0.13872472783825818, "grad_norm": 1.0401504039764404, "learning_rate": 4.316666666666667e-06, "logits/chosen": -62.208839416503906, "logits/rejected": 59.34383010864258, "logps/chosen": -293.83642578125, "logps/rejected": -450.13824462890625, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.0725646018981934, "rewards/margins": 9.725111961364746, "rewards/rejected": -12.797677040100098, "step": 223 }, { "epoch": 0.13934681181959566, "grad_norm": 0.1062418669462204, "learning_rate": 4.3111111111111115e-06, "logits/chosen": -252.36419677734375, "logits/rejected": -35.19046401977539, "logps/chosen": -193.02786254882812, "logps/rejected": -581.735595703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.163696765899658, "rewards/margins": 14.537510871887207, "rewards/rejected": -16.70120620727539, "step": 224 }, { "epoch": 0.13996889580093314, "grad_norm": 1363.5177001953125, "learning_rate": 4.305555555555556e-06, "logits/chosen": -224.72604370117188, "logits/rejected": -12.077668190002441, "logps/chosen": -483.1179504394531, "logps/rejected": -1253.017578125, "loss": 1.0296, "rewards/accuracies": 0.875, "rewards/chosen": -3.0739872455596924, "rewards/margins": 9.360611915588379, "rewards/rejected": -12.434598922729492, "step": 225 }, { "epoch": 0.14059097978227061, "grad_norm": 178.8345489501953, "learning_rate": 4.3e-06, "logits/chosen": -36.46290588378906, "logits/rejected": 73.2750244140625, "logps/chosen": -657.7337646484375, "logps/rejected": -904.173095703125, "loss": 1.3975, "rewards/accuracies": 0.75, "rewards/chosen": -6.523371696472168, "rewards/margins": 4.747688293457031, "rewards/rejected": -11.2710599899292, "step": 226 }, { "epoch": 0.1412130637636081, "grad_norm": 1.2638640403747559, "learning_rate": 4.294444444444445e-06, "logits/chosen": -69.05242919921875, "logits/rejected": -23.688312530517578, "logps/chosen": -483.412841796875, "logps/rejected": -598.4628295898438, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.1640727519989014, "rewards/margins": 12.293594360351562, "rewards/rejected": -14.457666397094727, "step": 227 }, { "epoch": 0.14183514774494557, "grad_norm": 1.8287287950515747, "learning_rate": 4.288888888888889e-06, "logits/chosen": -81.38130950927734, "logits/rejected": 15.612325668334961, "logps/chosen": -223.4542236328125, "logps/rejected": -1206.806396484375, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -2.142986536026001, "rewards/margins": 12.141874313354492, "rewards/rejected": -14.284858703613281, "step": 228 }, { "epoch": 0.14245723172628305, "grad_norm": 0.03576798364520073, "learning_rate": 4.2833333333333335e-06, "logits/chosen": -145.61241149902344, "logits/rejected": 48.79133605957031, "logps/chosen": -311.8283996582031, "logps/rejected": -646.3853759765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.087911605834961, "rewards/margins": 12.079728126525879, "rewards/rejected": -15.16763973236084, "step": 229 }, { "epoch": 0.14307931570762053, "grad_norm": 60.046142578125, "learning_rate": 4.277777777777778e-06, "logits/chosen": -123.12039184570312, "logits/rejected": 73.53103637695312, "logps/chosen": -417.9576416015625, "logps/rejected": -689.488037109375, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": -6.67849588394165, "rewards/margins": 13.664691925048828, "rewards/rejected": -20.343189239501953, "step": 230 }, { "epoch": 0.143701399688958, "grad_norm": 395.104736328125, "learning_rate": 4.272222222222222e-06, "logits/chosen": -133.56097412109375, "logits/rejected": 23.526477813720703, "logps/chosen": -1087.7457275390625, "logps/rejected": -1235.45361328125, "loss": 1.5558, "rewards/accuracies": 0.875, "rewards/chosen": -14.986712455749512, "rewards/margins": 7.4186811447143555, "rewards/rejected": -22.405393600463867, "step": 231 }, { "epoch": 0.1443234836702955, "grad_norm": 0.46897974610328674, "learning_rate": 4.266666666666668e-06, "logits/chosen": -112.3342056274414, "logits/rejected": 54.664878845214844, "logps/chosen": -360.5389404296875, "logps/rejected": -606.0662231445312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.058225631713867, "rewards/margins": 11.536385536193848, "rewards/rejected": -16.59461212158203, "step": 232 }, { "epoch": 0.14494556765163297, "grad_norm": 0.020245853811502457, "learning_rate": 4.261111111111111e-06, "logits/chosen": -89.12425231933594, "logits/rejected": -1.3640751838684082, "logps/chosen": -440.0721435546875, "logps/rejected": -1086.8466796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.6582441329956055, "rewards/margins": 20.80549430847168, "rewards/rejected": -26.46373748779297, "step": 233 }, { "epoch": 0.14556765163297045, "grad_norm": 0.6324901580810547, "learning_rate": 4.255555555555556e-06, "logits/chosen": -26.195343017578125, "logits/rejected": 1.507059097290039, "logps/chosen": -444.99249267578125, "logps/rejected": -547.2989501953125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -5.180506706237793, "rewards/margins": 11.823253631591797, "rewards/rejected": -17.003761291503906, "step": 234 }, { "epoch": 0.14618973561430793, "grad_norm": 4.431450366973877, "learning_rate": 4.25e-06, "logits/chosen": -58.11582565307617, "logits/rejected": -88.2032470703125, "logps/chosen": -590.2505493164062, "logps/rejected": -601.1549072265625, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -5.642101287841797, "rewards/margins": 10.171136856079102, "rewards/rejected": -15.813238143920898, "step": 235 }, { "epoch": 0.1468118195956454, "grad_norm": 48.918113708496094, "learning_rate": 4.244444444444445e-06, "logits/chosen": -89.24057006835938, "logits/rejected": -12.015077590942383, "logps/chosen": -332.2572021484375, "logps/rejected": -474.6900329589844, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": -3.3754212856292725, "rewards/margins": 6.634277820587158, "rewards/rejected": -10.009698867797852, "step": 236 }, { "epoch": 0.14743390357698288, "grad_norm": 12.535520553588867, "learning_rate": 4.238888888888889e-06, "logits/chosen": -108.12762451171875, "logits/rejected": 21.804882049560547, "logps/chosen": -495.7753601074219, "logps/rejected": -1685.7740478515625, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -7.691643714904785, "rewards/margins": 31.041622161865234, "rewards/rejected": -38.7332649230957, "step": 237 }, { "epoch": 0.14805598755832036, "grad_norm": 48.46318054199219, "learning_rate": 4.233333333333334e-06, "logits/chosen": -133.13714599609375, "logits/rejected": 3.3474159240722656, "logps/chosen": -393.3863830566406, "logps/rejected": -1669.630615234375, "loss": 0.8408, "rewards/accuracies": 0.875, "rewards/chosen": -7.554618835449219, "rewards/margins": 26.56247329711914, "rewards/rejected": -34.117095947265625, "step": 238 }, { "epoch": 0.14867807153965784, "grad_norm": 1.408521294593811, "learning_rate": 4.227777777777778e-06, "logits/chosen": -124.53326416015625, "logits/rejected": 36.056739807128906, "logps/chosen": -613.0487060546875, "logps/rejected": -918.08349609375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -4.068056583404541, "rewards/margins": 16.9825382232666, "rewards/rejected": -21.050594329833984, "step": 239 }, { "epoch": 0.14930015552099535, "grad_norm": 0.09460745751857758, "learning_rate": 4.222222222222223e-06, "logits/chosen": -152.53660583496094, "logits/rejected": -84.72179412841797, "logps/chosen": -371.6024169921875, "logps/rejected": -632.5328369140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.674338340759277, "rewards/margins": 16.2750186920166, "rewards/rejected": -20.949356079101562, "step": 240 }, { "epoch": 0.14992223950233283, "grad_norm": 3.636526346206665, "learning_rate": 4.216666666666667e-06, "logits/chosen": -56.533790588378906, "logits/rejected": -4.415596961975098, "logps/chosen": -358.1724853515625, "logps/rejected": -541.024169921875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -6.470689296722412, "rewards/margins": 11.451211929321289, "rewards/rejected": -17.92190170288086, "step": 241 }, { "epoch": 0.1505443234836703, "grad_norm": 47.824302673339844, "learning_rate": 4.211111111111112e-06, "logits/chosen": -91.47882843017578, "logits/rejected": -69.10425567626953, "logps/chosen": -540.6019897460938, "logps/rejected": -690.1019287109375, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -9.745138168334961, "rewards/margins": 8.721460342407227, "rewards/rejected": -18.466596603393555, "step": 242 }, { "epoch": 0.15116640746500778, "grad_norm": 0.13309380412101746, "learning_rate": 4.205555555555556e-06, "logits/chosen": -193.06991577148438, "logits/rejected": -33.58900451660156, "logps/chosen": -478.8428039550781, "logps/rejected": -748.1595458984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.902139663696289, "rewards/margins": 14.91961669921875, "rewards/rejected": -19.821758270263672, "step": 243 }, { "epoch": 0.15178849144634526, "grad_norm": 224.40997314453125, "learning_rate": 4.2000000000000004e-06, "logits/chosen": 24.04697036743164, "logits/rejected": 22.431787490844727, "logps/chosen": -1376.851806640625, "logps/rejected": -1297.66064453125, "loss": 0.598, "rewards/accuracies": 0.875, "rewards/chosen": -17.364931106567383, "rewards/margins": 11.012131690979004, "rewards/rejected": -28.377063751220703, "step": 244 }, { "epoch": 0.15241057542768274, "grad_norm": 18.00416374206543, "learning_rate": 4.194444444444445e-06, "logits/chosen": -43.32766342163086, "logits/rejected": 41.49739456176758, "logps/chosen": -824.9656982421875, "logps/rejected": -1648.3134765625, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": -12.045077323913574, "rewards/margins": 22.89803123474121, "rewards/rejected": -34.94310760498047, "step": 245 }, { "epoch": 0.15303265940902022, "grad_norm": 29.11594009399414, "learning_rate": 4.188888888888889e-06, "logits/chosen": -76.60344696044922, "logits/rejected": -10.571682929992676, "logps/chosen": -374.260986328125, "logps/rejected": -524.7108764648438, "loss": 0.1618, "rewards/accuracies": 0.875, "rewards/chosen": -6.047565937042236, "rewards/margins": 11.70809268951416, "rewards/rejected": -17.755657196044922, "step": 246 }, { "epoch": 0.1536547433903577, "grad_norm": 16.381990432739258, "learning_rate": 4.183333333333334e-06, "logits/chosen": -53.29045486450195, "logits/rejected": 105.15092468261719, "logps/chosen": -647.8858032226562, "logps/rejected": -802.72802734375, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": -6.4345293045043945, "rewards/margins": 10.723045349121094, "rewards/rejected": -17.157573699951172, "step": 247 }, { "epoch": 0.15427682737169518, "grad_norm": 0.14272263646125793, "learning_rate": 4.177777777777778e-06, "logits/chosen": -95.52243041992188, "logits/rejected": 40.01774597167969, "logps/chosen": -889.985595703125, "logps/rejected": -1533.0863037109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -18.64466667175293, "rewards/margins": 24.226701736450195, "rewards/rejected": -42.87137222290039, "step": 248 }, { "epoch": 0.15489891135303266, "grad_norm": 394.806884765625, "learning_rate": 4.1722222222222225e-06, "logits/chosen": -102.8668212890625, "logits/rejected": -13.423986434936523, "logps/chosen": -1345.7633056640625, "logps/rejected": -2521.29443359375, "loss": 9.1083, "rewards/accuracies": 0.875, "rewards/chosen": -25.63570785522461, "rewards/margins": 19.62613296508789, "rewards/rejected": -45.2618408203125, "step": 249 }, { "epoch": 0.15552099533437014, "grad_norm": 0.2909677028656006, "learning_rate": 4.166666666666667e-06, "logits/chosen": 8.441726684570312, "logits/rejected": 63.819305419921875, "logps/chosen": -931.527587890625, "logps/rejected": -1163.916259765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -20.98105239868164, "rewards/margins": 13.163368225097656, "rewards/rejected": -34.1444206237793, "step": 250 }, { "epoch": 0.15614307931570762, "grad_norm": 1.7748571634292603, "learning_rate": 4.161111111111111e-06, "logits/chosen": -49.30294418334961, "logits/rejected": 16.479263305664062, "logps/chosen": -438.07220458984375, "logps/rejected": -631.3031005859375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -7.326571464538574, "rewards/margins": 13.786811828613281, "rewards/rejected": -21.113384246826172, "step": 251 }, { "epoch": 0.1567651632970451, "grad_norm": 0.49761223793029785, "learning_rate": 4.155555555555556e-06, "logits/chosen": -49.62656784057617, "logits/rejected": -22.667827606201172, "logps/chosen": -660.537841796875, "logps/rejected": -695.955810546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.2384819984436035, "rewards/margins": 15.621650695800781, "rewards/rejected": -21.860132217407227, "step": 252 }, { "epoch": 0.15738724727838257, "grad_norm": 0.022095149382948875, "learning_rate": 4.15e-06, "logits/chosen": -37.41250228881836, "logits/rejected": 36.46703338623047, "logps/chosen": -1039.084716796875, "logps/rejected": -2234.86328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.223960876464844, "rewards/margins": 42.786781311035156, "rewards/rejected": -66.0107421875, "step": 253 }, { "epoch": 0.15800933125972005, "grad_norm": 0.012150655500590801, "learning_rate": 4.1444444444444445e-06, "logits/chosen": -155.50177001953125, "logits/rejected": -13.029396057128906, "logps/chosen": -1377.6597900390625, "logps/rejected": -1830.522216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.517871856689453, "rewards/margins": 17.041025161743164, "rewards/rejected": -47.558895111083984, "step": 254 }, { "epoch": 0.15863141524105753, "grad_norm": 0.007127499207854271, "learning_rate": 4.138888888888889e-06, "logits/chosen": -139.62171936035156, "logits/rejected": 15.012825012207031, "logps/chosen": -202.41717529296875, "logps/rejected": -493.1833190917969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.7710065841674805, "rewards/margins": 15.728225708007812, "rewards/rejected": -20.49923324584961, "step": 255 }, { "epoch": 0.159253499222395, "grad_norm": 0.22433629631996155, "learning_rate": 4.133333333333333e-06, "logits/chosen": -50.39942932128906, "logits/rejected": 44.610008239746094, "logps/chosen": -1186.3587646484375, "logps/rejected": -1802.1329345703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.523059844970703, "rewards/margins": 25.886259078979492, "rewards/rejected": -37.40931701660156, "step": 256 }, { "epoch": 0.1598755832037325, "grad_norm": 21.162879943847656, "learning_rate": 4.1277777777777785e-06, "logits/chosen": -137.6605682373047, "logits/rejected": 34.9881706237793, "logps/chosen": -1174.3975830078125, "logps/rejected": -1637.04150390625, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": -29.67385482788086, "rewards/margins": 24.67789649963379, "rewards/rejected": -54.351749420166016, "step": 257 }, { "epoch": 0.16049766718507, "grad_norm": 0.5918627381324768, "learning_rate": 4.122222222222222e-06, "logits/chosen": -157.87362670898438, "logits/rejected": -65.07425689697266, "logps/chosen": -666.122314453125, "logps/rejected": -1682.2291259765625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.591352462768555, "rewards/margins": 17.436546325683594, "rewards/rejected": -27.02789878845215, "step": 258 }, { "epoch": 0.16111975116640748, "grad_norm": 0.0013033996801823378, "learning_rate": 4.116666666666667e-06, "logits/chosen": -95.3739242553711, "logits/rejected": 50.27738952636719, "logps/chosen": -714.5494384765625, "logps/rejected": -1158.992431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.566201210021973, "rewards/margins": 16.695865631103516, "rewards/rejected": -31.262065887451172, "step": 259 }, { "epoch": 0.16174183514774496, "grad_norm": 57.698577880859375, "learning_rate": 4.111111111111111e-06, "logits/chosen": -58.82536315917969, "logits/rejected": -46.63689422607422, "logps/chosen": -538.1187744140625, "logps/rejected": -643.44091796875, "loss": 0.3719, "rewards/accuracies": 0.875, "rewards/chosen": -9.514969825744629, "rewards/margins": 10.724163055419922, "rewards/rejected": -20.239133834838867, "step": 260 }, { "epoch": 0.16236391912908243, "grad_norm": 3.197234869003296, "learning_rate": 4.105555555555556e-06, "logits/chosen": -55.260223388671875, "logits/rejected": 64.656494140625, "logps/chosen": -399.9578857421875, "logps/rejected": -612.0451049804688, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -10.918329238891602, "rewards/margins": 13.628928184509277, "rewards/rejected": -24.547256469726562, "step": 261 }, { "epoch": 0.1629860031104199, "grad_norm": 0.0011677710572257638, "learning_rate": 4.1e-06, "logits/chosen": -185.35882568359375, "logits/rejected": -12.751130104064941, "logps/chosen": -780.572509765625, "logps/rejected": -1255.75927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.761516571044922, "rewards/margins": 20.07486343383789, "rewards/rejected": -32.83638000488281, "step": 262 }, { "epoch": 0.1636080870917574, "grad_norm": 377.3010559082031, "learning_rate": 4.094444444444445e-06, "logits/chosen": -64.6948013305664, "logits/rejected": -78.46444702148438, "logps/chosen": -1579.6298828125, "logps/rejected": -1329.2633056640625, "loss": 3.3402, "rewards/accuracies": 0.75, "rewards/chosen": -31.966421127319336, "rewards/margins": 11.36286449432373, "rewards/rejected": -43.32928466796875, "step": 263 }, { "epoch": 0.16423017107309487, "grad_norm": 931.4512939453125, "learning_rate": 4.088888888888889e-06, "logits/chosen": -75.25047302246094, "logits/rejected": -163.8924560546875, "logps/chosen": -1551.2550048828125, "logps/rejected": -700.6729736328125, "loss": 5.3763, "rewards/accuracies": 0.75, "rewards/chosen": -14.774190902709961, "rewards/margins": 2.478893756866455, "rewards/rejected": -17.25308609008789, "step": 264 }, { "epoch": 0.16485225505443235, "grad_norm": 0.10275363177061081, "learning_rate": 4.083333333333334e-06, "logits/chosen": -177.67669677734375, "logits/rejected": 70.0986099243164, "logps/chosen": -252.44256591796875, "logps/rejected": -627.2208251953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.116037368774414, "rewards/margins": 12.965600967407227, "rewards/rejected": -20.08163833618164, "step": 265 }, { "epoch": 0.16547433903576983, "grad_norm": 52.540992736816406, "learning_rate": 4.077777777777778e-06, "logits/chosen": -146.22366333007812, "logits/rejected": -26.168853759765625, "logps/chosen": -359.4095458984375, "logps/rejected": -840.5860595703125, "loss": 0.8446, "rewards/accuracies": 0.875, "rewards/chosen": -5.002624034881592, "rewards/margins": 17.389392852783203, "rewards/rejected": -22.392017364501953, "step": 266 }, { "epoch": 0.1660964230171073, "grad_norm": 24.875370025634766, "learning_rate": 4.0722222222222226e-06, "logits/chosen": -53.2791633605957, "logits/rejected": 73.49838256835938, "logps/chosen": -528.3607788085938, "logps/rejected": -697.6058349609375, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": -7.979475021362305, "rewards/margins": 12.397570610046387, "rewards/rejected": -20.377046585083008, "step": 267 }, { "epoch": 0.1667185069984448, "grad_norm": 78.24815368652344, "learning_rate": 4.066666666666667e-06, "logits/chosen": -133.01332092285156, "logits/rejected": 38.480079650878906, "logps/chosen": -1328.702880859375, "logps/rejected": -2605.635498046875, "loss": 1.1055, "rewards/accuracies": 0.875, "rewards/chosen": -18.64280128479004, "rewards/margins": 16.73213768005371, "rewards/rejected": -35.37493896484375, "step": 268 }, { "epoch": 0.16734059097978227, "grad_norm": 37.84096145629883, "learning_rate": 4.061111111111111e-06, "logits/chosen": -48.04667663574219, "logits/rejected": 59.057701110839844, "logps/chosen": -546.296142578125, "logps/rejected": -813.5984497070312, "loss": 0.1912, "rewards/accuracies": 0.875, "rewards/chosen": -7.109343528747559, "rewards/margins": 15.374835968017578, "rewards/rejected": -22.48417854309082, "step": 269 }, { "epoch": 0.16796267496111975, "grad_norm": 0.290396124124527, "learning_rate": 4.055555555555556e-06, "logits/chosen": -44.20844650268555, "logits/rejected": 122.50775146484375, "logps/chosen": -439.98260498046875, "logps/rejected": -635.5829467773438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.673893451690674, "rewards/margins": 14.905357360839844, "rewards/rejected": -22.57925033569336, "step": 270 }, { "epoch": 0.16858475894245722, "grad_norm": 48.94499588012695, "learning_rate": 4.05e-06, "logits/chosen": -55.466094970703125, "logits/rejected": 51.63720703125, "logps/chosen": -298.3916015625, "logps/rejected": -547.94091796875, "loss": 0.1554, "rewards/accuracies": 0.875, "rewards/chosen": -6.9176025390625, "rewards/margins": 13.484373092651367, "rewards/rejected": -20.401975631713867, "step": 271 }, { "epoch": 0.1692068429237947, "grad_norm": 59.389888763427734, "learning_rate": 4.044444444444445e-06, "logits/chosen": -21.730342864990234, "logits/rejected": 99.76119995117188, "logps/chosen": -521.2410278320312, "logps/rejected": -1422.240478515625, "loss": 0.6392, "rewards/accuracies": 0.875, "rewards/chosen": -6.905114650726318, "rewards/margins": 23.20140266418457, "rewards/rejected": -30.106517791748047, "step": 272 }, { "epoch": 0.16982892690513218, "grad_norm": 15.176685333251953, "learning_rate": 4.038888888888889e-06, "logits/chosen": -69.79249572753906, "logits/rejected": 42.34770965576172, "logps/chosen": -692.4677124023438, "logps/rejected": -2291.95654296875, "loss": 0.1343, "rewards/accuracies": 0.875, "rewards/chosen": -8.579049110412598, "rewards/margins": 20.669668197631836, "rewards/rejected": -29.24871826171875, "step": 273 }, { "epoch": 0.17045101088646966, "grad_norm": 0.023893853649497032, "learning_rate": 4.033333333333333e-06, "logits/chosen": -117.4695816040039, "logits/rejected": 56.047119140625, "logps/chosen": -310.63433837890625, "logps/rejected": -594.7291259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.253411293029785, "rewards/margins": 17.663066864013672, "rewards/rejected": -23.91647720336914, "step": 274 }, { "epoch": 0.17107309486780714, "grad_norm": 0.06323827058076859, "learning_rate": 4.027777777777779e-06, "logits/chosen": -79.18694305419922, "logits/rejected": 62.436309814453125, "logps/chosen": -1076.4027099609375, "logps/rejected": -1574.6282958984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -19.60197639465332, "rewards/margins": 22.598434448242188, "rewards/rejected": -42.20041275024414, "step": 275 }, { "epoch": 0.17169517884914465, "grad_norm": 53.213069915771484, "learning_rate": 4.022222222222222e-06, "logits/chosen": -227.540771484375, "logits/rejected": -0.8813142776489258, "logps/chosen": -523.339599609375, "logps/rejected": -876.95458984375, "loss": 0.7018, "rewards/accuracies": 0.875, "rewards/chosen": -10.004651069641113, "rewards/margins": 15.0563325881958, "rewards/rejected": -25.060983657836914, "step": 276 }, { "epoch": 0.17231726283048213, "grad_norm": 324.11370849609375, "learning_rate": 4.0166666666666675e-06, "logits/chosen": -35.3266487121582, "logits/rejected": 48.96232604980469, "logps/chosen": -1527.662353515625, "logps/rejected": -1728.04736328125, "loss": 3.4055, "rewards/accuracies": 0.875, "rewards/chosen": -13.272540092468262, "rewards/margins": 8.585360527038574, "rewards/rejected": -21.857900619506836, "step": 277 }, { "epoch": 0.1729393468118196, "grad_norm": 0.0019032398704439402, "learning_rate": 4.011111111111111e-06, "logits/chosen": -104.66905975341797, "logits/rejected": 56.54798126220703, "logps/chosen": -396.6845703125, "logps/rejected": -716.3125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.6417036056518555, "rewards/margins": 21.381805419921875, "rewards/rejected": -28.023509979248047, "step": 278 }, { "epoch": 0.17356143079315708, "grad_norm": 0.010957333259284496, "learning_rate": 4.005555555555556e-06, "logits/chosen": -126.09434509277344, "logits/rejected": 29.715524673461914, "logps/chosen": -258.21197509765625, "logps/rejected": -525.008056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.031264305114746, "rewards/margins": 19.413511276245117, "rewards/rejected": -24.444774627685547, "step": 279 }, { "epoch": 0.17418351477449456, "grad_norm": 88.01595306396484, "learning_rate": 4.000000000000001e-06, "logits/chosen": -55.90053176879883, "logits/rejected": 42.09088134765625, "logps/chosen": -472.49334716796875, "logps/rejected": -646.2977294921875, "loss": 1.3913, "rewards/accuracies": 0.75, "rewards/chosen": -10.895673751831055, "rewards/margins": 11.499914169311523, "rewards/rejected": -22.395587921142578, "step": 280 }, { "epoch": 0.17480559875583204, "grad_norm": 65.1297836303711, "learning_rate": 3.994444444444445e-06, "logits/chosen": -91.52102661132812, "logits/rejected": 8.934975624084473, "logps/chosen": -593.5267333984375, "logps/rejected": -749.871826171875, "loss": 0.3123, "rewards/accuracies": 0.875, "rewards/chosen": -7.475317001342773, "rewards/margins": 13.415494918823242, "rewards/rejected": -20.890811920166016, "step": 281 }, { "epoch": 0.17542768273716952, "grad_norm": 271.2584228515625, "learning_rate": 3.9888888888888895e-06, "logits/chosen": 33.98577880859375, "logits/rejected": -0.7103757858276367, "logps/chosen": -1322.2896728515625, "logps/rejected": -1360.5802001953125, "loss": 3.7985, "rewards/accuracies": 0.625, "rewards/chosen": -13.193499565124512, "rewards/margins": 5.2101898193359375, "rewards/rejected": -18.403688430786133, "step": 282 }, { "epoch": 0.176049766718507, "grad_norm": 19.676036834716797, "learning_rate": 3.983333333333334e-06, "logits/chosen": -234.993896484375, "logits/rejected": -30.02489471435547, "logps/chosen": -233.2265625, "logps/rejected": -1447.2353515625, "loss": 0.1559, "rewards/accuracies": 0.875, "rewards/chosen": -5.833844184875488, "rewards/margins": 20.37204360961914, "rewards/rejected": -26.205886840820312, "step": 283 }, { "epoch": 0.17667185069984448, "grad_norm": 25.152101516723633, "learning_rate": 3.977777777777778e-06, "logits/chosen": -128.47718811035156, "logits/rejected": -67.97398376464844, "logps/chosen": -773.2230224609375, "logps/rejected": -1110.775634765625, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": -11.543960571289062, "rewards/margins": 13.805700302124023, "rewards/rejected": -25.349660873413086, "step": 284 }, { "epoch": 0.17729393468118196, "grad_norm": 0.00012249402061570436, "learning_rate": 3.972222222222223e-06, "logits/chosen": -237.1243896484375, "logits/rejected": -8.595453262329102, "logps/chosen": -458.8919677734375, "logps/rejected": -874.627685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.105069160461426, "rewards/margins": 17.698455810546875, "rewards/rejected": -20.803525924682617, "step": 285 }, { "epoch": 0.17791601866251944, "grad_norm": 0.13776512444019318, "learning_rate": 3.966666666666667e-06, "logits/chosen": -106.46934509277344, "logits/rejected": 17.300634384155273, "logps/chosen": -318.7574157714844, "logps/rejected": -475.3732604980469, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.737678527832031, "rewards/margins": 11.72336196899414, "rewards/rejected": -18.461040496826172, "step": 286 }, { "epoch": 0.17853810264385692, "grad_norm": 168.7507781982422, "learning_rate": 3.9611111111111115e-06, "logits/chosen": -142.20005798339844, "logits/rejected": -49.58868408203125, "logps/chosen": -625.783447265625, "logps/rejected": -957.2504272460938, "loss": 1.2224, "rewards/accuracies": 0.75, "rewards/chosen": -10.871160507202148, "rewards/margins": 11.00088882446289, "rewards/rejected": -21.872051239013672, "step": 287 }, { "epoch": 0.1791601866251944, "grad_norm": 0.8291933536529541, "learning_rate": 3.955555555555556e-06, "logits/chosen": -94.43851470947266, "logits/rejected": -85.9405746459961, "logps/chosen": -593.3635864257812, "logps/rejected": -680.4984130859375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -6.297945976257324, "rewards/margins": 16.587263107299805, "rewards/rejected": -22.885210037231445, "step": 288 }, { "epoch": 0.17978227060653187, "grad_norm": 4.625677585601807, "learning_rate": 3.95e-06, "logits/chosen": -34.3211669921875, "logits/rejected": 61.005558013916016, "logps/chosen": -413.08929443359375, "logps/rejected": -618.64990234375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -6.595094680786133, "rewards/margins": 13.828773498535156, "rewards/rejected": -20.42386817932129, "step": 289 }, { "epoch": 0.18040435458786935, "grad_norm": 0.08995424956083298, "learning_rate": 3.944444444444445e-06, "logits/chosen": -184.75894165039062, "logits/rejected": 45.330711364746094, "logps/chosen": -173.41065979003906, "logps/rejected": -641.794677734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.184260129928589, "rewards/margins": 16.57125473022461, "rewards/rejected": -19.75551414489746, "step": 290 }, { "epoch": 0.18102643856920683, "grad_norm": 0.01685081049799919, "learning_rate": 3.938888888888889e-06, "logits/chosen": -83.94288635253906, "logits/rejected": 59.46269226074219, "logps/chosen": -263.14862060546875, "logps/rejected": -563.9887084960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.997714042663574, "rewards/margins": 15.791666984558105, "rewards/rejected": -21.789379119873047, "step": 291 }, { "epoch": 0.1816485225505443, "grad_norm": 27.972543716430664, "learning_rate": 3.9333333333333335e-06, "logits/chosen": -151.85569763183594, "logits/rejected": 6.557146072387695, "logps/chosen": -428.752685546875, "logps/rejected": -1057.887451171875, "loss": 0.0965, "rewards/accuracies": 0.875, "rewards/chosen": -5.731788635253906, "rewards/margins": 14.543946266174316, "rewards/rejected": -20.275733947753906, "step": 292 }, { "epoch": 0.1822706065318818, "grad_norm": 0.43663766980171204, "learning_rate": 3.927777777777778e-06, "logits/chosen": -121.76500701904297, "logits/rejected": 78.33795928955078, "logps/chosen": -400.1545104980469, "logps/rejected": -704.7317504882812, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.754512786865234, "rewards/margins": 15.872625350952148, "rewards/rejected": -24.627140045166016, "step": 293 }, { "epoch": 0.1828926905132193, "grad_norm": 8.65080437506549e-05, "learning_rate": 3.922222222222223e-06, "logits/chosen": -117.71896362304688, "logits/rejected": 92.28666687011719, "logps/chosen": -388.2379150390625, "logps/rejected": -803.2340698242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.700864791870117, "rewards/margins": 22.124835968017578, "rewards/rejected": -26.825698852539062, "step": 294 }, { "epoch": 0.18351477449455678, "grad_norm": 329.11553955078125, "learning_rate": 3.916666666666667e-06, "logits/chosen": -41.35985565185547, "logits/rejected": 31.113405227661133, "logps/chosen": -1101.9654541015625, "logps/rejected": -1520.463623046875, "loss": 1.8435, "rewards/accuracies": 0.875, "rewards/chosen": -10.790802955627441, "rewards/margins": 6.932387351989746, "rewards/rejected": -17.723190307617188, "step": 295 }, { "epoch": 0.18413685847589426, "grad_norm": 0.0033107344061136246, "learning_rate": 3.911111111111112e-06, "logits/chosen": -165.0397186279297, "logits/rejected": 28.18156623840332, "logps/chosen": -658.0228271484375, "logps/rejected": -1422.99267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.381956100463867, "rewards/margins": 26.15964126586914, "rewards/rejected": -36.541595458984375, "step": 296 }, { "epoch": 0.18475894245723173, "grad_norm": 47.07502746582031, "learning_rate": 3.9055555555555555e-06, "logits/chosen": -70.08489227294922, "logits/rejected": -45.325096130371094, "logps/chosen": -655.1519775390625, "logps/rejected": -684.25244140625, "loss": 0.3491, "rewards/accuracies": 0.875, "rewards/chosen": -7.9051055908203125, "rewards/margins": 11.411504745483398, "rewards/rejected": -19.31661033630371, "step": 297 }, { "epoch": 0.1853810264385692, "grad_norm": 324.0539245605469, "learning_rate": 3.900000000000001e-06, "logits/chosen": -85.72071075439453, "logits/rejected": -71.43274688720703, "logps/chosen": -1659.177490234375, "logps/rejected": -2215.758056640625, "loss": 1.3888, "rewards/accuracies": 0.75, "rewards/chosen": -19.033245086669922, "rewards/margins": 34.705684661865234, "rewards/rejected": -53.73892593383789, "step": 298 }, { "epoch": 0.1860031104199067, "grad_norm": 39.58768081665039, "learning_rate": 3.894444444444444e-06, "logits/chosen": -66.65591430664062, "logits/rejected": -14.52166748046875, "logps/chosen": -595.5680541992188, "logps/rejected": -738.6573486328125, "loss": 0.0917, "rewards/accuracies": 0.875, "rewards/chosen": -7.478846549987793, "rewards/margins": 13.127381324768066, "rewards/rejected": -20.606229782104492, "step": 299 }, { "epoch": 0.18662519440124417, "grad_norm": 1.447102665901184, "learning_rate": 3.88888888888889e-06, "logits/chosen": -173.89190673828125, "logits/rejected": -71.16383361816406, "logps/chosen": -333.695068359375, "logps/rejected": -568.7110595703125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.381143569946289, "rewards/margins": 18.95089340209961, "rewards/rejected": -22.33203887939453, "step": 300 }, { "epoch": 0.18724727838258165, "grad_norm": 2.3716578483581543, "learning_rate": 3.883333333333333e-06, "logits/chosen": -153.14694213867188, "logits/rejected": -4.815773963928223, "logps/chosen": -353.2789611816406, "logps/rejected": -889.0303955078125, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -4.088547706604004, "rewards/margins": 19.21744728088379, "rewards/rejected": -23.30599594116211, "step": 301 }, { "epoch": 0.18786936236391913, "grad_norm": 527.762451171875, "learning_rate": 3.877777777777778e-06, "logits/chosen": -53.56449508666992, "logits/rejected": -21.997596740722656, "logps/chosen": -2358.97998046875, "logps/rejected": -2450.525634765625, "loss": 7.9059, "rewards/accuracies": 0.75, "rewards/chosen": -30.382492065429688, "rewards/margins": 5.680783271789551, "rewards/rejected": -36.06327819824219, "step": 302 }, { "epoch": 0.1884914463452566, "grad_norm": 15.114870071411133, "learning_rate": 3.872222222222223e-06, "logits/chosen": -98.48558044433594, "logits/rejected": -142.0872344970703, "logps/chosen": -562.879638671875, "logps/rejected": -713.4676513671875, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": -9.72712516784668, "rewards/margins": 15.245447158813477, "rewards/rejected": -24.972572326660156, "step": 303 }, { "epoch": 0.1891135303265941, "grad_norm": 0.1717885583639145, "learning_rate": 3.866666666666667e-06, "logits/chosen": -117.6749267578125, "logits/rejected": 91.6963882446289, "logps/chosen": -311.0071716308594, "logps/rejected": -615.1345825195312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.776980876922607, "rewards/margins": 14.40366268157959, "rewards/rejected": -20.18064308166504, "step": 304 }, { "epoch": 0.18973561430793157, "grad_norm": 0.00012132651318097487, "learning_rate": 3.861111111111112e-06, "logits/chosen": -82.01860046386719, "logits/rejected": 63.629150390625, "logps/chosen": -205.47999572753906, "logps/rejected": -594.0843505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.530620574951172, "rewards/margins": 22.311023712158203, "rewards/rejected": -24.841644287109375, "step": 305 }, { "epoch": 0.19035769828926905, "grad_norm": 0.0422993041574955, "learning_rate": 3.855555555555556e-06, "logits/chosen": -68.24407958984375, "logits/rejected": -39.94296646118164, "logps/chosen": -574.03515625, "logps/rejected": -639.1591796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.4280829429626465, "rewards/margins": 15.216024398803711, "rewards/rejected": -22.644107818603516, "step": 306 }, { "epoch": 0.19097978227060652, "grad_norm": 1.3037992715835571, "learning_rate": 3.85e-06, "logits/chosen": -87.46559143066406, "logits/rejected": 10.783121109008789, "logps/chosen": -450.4046325683594, "logps/rejected": -762.1058959960938, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.844273567199707, "rewards/margins": 12.002908706665039, "rewards/rejected": -20.84718132019043, "step": 307 }, { "epoch": 0.191601866251944, "grad_norm": 0.0012752667535096407, "learning_rate": 3.844444444444445e-06, "logits/chosen": -183.85960388183594, "logits/rejected": -12.364642143249512, "logps/chosen": -424.20654296875, "logps/rejected": -774.3251342773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.210565090179443, "rewards/margins": 22.788055419921875, "rewards/rejected": -28.998619079589844, "step": 308 }, { "epoch": 0.19222395023328148, "grad_norm": 1.2326602935791016, "learning_rate": 3.838888888888889e-06, "logits/chosen": -120.3619384765625, "logits/rejected": -70.99644470214844, "logps/chosen": -427.50616455078125, "logps/rejected": -573.31396484375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.5419921875, "rewards/margins": 13.205772399902344, "rewards/rejected": -21.747764587402344, "step": 309 }, { "epoch": 0.19284603421461896, "grad_norm": 48.27250289916992, "learning_rate": 3.833333333333334e-06, "logits/chosen": -89.84599304199219, "logits/rejected": 37.71154022216797, "logps/chosen": -435.7120056152344, "logps/rejected": -706.96826171875, "loss": 0.7796, "rewards/accuracies": 0.875, "rewards/chosen": -7.682112693786621, "rewards/margins": 18.409378051757812, "rewards/rejected": -26.09149169921875, "step": 310 }, { "epoch": 0.19346811819595647, "grad_norm": 4.97343426104635e-05, "learning_rate": 3.827777777777778e-06, "logits/chosen": -237.95999145507812, "logits/rejected": -32.64695739746094, "logps/chosen": -343.23577880859375, "logps/rejected": -730.1314086914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.523386478424072, "rewards/margins": 19.760616302490234, "rewards/rejected": -25.28400230407715, "step": 311 }, { "epoch": 0.19409020217729395, "grad_norm": 0.6693364381790161, "learning_rate": 3.8222222222222224e-06, "logits/chosen": -53.665897369384766, "logits/rejected": 40.538909912109375, "logps/chosen": -825.9369506835938, "logps/rejected": -820.968505859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -9.593791961669922, "rewards/margins": 15.198070526123047, "rewards/rejected": -24.79186248779297, "step": 312 }, { "epoch": 0.19471228615863143, "grad_norm": 651.1571655273438, "learning_rate": 3.816666666666667e-06, "logits/chosen": -123.14221954345703, "logits/rejected": 4.1334381103515625, "logps/chosen": -982.306884765625, "logps/rejected": -1195.7767333984375, "loss": 1.3577, "rewards/accuracies": 0.875, "rewards/chosen": -13.423538208007812, "rewards/margins": 7.8361358642578125, "rewards/rejected": -21.259674072265625, "step": 313 }, { "epoch": 0.1953343701399689, "grad_norm": 66.61783599853516, "learning_rate": 3.8111111111111117e-06, "logits/chosen": -64.84664154052734, "logits/rejected": -38.234928131103516, "logps/chosen": -1347.921630859375, "logps/rejected": -1199.28466796875, "loss": 0.7448, "rewards/accuracies": 0.875, "rewards/chosen": -11.432280540466309, "rewards/margins": 15.497456550598145, "rewards/rejected": -26.929737091064453, "step": 314 }, { "epoch": 0.19595645412130638, "grad_norm": 1483.7852783203125, "learning_rate": 3.8055555555555556e-06, "logits/chosen": -100.2542953491211, "logits/rejected": -3.6627116203308105, "logps/chosen": -1932.4744873046875, "logps/rejected": -2337.487060546875, "loss": 2.1735, "rewards/accuracies": 0.75, "rewards/chosen": -1.4238407611846924, "rewards/margins": 10.310941696166992, "rewards/rejected": -11.734783172607422, "step": 315 }, { "epoch": 0.19657853810264386, "grad_norm": 175.72740173339844, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -129.99354553222656, "logits/rejected": -45.9954719543457, "logps/chosen": -360.54742431640625, "logps/rejected": -715.593994140625, "loss": 1.3528, "rewards/accuracies": 0.75, "rewards/chosen": -10.429464340209961, "rewards/margins": 12.705400466918945, "rewards/rejected": -23.134864807128906, "step": 316 }, { "epoch": 0.19720062208398134, "grad_norm": 1.9701203107833862, "learning_rate": 3.7944444444444444e-06, "logits/chosen": -110.27639770507812, "logits/rejected": -33.80345153808594, "logps/chosen": -433.2222595214844, "logps/rejected": -693.2185668945312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.709738731384277, "rewards/margins": 19.486099243164062, "rewards/rejected": -27.19583511352539, "step": 317 }, { "epoch": 0.19782270606531882, "grad_norm": 537.9247436523438, "learning_rate": 3.7888888888888893e-06, "logits/chosen": -98.09420013427734, "logits/rejected": 19.074195861816406, "logps/chosen": -971.3485107421875, "logps/rejected": -1684.6815185546875, "loss": 0.4593, "rewards/accuracies": 0.875, "rewards/chosen": -0.7409607172012329, "rewards/margins": 15.404199600219727, "rewards/rejected": -16.145160675048828, "step": 318 }, { "epoch": 0.1984447900466563, "grad_norm": 89.17838287353516, "learning_rate": 3.7833333333333337e-06, "logits/chosen": -91.54933166503906, "logits/rejected": -45.25230026245117, "logps/chosen": -397.206298828125, "logps/rejected": -530.8782958984375, "loss": 0.7855, "rewards/accuracies": 0.875, "rewards/chosen": -8.98363208770752, "rewards/margins": 10.694034576416016, "rewards/rejected": -19.67766761779785, "step": 319 }, { "epoch": 0.19906687402799378, "grad_norm": 0.03172789514064789, "learning_rate": 3.777777777777778e-06, "logits/chosen": -106.44336700439453, "logits/rejected": 75.95691680908203, "logps/chosen": -406.4058837890625, "logps/rejected": -760.5960083007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.572707176208496, "rewards/margins": 18.770610809326172, "rewards/rejected": -28.34331512451172, "step": 320 }, { "epoch": 0.19968895800933126, "grad_norm": 0.5009683966636658, "learning_rate": 3.7722222222222225e-06, "logits/chosen": -93.40985870361328, "logits/rejected": 21.014463424682617, "logps/chosen": -396.630126953125, "logps/rejected": -711.1671142578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.7811689376831055, "rewards/margins": 18.405826568603516, "rewards/rejected": -26.186992645263672, "step": 321 }, { "epoch": 0.20031104199066874, "grad_norm": 0.05691106617450714, "learning_rate": 3.766666666666667e-06, "logits/chosen": -109.3495101928711, "logits/rejected": 91.24756622314453, "logps/chosen": -311.5810241699219, "logps/rejected": -644.2427978515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.7836527824401855, "rewards/margins": 19.25943946838379, "rewards/rejected": -27.043092727661133, "step": 322 }, { "epoch": 0.20093312597200622, "grad_norm": 0.012120095081627369, "learning_rate": 3.7611111111111113e-06, "logits/chosen": -125.31501007080078, "logits/rejected": -44.88500213623047, "logps/chosen": -448.06329345703125, "logps/rejected": -618.538330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.541372776031494, "rewards/margins": 12.933399200439453, "rewards/rejected": -19.474773406982422, "step": 323 }, { "epoch": 0.2015552099533437, "grad_norm": 65.7038803100586, "learning_rate": 3.7555555555555557e-06, "logits/chosen": -86.11681365966797, "logits/rejected": 19.869386672973633, "logps/chosen": -527.541748046875, "logps/rejected": -1591.1641845703125, "loss": 0.9344, "rewards/accuracies": 0.875, "rewards/chosen": -5.735625743865967, "rewards/margins": 23.825611114501953, "rewards/rejected": -29.561237335205078, "step": 324 }, { "epoch": 0.20217729393468117, "grad_norm": 0.06554151326417923, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -117.25420379638672, "logits/rejected": -48.35134506225586, "logps/chosen": -443.79705810546875, "logps/rejected": -921.9839477539062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.772656440734863, "rewards/margins": 22.910354614257812, "rewards/rejected": -29.68301010131836, "step": 325 }, { "epoch": 0.20279937791601865, "grad_norm": 106.63958740234375, "learning_rate": 3.744444444444445e-06, "logits/chosen": -41.87525177001953, "logits/rejected": 104.0947036743164, "logps/chosen": -714.8843994140625, "logps/rejected": -1145.5439453125, "loss": 0.2199, "rewards/accuracies": 0.875, "rewards/chosen": -12.238298416137695, "rewards/margins": 15.485051155090332, "rewards/rejected": -27.723350524902344, "step": 326 }, { "epoch": 0.20342146189735613, "grad_norm": 959.6294555664062, "learning_rate": 3.7388888888888893e-06, "logits/chosen": -68.39481353759766, "logits/rejected": -37.59004211425781, "logps/chosen": -1493.4400634765625, "logps/rejected": -1652.668212890625, "loss": 3.8991, "rewards/accuracies": 0.875, "rewards/chosen": -13.809399604797363, "rewards/margins": 8.94527816772461, "rewards/rejected": -22.75467872619629, "step": 327 }, { "epoch": 0.2040435458786936, "grad_norm": 13.548354148864746, "learning_rate": 3.7333333333333337e-06, "logits/chosen": -125.30322265625, "logits/rejected": 24.608505249023438, "logps/chosen": -263.6572265625, "logps/rejected": -606.1792602539062, "loss": 0.3708, "rewards/accuracies": 0.875, "rewards/chosen": -8.128238677978516, "rewards/margins": 19.79342269897461, "rewards/rejected": -27.921661376953125, "step": 328 }, { "epoch": 0.20466562986003112, "grad_norm": 0.5974065661430359, "learning_rate": 3.727777777777778e-06, "logits/chosen": -63.26960754394531, "logits/rejected": 43.33330535888672, "logps/chosen": -1156.432373046875, "logps/rejected": -1894.190673828125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -16.975799560546875, "rewards/margins": 31.458057403564453, "rewards/rejected": -48.43385696411133, "step": 329 }, { "epoch": 0.2052877138413686, "grad_norm": 11.976784706115723, "learning_rate": 3.7222222222222225e-06, "logits/chosen": -99.28781127929688, "logits/rejected": -30.93073272705078, "logps/chosen": -343.15020751953125, "logps/rejected": -523.055419921875, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -2.499878168106079, "rewards/margins": 12.459024429321289, "rewards/rejected": -14.958902359008789, "step": 330 }, { "epoch": 0.20590979782270608, "grad_norm": 41.797157287597656, "learning_rate": 3.716666666666667e-06, "logits/chosen": -70.28546142578125, "logits/rejected": -32.62881851196289, "logps/chosen": -556.646728515625, "logps/rejected": -660.83251953125, "loss": 0.1763, "rewards/accuracies": 0.875, "rewards/chosen": -6.983889102935791, "rewards/margins": 12.959762573242188, "rewards/rejected": -19.943653106689453, "step": 331 }, { "epoch": 0.20653188180404355, "grad_norm": 0.011063056997954845, "learning_rate": 3.7111111111111113e-06, "logits/chosen": -102.5046615600586, "logits/rejected": -7.533534049987793, "logps/chosen": -369.3616027832031, "logps/rejected": -599.0872802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.101081848144531, "rewards/margins": 14.522573471069336, "rewards/rejected": -20.6236572265625, "step": 332 }, { "epoch": 0.20715396578538103, "grad_norm": 56.443336486816406, "learning_rate": 3.705555555555556e-06, "logits/chosen": -40.077388763427734, "logits/rejected": 42.22065734863281, "logps/chosen": -1249.8460693359375, "logps/rejected": -1451.278564453125, "loss": 0.3921, "rewards/accuracies": 0.875, "rewards/chosen": -11.238434791564941, "rewards/margins": 8.488759994506836, "rewards/rejected": -19.72719383239746, "step": 333 }, { "epoch": 0.2077760497667185, "grad_norm": 5.5146098136901855, "learning_rate": 3.7e-06, "logits/chosen": -50.196533203125, "logits/rejected": 68.25907897949219, "logps/chosen": -966.582763671875, "logps/rejected": -1457.1748046875, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -7.5091166496276855, "rewards/margins": 18.27229118347168, "rewards/rejected": -25.781408309936523, "step": 334 }, { "epoch": 0.208398133748056, "grad_norm": 0.0020942864939570427, "learning_rate": 3.694444444444445e-06, "logits/chosen": -129.73797607421875, "logits/rejected": -45.378849029541016, "logps/chosen": -504.2147521972656, "logps/rejected": -625.545166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.989770889282227, "rewards/margins": 16.5666446685791, "rewards/rejected": -21.556415557861328, "step": 335 }, { "epoch": 0.20902021772939347, "grad_norm": 248.68968200683594, "learning_rate": 3.688888888888889e-06, "logits/chosen": -20.8752498626709, "logits/rejected": 75.82577514648438, "logps/chosen": -838.8102416992188, "logps/rejected": -785.3487548828125, "loss": 1.4379, "rewards/accuracies": 0.75, "rewards/chosen": -6.090600967407227, "rewards/margins": 9.259601593017578, "rewards/rejected": -15.350202560424805, "step": 336 }, { "epoch": 0.20964230171073095, "grad_norm": 3.501882314682007, "learning_rate": 3.6833333333333338e-06, "logits/chosen": -135.36602783203125, "logits/rejected": 8.955324172973633, "logps/chosen": -255.08009338378906, "logps/rejected": -502.6075744628906, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -4.770010948181152, "rewards/margins": 13.61465835571289, "rewards/rejected": -18.38467025756836, "step": 337 }, { "epoch": 0.21026438569206843, "grad_norm": 64.75476837158203, "learning_rate": 3.6777777777777778e-06, "logits/chosen": -62.86491012573242, "logits/rejected": 6.104921340942383, "logps/chosen": -544.9927978515625, "logps/rejected": -692.874267578125, "loss": 0.8696, "rewards/accuracies": 0.875, "rewards/chosen": -7.586861610412598, "rewards/margins": 12.443012237548828, "rewards/rejected": -20.029874801635742, "step": 338 }, { "epoch": 0.2108864696734059, "grad_norm": 0.016747971996665, "learning_rate": 3.6722222222222226e-06, "logits/chosen": -54.45949172973633, "logits/rejected": -2.937422275543213, "logps/chosen": -1008.626953125, "logps/rejected": -1518.3985595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.92531967163086, "rewards/margins": 21.686124801635742, "rewards/rejected": -30.61144256591797, "step": 339 }, { "epoch": 0.2115085536547434, "grad_norm": 64.20643615722656, "learning_rate": 3.6666666666666666e-06, "logits/chosen": -75.48532104492188, "logits/rejected": 71.22933959960938, "logps/chosen": -426.7383728027344, "logps/rejected": -619.314697265625, "loss": 0.6968, "rewards/accuracies": 0.875, "rewards/chosen": -6.053163528442383, "rewards/margins": 14.111700057983398, "rewards/rejected": -20.16486358642578, "step": 340 }, { "epoch": 0.21213063763608087, "grad_norm": 2.007647752761841, "learning_rate": 3.6611111111111114e-06, "logits/chosen": -121.6822509765625, "logits/rejected": 40.239051818847656, "logps/chosen": -1208.0361328125, "logps/rejected": -1688.90234375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -10.360464096069336, "rewards/margins": 21.194856643676758, "rewards/rejected": -31.555320739746094, "step": 341 }, { "epoch": 0.21275272161741834, "grad_norm": 301.562744140625, "learning_rate": 3.6555555555555562e-06, "logits/chosen": -106.43183898925781, "logits/rejected": 34.55303955078125, "logps/chosen": -397.6550598144531, "logps/rejected": -796.5233154296875, "loss": 0.6098, "rewards/accuracies": 0.875, "rewards/chosen": -3.5642027854919434, "rewards/margins": 15.746432304382324, "rewards/rejected": -19.31063461303711, "step": 342 }, { "epoch": 0.21337480559875582, "grad_norm": 1.8176939487457275, "learning_rate": 3.65e-06, "logits/chosen": -127.08445739746094, "logits/rejected": 31.43716812133789, "logps/chosen": -1293.3975830078125, "logps/rejected": -1177.4854736328125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -7.979880332946777, "rewards/margins": 13.4581880569458, "rewards/rejected": -21.438068389892578, "step": 343 }, { "epoch": 0.2139968895800933, "grad_norm": 0.009203294292092323, "learning_rate": 3.644444444444445e-06, "logits/chosen": -108.68656921386719, "logits/rejected": 8.826461791992188, "logps/chosen": -391.76220703125, "logps/rejected": -680.0717163085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.383183479309082, "rewards/margins": 16.02044105529785, "rewards/rejected": -23.40362548828125, "step": 344 }, { "epoch": 0.21461897356143078, "grad_norm": 319.1959228515625, "learning_rate": 3.638888888888889e-06, "logits/chosen": -91.69474792480469, "logits/rejected": -4.902622222900391, "logps/chosen": -710.3324584960938, "logps/rejected": -2387.7646484375, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": -6.263444423675537, "rewards/margins": 19.98883819580078, "rewards/rejected": -26.252281188964844, "step": 345 }, { "epoch": 0.21524105754276826, "grad_norm": 458.37286376953125, "learning_rate": 3.633333333333334e-06, "logits/chosen": -94.76748657226562, "logits/rejected": -40.294281005859375, "logps/chosen": -1457.803955078125, "logps/rejected": -1741.6097412109375, "loss": 0.4057, "rewards/accuracies": 0.875, "rewards/chosen": -14.21316146850586, "rewards/margins": 10.13693904876709, "rewards/rejected": -24.350101470947266, "step": 346 }, { "epoch": 0.21586314152410577, "grad_norm": 10.011838912963867, "learning_rate": 3.627777777777778e-06, "logits/chosen": -57.94906997680664, "logits/rejected": 80.63256072998047, "logps/chosen": -522.3330688476562, "logps/rejected": -804.0316162109375, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -7.080537796020508, "rewards/margins": 18.05306053161621, "rewards/rejected": -25.13359832763672, "step": 347 }, { "epoch": 0.21648522550544325, "grad_norm": 24.263965606689453, "learning_rate": 3.6222222222222226e-06, "logits/chosen": -128.87252807617188, "logits/rejected": -98.2238540649414, "logps/chosen": -464.12066650390625, "logps/rejected": -599.8432006835938, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -5.51241397857666, "rewards/margins": 13.049528121948242, "rewards/rejected": -18.56194305419922, "step": 348 }, { "epoch": 0.21710730948678073, "grad_norm": 54.54481506347656, "learning_rate": 3.616666666666667e-06, "logits/chosen": -81.96731567382812, "logits/rejected": 29.761962890625, "logps/chosen": -484.6531066894531, "logps/rejected": -660.6582641601562, "loss": 0.7286, "rewards/accuracies": 0.875, "rewards/chosen": -7.416828155517578, "rewards/margins": 14.80782699584961, "rewards/rejected": -22.224655151367188, "step": 349 }, { "epoch": 0.2177293934681182, "grad_norm": 0.015609249472618103, "learning_rate": 3.6111111111111115e-06, "logits/chosen": -172.38836669921875, "logits/rejected": -0.9722156524658203, "logps/chosen": -281.3834228515625, "logps/rejected": -1592.1644287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.5489654541015625, "rewards/margins": 25.17580795288086, "rewards/rejected": -29.724773406982422, "step": 350 }, { "epoch": 0.21835147744945568, "grad_norm": 10.519749641418457, "learning_rate": 3.605555555555556e-06, "logits/chosen": -130.91201782226562, "logits/rejected": -59.284690856933594, "logps/chosen": -1550.6080322265625, "logps/rejected": -1695.50537109375, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -15.390326499938965, "rewards/margins": 14.143209457397461, "rewards/rejected": -29.533536911010742, "step": 351 }, { "epoch": 0.21897356143079316, "grad_norm": 0.0023348673712462187, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -177.0453338623047, "logits/rejected": -39.225040435791016, "logps/chosen": -302.92254638671875, "logps/rejected": -1235.0694580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.616997241973877, "rewards/margins": 23.09372329711914, "rewards/rejected": -26.71072006225586, "step": 352 }, { "epoch": 0.21959564541213064, "grad_norm": 22.11195945739746, "learning_rate": 3.5944444444444447e-06, "logits/chosen": -107.12177276611328, "logits/rejected": 5.742656707763672, "logps/chosen": -442.1231384277344, "logps/rejected": -568.9662475585938, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": -3.946739435195923, "rewards/margins": 14.284902572631836, "rewards/rejected": -18.23164176940918, "step": 353 }, { "epoch": 0.22021772939346812, "grad_norm": 0.07021205127239227, "learning_rate": 3.588888888888889e-06, "logits/chosen": -133.2052001953125, "logits/rejected": 43.24049758911133, "logps/chosen": -295.9682312011719, "logps/rejected": -619.7750854492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.071015357971191, "rewards/margins": 15.475378036499023, "rewards/rejected": -21.5463924407959, "step": 354 }, { "epoch": 0.2208398133748056, "grad_norm": 72.47554779052734, "learning_rate": 3.5833333333333335e-06, "logits/chosen": -71.62528228759766, "logits/rejected": -42.441410064697266, "logps/chosen": -446.50762939453125, "logps/rejected": -592.2301635742188, "loss": 0.1436, "rewards/accuracies": 0.875, "rewards/chosen": -5.317691802978516, "rewards/margins": 12.433881759643555, "rewards/rejected": -17.75157356262207, "step": 355 }, { "epoch": 0.22146189735614308, "grad_norm": 5.872790813446045, "learning_rate": 3.577777777777778e-06, "logits/chosen": -79.82473754882812, "logits/rejected": 42.5994758605957, "logps/chosen": -1404.6407470703125, "logps/rejected": -2003.2579345703125, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -25.941162109375, "rewards/margins": 20.8463077545166, "rewards/rejected": -46.78746795654297, "step": 356 }, { "epoch": 0.22208398133748056, "grad_norm": 5.235248565673828, "learning_rate": 3.5722222222222223e-06, "logits/chosen": 29.089263916015625, "logits/rejected": 89.17533874511719, "logps/chosen": -1139.515380859375, "logps/rejected": -1577.792724609375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -20.418697357177734, "rewards/margins": 19.221214294433594, "rewards/rejected": -39.63991165161133, "step": 357 }, { "epoch": 0.22270606531881804, "grad_norm": 3.5328643321990967, "learning_rate": 3.566666666666667e-06, "logits/chosen": -91.34324645996094, "logits/rejected": 50.99628448486328, "logps/chosen": -313.564697265625, "logps/rejected": -509.1180725097656, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -5.041337013244629, "rewards/margins": 10.620466232299805, "rewards/rejected": -15.661802291870117, "step": 358 }, { "epoch": 0.22332814930015552, "grad_norm": 8.105415344238281, "learning_rate": 3.561111111111111e-06, "logits/chosen": -59.44166946411133, "logits/rejected": -10.000699043273926, "logps/chosen": -652.322509765625, "logps/rejected": -794.9805908203125, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -2.618825912475586, "rewards/margins": 14.101570129394531, "rewards/rejected": -16.720394134521484, "step": 359 }, { "epoch": 0.223950233281493, "grad_norm": 376.0079650878906, "learning_rate": 3.555555555555556e-06, "logits/chosen": -148.70285034179688, "logits/rejected": 4.519952774047852, "logps/chosen": -1295.69921875, "logps/rejected": -1466.269775390625, "loss": 0.6231, "rewards/accuracies": 0.875, "rewards/chosen": -17.421775817871094, "rewards/margins": 14.910486221313477, "rewards/rejected": -32.33226013183594, "step": 360 }, { "epoch": 0.22457231726283047, "grad_norm": 586.3753051757812, "learning_rate": 3.5500000000000003e-06, "logits/chosen": -66.73367309570312, "logits/rejected": -110.09033203125, "logps/chosen": -1397.334228515625, "logps/rejected": -1379.1051025390625, "loss": 3.1413, "rewards/accuracies": 0.625, "rewards/chosen": -28.715700149536133, "rewards/margins": 3.9195353984832764, "rewards/rejected": -32.63523483276367, "step": 361 }, { "epoch": 0.22519440124416795, "grad_norm": 0.41345924139022827, "learning_rate": 3.5444444444444447e-06, "logits/chosen": -118.64250946044922, "logits/rejected": 51.89346694946289, "logps/chosen": -586.6959838867188, "logps/rejected": -2039.8699951171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.224053382873535, "rewards/margins": 30.730127334594727, "rewards/rejected": -37.95418167114258, "step": 362 }, { "epoch": 0.22581648522550543, "grad_norm": 2.762491464614868, "learning_rate": 3.538888888888889e-06, "logits/chosen": -121.12859344482422, "logits/rejected": -13.714950561523438, "logps/chosen": -442.7908935546875, "logps/rejected": -1402.1416015625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -5.699723243713379, "rewards/margins": 30.039241790771484, "rewards/rejected": -35.73896408081055, "step": 363 }, { "epoch": 0.2264385692068429, "grad_norm": 0.01917291060090065, "learning_rate": 3.5333333333333335e-06, "logits/chosen": -69.51181030273438, "logits/rejected": 79.45323181152344, "logps/chosen": -543.193603515625, "logps/rejected": -806.804443359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.1773176193237305, "rewards/margins": 17.44906234741211, "rewards/rejected": -23.626379013061523, "step": 364 }, { "epoch": 0.22706065318818042, "grad_norm": 0.0005554378731176257, "learning_rate": 3.5277777777777784e-06, "logits/chosen": -104.48978424072266, "logits/rejected": 10.762042999267578, "logps/chosen": -710.517822265625, "logps/rejected": -1120.050537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.575695753097534, "rewards/margins": 15.565239906311035, "rewards/rejected": -19.140933990478516, "step": 365 }, { "epoch": 0.2276827371695179, "grad_norm": 0.7405343651771545, "learning_rate": 3.5222222222222223e-06, "logits/chosen": -130.28977966308594, "logits/rejected": 100.90352630615234, "logps/chosen": -363.6015930175781, "logps/rejected": -1290.927978515625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -6.894262313842773, "rewards/margins": 16.395580291748047, "rewards/rejected": -23.28984260559082, "step": 366 }, { "epoch": 0.22830482115085537, "grad_norm": 371.4366455078125, "learning_rate": 3.516666666666667e-06, "logits/chosen": -63.7216796875, "logits/rejected": 58.61530303955078, "logps/chosen": -573.6763305664062, "logps/rejected": -991.2572021484375, "loss": 1.836, "rewards/accuracies": 0.875, "rewards/chosen": -10.576478958129883, "rewards/margins": 16.621641159057617, "rewards/rejected": -27.1981201171875, "step": 367 }, { "epoch": 0.22892690513219285, "grad_norm": 0.03157585859298706, "learning_rate": 3.511111111111111e-06, "logits/chosen": -206.547607421875, "logits/rejected": -15.480863571166992, "logps/chosen": -788.4251708984375, "logps/rejected": -1649.10107421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.173426628112793, "rewards/margins": 22.535289764404297, "rewards/rejected": -31.708717346191406, "step": 368 }, { "epoch": 0.22954898911353033, "grad_norm": 0.18826603889465332, "learning_rate": 3.505555555555556e-06, "logits/chosen": -152.0640869140625, "logits/rejected": 13.557126998901367, "logps/chosen": -354.9673156738281, "logps/rejected": -720.952880859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.372082710266113, "rewards/margins": 14.377341270446777, "rewards/rejected": -19.74942398071289, "step": 369 }, { "epoch": 0.2301710730948678, "grad_norm": 2.1918697357177734, "learning_rate": 3.5e-06, "logits/chosen": -156.42156982421875, "logits/rejected": 2.8257694244384766, "logps/chosen": -442.8943176269531, "logps/rejected": -987.8575439453125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -6.757343769073486, "rewards/margins": 15.5772705078125, "rewards/rejected": -22.334613800048828, "step": 370 }, { "epoch": 0.2307931570762053, "grad_norm": 0.3515419661998749, "learning_rate": 3.4944444444444448e-06, "logits/chosen": -117.63778686523438, "logits/rejected": 25.478975296020508, "logps/chosen": -597.004150390625, "logps/rejected": -900.9102783203125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.306331634521484, "rewards/margins": 12.588811874389648, "rewards/rejected": -20.895143508911133, "step": 371 }, { "epoch": 0.23141524105754277, "grad_norm": 0.03313883766531944, "learning_rate": 3.4888888888888896e-06, "logits/chosen": -130.5347900390625, "logits/rejected": 59.264678955078125, "logps/chosen": -269.6769714355469, "logps/rejected": -676.0841064453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.977070331573486, "rewards/margins": 17.738605499267578, "rewards/rejected": -22.715675354003906, "step": 372 }, { "epoch": 0.23203732503888025, "grad_norm": 20.810562133789062, "learning_rate": 3.4833333333333336e-06, "logits/chosen": -86.77752685546875, "logits/rejected": 10.947017669677734, "logps/chosen": -287.11968994140625, "logps/rejected": -451.7987365722656, "loss": 0.3878, "rewards/accuracies": 0.875, "rewards/chosen": -4.1386189460754395, "rewards/margins": 10.602840423583984, "rewards/rejected": -14.741458892822266, "step": 373 }, { "epoch": 0.23265940902021773, "grad_norm": 52.725616455078125, "learning_rate": 3.4777777777777784e-06, "logits/chosen": -103.44730377197266, "logits/rejected": -47.148643493652344, "logps/chosen": -591.4392700195312, "logps/rejected": -1431.695068359375, "loss": 0.4473, "rewards/accuracies": 0.875, "rewards/chosen": -7.04750919342041, "rewards/margins": 22.493053436279297, "rewards/rejected": -29.54056167602539, "step": 374 }, { "epoch": 0.2332814930015552, "grad_norm": 674.2523193359375, "learning_rate": 3.4722222222222224e-06, "logits/chosen": -123.1291275024414, "logits/rejected": 15.596362113952637, "logps/chosen": -1357.3099365234375, "logps/rejected": -1761.390625, "loss": 0.9952, "rewards/accuracies": 0.75, "rewards/chosen": -12.88896369934082, "rewards/margins": 15.476181983947754, "rewards/rejected": -28.365148544311523, "step": 375 }, { "epoch": 0.23390357698289269, "grad_norm": 0.0008855348569341004, "learning_rate": 3.4666666666666672e-06, "logits/chosen": -117.54137420654297, "logits/rejected": 66.66167449951172, "logps/chosen": -462.3901672363281, "logps/rejected": -737.3338623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.631288528442383, "rewards/margins": 15.326826095581055, "rewards/rejected": -17.958114624023438, "step": 376 }, { "epoch": 0.23452566096423016, "grad_norm": 6.291079521179199, "learning_rate": 3.461111111111111e-06, "logits/chosen": -191.65391540527344, "logits/rejected": -11.821935653686523, "logps/chosen": -380.5734558105469, "logps/rejected": -850.367431640625, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -1.6781725883483887, "rewards/margins": 14.6233549118042, "rewards/rejected": -16.301528930664062, "step": 377 }, { "epoch": 0.23514774494556764, "grad_norm": 8.107344627380371, "learning_rate": 3.455555555555556e-06, "logits/chosen": -141.66831970214844, "logits/rejected": 2.9234256744384766, "logps/chosen": -394.3192138671875, "logps/rejected": -692.56298828125, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -7.958208084106445, "rewards/margins": 16.48084831237793, "rewards/rejected": -24.439056396484375, "step": 378 }, { "epoch": 0.23576982892690512, "grad_norm": 0.6237285137176514, "learning_rate": 3.45e-06, "logits/chosen": -58.561676025390625, "logits/rejected": -5.5674333572387695, "logps/chosen": -275.06304931640625, "logps/rejected": -497.35552978515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.2961554527282715, "rewards/margins": 14.047036170959473, "rewards/rejected": -18.343191146850586, "step": 379 }, { "epoch": 0.2363919129082426, "grad_norm": 40.88282012939453, "learning_rate": 3.444444444444445e-06, "logits/chosen": -138.43690490722656, "logits/rejected": 13.539619445800781, "logps/chosen": -435.38677978515625, "logps/rejected": -676.7967529296875, "loss": 0.1705, "rewards/accuracies": 0.875, "rewards/chosen": -5.915517807006836, "rewards/margins": 14.400333404541016, "rewards/rejected": -20.31584930419922, "step": 380 }, { "epoch": 0.23701399688958008, "grad_norm": 8.650310516357422, "learning_rate": 3.4388888888888892e-06, "logits/chosen": -106.99457550048828, "logits/rejected": 86.53215026855469, "logps/chosen": -316.3610534667969, "logps/rejected": -644.145751953125, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": -5.160066604614258, "rewards/margins": 14.472969055175781, "rewards/rejected": -19.633033752441406, "step": 381 }, { "epoch": 0.2376360808709176, "grad_norm": 630.4070434570312, "learning_rate": 3.4333333333333336e-06, "logits/chosen": -78.77342224121094, "logits/rejected": -82.28997039794922, "logps/chosen": -620.2591552734375, "logps/rejected": -851.9679565429688, "loss": 1.0937, "rewards/accuracies": 0.75, "rewards/chosen": -3.2615342140197754, "rewards/margins": 11.174216270446777, "rewards/rejected": -14.435749053955078, "step": 382 }, { "epoch": 0.23825816485225507, "grad_norm": 1.6711571216583252, "learning_rate": 3.427777777777778e-06, "logits/chosen": -11.867300033569336, "logits/rejected": -48.793121337890625, "logps/chosen": -527.724365234375, "logps/rejected": -467.13067626953125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.7372941970825195, "rewards/margins": 11.043073654174805, "rewards/rejected": -13.780366897583008, "step": 383 }, { "epoch": 0.23888024883359255, "grad_norm": 0.008090958930552006, "learning_rate": 3.4222222222222224e-06, "logits/chosen": -166.04830932617188, "logits/rejected": -5.239498138427734, "logps/chosen": -170.27587890625, "logps/rejected": -1325.967529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.809947967529297, "rewards/margins": 14.274275779724121, "rewards/rejected": -17.084224700927734, "step": 384 }, { "epoch": 0.23950233281493002, "grad_norm": 18.369712829589844, "learning_rate": 3.416666666666667e-06, "logits/chosen": 33.24064254760742, "logits/rejected": 43.05028533935547, "logps/chosen": -408.5423889160156, "logps/rejected": -532.668701171875, "loss": 0.2212, "rewards/accuracies": 0.875, "rewards/chosen": -2.5401060581207275, "rewards/margins": 11.265649795532227, "rewards/rejected": -13.805755615234375, "step": 385 }, { "epoch": 0.2401244167962675, "grad_norm": 0.0006316156359389424, "learning_rate": 3.4111111111111113e-06, "logits/chosen": -139.8675079345703, "logits/rejected": 6.416229248046875, "logps/chosen": -327.0350341796875, "logps/rejected": -1023.48388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4335947036743164, "rewards/margins": 21.100631713867188, "rewards/rejected": -23.53422737121582, "step": 386 }, { "epoch": 0.24074650077760498, "grad_norm": 12.359541893005371, "learning_rate": 3.4055555555555557e-06, "logits/chosen": -1.087533950805664, "logits/rejected": 7.781442642211914, "logps/chosen": -649.0916137695312, "logps/rejected": -686.0594482421875, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -6.420577049255371, "rewards/margins": 9.270051956176758, "rewards/rejected": -15.690629959106445, "step": 387 }, { "epoch": 0.24136858475894246, "grad_norm": 4.517975807189941, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -74.16693115234375, "logits/rejected": 32.27442169189453, "logps/chosen": -303.7044982910156, "logps/rejected": -530.9987182617188, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -4.17034912109375, "rewards/margins": 13.824332237243652, "rewards/rejected": -17.99468231201172, "step": 388 }, { "epoch": 0.24199066874027994, "grad_norm": 3.021022319793701, "learning_rate": 3.3944444444444445e-06, "logits/chosen": -149.9408416748047, "logits/rejected": -9.669374465942383, "logps/chosen": -355.4606628417969, "logps/rejected": -673.7430419921875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.885824203491211, "rewards/margins": 15.087596893310547, "rewards/rejected": -20.97342300415039, "step": 389 }, { "epoch": 0.24261275272161742, "grad_norm": 19.910234451293945, "learning_rate": 3.3888888888888893e-06, "logits/chosen": -6.911846160888672, "logits/rejected": 93.02581787109375, "logps/chosen": -839.9605712890625, "logps/rejected": -1322.192138671875, "loss": 0.0965, "rewards/accuracies": 0.875, "rewards/chosen": -6.46550178527832, "rewards/margins": 18.611003875732422, "rewards/rejected": -25.076505661010742, "step": 390 }, { "epoch": 0.2432348367029549, "grad_norm": 596.4502563476562, "learning_rate": 3.3833333333333333e-06, "logits/chosen": 15.306695938110352, "logits/rejected": 79.26055908203125, "logps/chosen": -799.0841674804688, "logps/rejected": -688.6866455078125, "loss": 2.6223, "rewards/accuracies": 0.625, "rewards/chosen": -8.828536987304688, "rewards/margins": 3.7239246368408203, "rewards/rejected": -12.552461624145508, "step": 391 }, { "epoch": 0.24385692068429238, "grad_norm": 0.18046356737613678, "learning_rate": 3.377777777777778e-06, "logits/chosen": -192.1083984375, "logits/rejected": 98.80660247802734, "logps/chosen": -319.278076171875, "logps/rejected": -763.6867065429688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.886842727661133, "rewards/margins": 17.893203735351562, "rewards/rejected": -21.780046463012695, "step": 392 }, { "epoch": 0.24447900466562986, "grad_norm": 48.78953170776367, "learning_rate": 3.372222222222222e-06, "logits/chosen": -77.42414855957031, "logits/rejected": -2.5174922943115234, "logps/chosen": -520.592529296875, "logps/rejected": -634.89453125, "loss": 0.3498, "rewards/accuracies": 0.875, "rewards/chosen": -7.18349552154541, "rewards/margins": 10.799402236938477, "rewards/rejected": -17.982898712158203, "step": 393 }, { "epoch": 0.24510108864696734, "grad_norm": 1681.7200927734375, "learning_rate": 3.366666666666667e-06, "logits/chosen": -61.378662109375, "logits/rejected": 41.8256721496582, "logps/chosen": -1396.827880859375, "logps/rejected": -692.0110473632812, "loss": 7.2095, "rewards/accuracies": 0.875, "rewards/chosen": -12.222773551940918, "rewards/margins": 9.70986557006836, "rewards/rejected": -21.932640075683594, "step": 394 }, { "epoch": 0.24572317262830481, "grad_norm": 180.07151794433594, "learning_rate": 3.3611111111111117e-06, "logits/chosen": -147.60113525390625, "logits/rejected": -22.299514770507812, "logps/chosen": -2443.734375, "logps/rejected": -2397.772705078125, "loss": 0.467, "rewards/accuracies": 0.75, "rewards/chosen": -9.64985179901123, "rewards/margins": 15.558685302734375, "rewards/rejected": -25.208538055419922, "step": 395 }, { "epoch": 0.2463452566096423, "grad_norm": 12.27956485748291, "learning_rate": 3.3555555555555557e-06, "logits/chosen": -155.65386962890625, "logits/rejected": 93.7176513671875, "logps/chosen": -336.73785400390625, "logps/rejected": -731.3508911132812, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": -6.66259241104126, "rewards/margins": 13.152559280395508, "rewards/rejected": -19.81515121459961, "step": 396 }, { "epoch": 0.24696734059097977, "grad_norm": 1.576117753982544, "learning_rate": 3.3500000000000005e-06, "logits/chosen": -141.27207946777344, "logits/rejected": -12.174142837524414, "logps/chosen": -1001.6227416992188, "logps/rejected": -1731.196533203125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -10.73685359954834, "rewards/margins": 20.61839485168457, "rewards/rejected": -31.355249404907227, "step": 397 }, { "epoch": 0.24758942457231725, "grad_norm": 0.13068464398384094, "learning_rate": 3.3444444444444445e-06, "logits/chosen": -94.00955200195312, "logits/rejected": 46.675323486328125, "logps/chosen": -355.37127685546875, "logps/rejected": -621.1605834960938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.467815399169922, "rewards/margins": 13.027853012084961, "rewards/rejected": -18.495668411254883, "step": 398 }, { "epoch": 0.24821150855365473, "grad_norm": 0.44366735219955444, "learning_rate": 3.3388888888888893e-06, "logits/chosen": -119.177978515625, "logits/rejected": 111.02654266357422, "logps/chosen": -881.8138427734375, "logps/rejected": -1646.3941650390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.025562286376953, "rewards/margins": 18.158206939697266, "rewards/rejected": -26.183767318725586, "step": 399 }, { "epoch": 0.24883359253499224, "grad_norm": 14.764765739440918, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -252.46035766601562, "logits/rejected": 2.7044849395751953, "logps/chosen": -585.8213500976562, "logps/rejected": -1538.1910400390625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.082944869995117, "rewards/margins": 14.39876937866211, "rewards/rejected": -18.481714248657227, "step": 400 }, { "epoch": 0.24945567651632972, "grad_norm": 729.4806518554688, "learning_rate": 3.327777777777778e-06, "logits/chosen": -135.88565063476562, "logits/rejected": 4.79701042175293, "logps/chosen": -399.1455078125, "logps/rejected": -884.3629760742188, "loss": 0.9007, "rewards/accuracies": 0.875, "rewards/chosen": -5.26923942565918, "rewards/margins": 9.685879707336426, "rewards/rejected": -14.955120086669922, "step": 401 }, { "epoch": 0.25007776049766717, "grad_norm": 0.0030219340696930885, "learning_rate": 3.322222222222222e-06, "logits/chosen": -123.68274688720703, "logits/rejected": 42.362483978271484, "logps/chosen": -390.8702087402344, "logps/rejected": -686.2620849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.082259654998779, "rewards/margins": 18.833240509033203, "rewards/rejected": -24.915498733520508, "step": 402 }, { "epoch": 0.2506998444790047, "grad_norm": 5.747935771942139, "learning_rate": 3.316666666666667e-06, "logits/chosen": -94.12886810302734, "logits/rejected": -46.02672576904297, "logps/chosen": -309.6138000488281, "logps/rejected": -499.94207763671875, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -5.190237998962402, "rewards/margins": 10.148759841918945, "rewards/rejected": -15.338996887207031, "step": 403 }, { "epoch": 0.2513219284603421, "grad_norm": 6.6589674949646, "learning_rate": 3.3111111111111118e-06, "logits/chosen": -97.32427215576172, "logits/rejected": -77.25851440429688, "logps/chosen": -410.43914794921875, "logps/rejected": -502.42645263671875, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -4.221623420715332, "rewards/margins": 11.55864143371582, "rewards/rejected": -15.780264854431152, "step": 404 }, { "epoch": 0.25194401244167963, "grad_norm": 1058.126708984375, "learning_rate": 3.3055555555555558e-06, "logits/chosen": -41.790687561035156, "logits/rejected": 64.3950424194336, "logps/chosen": -1015.30224609375, "logps/rejected": -1965.0015869140625, "loss": 0.4031, "rewards/accuracies": 0.875, "rewards/chosen": -5.637389183044434, "rewards/margins": 15.507315635681152, "rewards/rejected": -21.144704818725586, "step": 405 }, { "epoch": 0.2525660964230171, "grad_norm": 0.0528901182115078, "learning_rate": 3.3000000000000006e-06, "logits/chosen": -164.34054565429688, "logits/rejected": 38.72697448730469, "logps/chosen": -532.4224243164062, "logps/rejected": -883.7044677734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.277135848999023, "rewards/margins": 19.573875427246094, "rewards/rejected": -25.851011276245117, "step": 406 }, { "epoch": 0.2531881804043546, "grad_norm": 39.068077087402344, "learning_rate": 3.2944444444444446e-06, "logits/chosen": -18.097436904907227, "logits/rejected": -46.41596221923828, "logps/chosen": -628.3483276367188, "logps/rejected": -625.6399536132812, "loss": 0.6065, "rewards/accuracies": 0.875, "rewards/chosen": -2.0565736293792725, "rewards/margins": 9.428094863891602, "rewards/rejected": -11.484668731689453, "step": 407 }, { "epoch": 0.25381026438569204, "grad_norm": 0.0038228770717978477, "learning_rate": 3.2888888888888894e-06, "logits/chosen": -66.23749542236328, "logits/rejected": 60.735870361328125, "logps/chosen": -383.85614013671875, "logps/rejected": -705.1497802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.588505744934082, "rewards/margins": 17.27199363708496, "rewards/rejected": -23.86050033569336, "step": 408 }, { "epoch": 0.25443234836702955, "grad_norm": 0.29659342765808105, "learning_rate": 3.2833333333333334e-06, "logits/chosen": -220.43112182617188, "logits/rejected": -42.31532287597656, "logps/chosen": -317.3853454589844, "logps/rejected": -692.1878662109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.534301519393921, "rewards/margins": 18.10776138305664, "rewards/rejected": -20.64206314086914, "step": 409 }, { "epoch": 0.25505443234836706, "grad_norm": 0.017020490020513535, "learning_rate": 3.277777777777778e-06, "logits/chosen": -61.83311462402344, "logits/rejected": 49.902198791503906, "logps/chosen": -818.7548828125, "logps/rejected": -1310.494873046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.213472366333008, "rewards/margins": 21.71554183959961, "rewards/rejected": -33.92901611328125, "step": 410 }, { "epoch": 0.2556765163297045, "grad_norm": 37.13934326171875, "learning_rate": 3.2722222222222226e-06, "logits/chosen": -129.06886291503906, "logits/rejected": -34.2319221496582, "logps/chosen": -1623.17236328125, "logps/rejected": -2471.27880859375, "loss": 0.2658, "rewards/accuracies": 0.875, "rewards/chosen": -17.291173934936523, "rewards/margins": 21.077394485473633, "rewards/rejected": -38.368568420410156, "step": 411 }, { "epoch": 0.256298600311042, "grad_norm": 3.3281667232513428, "learning_rate": 3.266666666666667e-06, "logits/chosen": 89.73773193359375, "logits/rejected": -32.43182373046875, "logps/chosen": -548.9151000976562, "logps/rejected": -478.91656494140625, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -2.9080827236175537, "rewards/margins": 10.780369758605957, "rewards/rejected": -13.688451766967773, "step": 412 }, { "epoch": 0.25692068429237946, "grad_norm": 1.0963503122329712, "learning_rate": 3.2611111111111114e-06, "logits/chosen": -179.1199188232422, "logits/rejected": -119.29914855957031, "logps/chosen": -462.077392578125, "logps/rejected": -702.2341918945312, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -5.2516984939575195, "rewards/margins": 18.187808990478516, "rewards/rejected": -23.43950653076172, "step": 413 }, { "epoch": 0.25754276827371697, "grad_norm": 12.871938705444336, "learning_rate": 3.255555555555556e-06, "logits/chosen": -94.34614562988281, "logits/rejected": -14.224048614501953, "logps/chosen": -433.3106384277344, "logps/rejected": -645.54150390625, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": -4.472655296325684, "rewards/margins": 17.234699249267578, "rewards/rejected": -21.707353591918945, "step": 414 }, { "epoch": 0.2581648522550544, "grad_norm": 60.39976119995117, "learning_rate": 3.2500000000000002e-06, "logits/chosen": -94.54686737060547, "logits/rejected": -85.27310943603516, "logps/chosen": -1053.0517578125, "logps/rejected": -1780.8712158203125, "loss": 0.8056, "rewards/accuracies": 0.875, "rewards/chosen": -10.537264823913574, "rewards/margins": 24.534107208251953, "rewards/rejected": -35.07136917114258, "step": 415 }, { "epoch": 0.25878693623639193, "grad_norm": 8.70070798555389e-05, "learning_rate": 3.2444444444444446e-06, "logits/chosen": -106.50188446044922, "logits/rejected": -12.571943283081055, "logps/chosen": -1058.1910400390625, "logps/rejected": -1141.62158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.56014347076416, "rewards/margins": 20.621925354003906, "rewards/rejected": -25.18206787109375, "step": 416 }, { "epoch": 0.2594090202177294, "grad_norm": 0.00048504630103707314, "learning_rate": 3.238888888888889e-06, "logits/chosen": -212.72743225097656, "logits/rejected": 47.12677001953125, "logps/chosen": -281.5140380859375, "logps/rejected": -785.1246337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.432189464569092, "rewards/margins": 21.105979919433594, "rewards/rejected": -26.53816795349121, "step": 417 }, { "epoch": 0.2600311041990669, "grad_norm": 74.28561401367188, "learning_rate": 3.2333333333333334e-06, "logits/chosen": -39.09807586669922, "logits/rejected": -37.74265670776367, "logps/chosen": -576.8735961914062, "logps/rejected": -666.0925903320312, "loss": 1.0714, "rewards/accuracies": 0.75, "rewards/chosen": -5.235861301422119, "rewards/margins": 8.5001220703125, "rewards/rejected": -13.735982894897461, "step": 418 }, { "epoch": 0.26065318818040434, "grad_norm": 0.00406319834291935, "learning_rate": 3.227777777777778e-06, "logits/chosen": -125.38990020751953, "logits/rejected": 35.771236419677734, "logps/chosen": -271.1806945800781, "logps/rejected": -604.3306274414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.502455711364746, "rewards/margins": 18.31060218811035, "rewards/rejected": -22.81305694580078, "step": 419 }, { "epoch": 0.26127527216174184, "grad_norm": 0.06921503692865372, "learning_rate": 3.2222222222222227e-06, "logits/chosen": -156.49893188476562, "logits/rejected": -0.7185282707214355, "logps/chosen": -285.5757751464844, "logps/rejected": -568.9661254882812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.168907642364502, "rewards/margins": 17.00074005126953, "rewards/rejected": -22.169649124145508, "step": 420 }, { "epoch": 0.2618973561430793, "grad_norm": 23.771432876586914, "learning_rate": 3.2166666666666666e-06, "logits/chosen": -44.47278594970703, "logits/rejected": 40.99623107910156, "logps/chosen": -695.4105834960938, "logps/rejected": -2139.2373046875, "loss": 0.1028, "rewards/accuracies": 0.875, "rewards/chosen": -9.48936939239502, "rewards/margins": 34.03549575805664, "rewards/rejected": -43.524864196777344, "step": 421 }, { "epoch": 0.2625194401244168, "grad_norm": 0.5247805714607239, "learning_rate": 3.2111111111111115e-06, "logits/chosen": -28.369760513305664, "logits/rejected": 83.45079040527344, "logps/chosen": -426.06170654296875, "logps/rejected": -602.0322875976562, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.80647087097168, "rewards/margins": 13.468998908996582, "rewards/rejected": -18.275470733642578, "step": 422 }, { "epoch": 0.26314152410575425, "grad_norm": 2.696509599685669, "learning_rate": 3.2055555555555555e-06, "logits/chosen": -156.73190307617188, "logits/rejected": 2.062930107116699, "logps/chosen": -339.9702453613281, "logps/rejected": -550.0850830078125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -6.730691909790039, "rewards/margins": 11.891387939453125, "rewards/rejected": -18.62207794189453, "step": 423 }, { "epoch": 0.26376360808709176, "grad_norm": 0.00010647011367836967, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -99.37198638916016, "logits/rejected": 77.39677429199219, "logps/chosen": -299.10369873046875, "logps/rejected": -635.5921630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.903994083404541, "rewards/margins": 19.517202377319336, "rewards/rejected": -23.42119598388672, "step": 424 }, { "epoch": 0.2643856920684292, "grad_norm": 0.021113064140081406, "learning_rate": 3.1944444444444443e-06, "logits/chosen": -139.9362030029297, "logits/rejected": -71.13326263427734, "logps/chosen": -497.7764587402344, "logps/rejected": -1247.9305419921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.445491790771484, "rewards/margins": 23.295848846435547, "rewards/rejected": -31.74134063720703, "step": 425 }, { "epoch": 0.2650077760497667, "grad_norm": 0.09791923314332962, "learning_rate": 3.188888888888889e-06, "logits/chosen": -75.31929779052734, "logits/rejected": 17.320018768310547, "logps/chosen": -360.2807312011719, "logps/rejected": -538.9600219726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.31129953265190125, "rewards/margins": 13.098464965820312, "rewards/rejected": -13.409765243530273, "step": 426 }, { "epoch": 0.2656298600311042, "grad_norm": 0.41263052821159363, "learning_rate": 3.183333333333334e-06, "logits/chosen": -130.52133178710938, "logits/rejected": -54.75746154785156, "logps/chosen": -375.21234130859375, "logps/rejected": -544.177978515625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.421488285064697, "rewards/margins": 12.739078521728516, "rewards/rejected": -18.160564422607422, "step": 427 }, { "epoch": 0.2662519440124417, "grad_norm": 27.723438262939453, "learning_rate": 3.177777777777778e-06, "logits/chosen": -19.64472007751465, "logits/rejected": 92.70637512207031, "logps/chosen": -317.8572692871094, "logps/rejected": -506.4354553222656, "loss": 0.2744, "rewards/accuracies": 0.875, "rewards/chosen": -4.416043758392334, "rewards/margins": 8.239082336425781, "rewards/rejected": -12.655126571655273, "step": 428 }, { "epoch": 0.2668740279937792, "grad_norm": 494.7976989746094, "learning_rate": 3.1722222222222227e-06, "logits/chosen": -74.61936950683594, "logits/rejected": 43.494483947753906, "logps/chosen": -926.481689453125, "logps/rejected": -962.2109375, "loss": 1.2498, "rewards/accuracies": 0.75, "rewards/chosen": -10.9013032913208, "rewards/margins": 11.203941345214844, "rewards/rejected": -22.10524559020996, "step": 429 }, { "epoch": 0.26749611197511663, "grad_norm": 0.146775022149086, "learning_rate": 3.1666666666666667e-06, "logits/chosen": -58.03156280517578, "logits/rejected": 50.948081970214844, "logps/chosen": -750.5324096679688, "logps/rejected": -1299.094482421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.916543006896973, "rewards/margins": 21.271076202392578, "rewards/rejected": -28.187618255615234, "step": 430 }, { "epoch": 0.26811819595645414, "grad_norm": 9.059052467346191, "learning_rate": 3.1611111111111115e-06, "logits/chosen": -113.13426208496094, "logits/rejected": -17.654590606689453, "logps/chosen": -289.0269470214844, "logps/rejected": -489.6974182128906, "loss": 0.1031, "rewards/accuracies": 0.875, "rewards/chosen": -4.035221099853516, "rewards/margins": 9.537609100341797, "rewards/rejected": -13.572829246520996, "step": 431 }, { "epoch": 0.2687402799377916, "grad_norm": 0.0363839752972126, "learning_rate": 3.1555555555555555e-06, "logits/chosen": -226.86129760742188, "logits/rejected": 18.085723876953125, "logps/chosen": -231.36138916015625, "logps/rejected": -651.9208984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3531336784362793, "rewards/margins": 14.629854202270508, "rewards/rejected": -17.982986450195312, "step": 432 }, { "epoch": 0.2693623639191291, "grad_norm": 0.08437441289424896, "learning_rate": 3.1500000000000003e-06, "logits/chosen": -145.5948944091797, "logits/rejected": 24.434499740600586, "logps/chosen": -573.1240844726562, "logps/rejected": -1511.19482421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.507134437561035, "rewards/margins": 27.4521484375, "rewards/rejected": -35.95928192138672, "step": 433 }, { "epoch": 0.26998444790046655, "grad_norm": 6.243736267089844, "learning_rate": 3.144444444444445e-06, "logits/chosen": -66.72409057617188, "logits/rejected": 23.432781219482422, "logps/chosen": -385.52105712890625, "logps/rejected": -589.9515380859375, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -5.707604885101318, "rewards/margins": 13.929487228393555, "rewards/rejected": -19.63709259033203, "step": 434 }, { "epoch": 0.27060653188180406, "grad_norm": 0.00268726097419858, "learning_rate": 3.138888888888889e-06, "logits/chosen": -65.43486785888672, "logits/rejected": 100.32328033447266, "logps/chosen": -283.3175964355469, "logps/rejected": -614.7366943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6462783813476562, "rewards/margins": 16.967817306518555, "rewards/rejected": -20.61409568786621, "step": 435 }, { "epoch": 0.2712286158631415, "grad_norm": 48.17670440673828, "learning_rate": 3.133333333333334e-06, "logits/chosen": -33.98659133911133, "logits/rejected": 108.13905334472656, "logps/chosen": -660.230712890625, "logps/rejected": -810.4891967773438, "loss": 0.3604, "rewards/accuracies": 0.875, "rewards/chosen": -6.400341510772705, "rewards/margins": 10.12678337097168, "rewards/rejected": -16.52712631225586, "step": 436 }, { "epoch": 0.271850699844479, "grad_norm": 29.38442611694336, "learning_rate": 3.127777777777778e-06, "logits/chosen": -87.74200439453125, "logits/rejected": -29.828880310058594, "logps/chosen": -1445.314453125, "logps/rejected": -1502.1890869140625, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -15.269041061401367, "rewards/margins": 26.24750518798828, "rewards/rejected": -41.516544342041016, "step": 437 }, { "epoch": 0.27247278382581647, "grad_norm": 0.010054297745227814, "learning_rate": 3.1222222222222228e-06, "logits/chosen": -60.69300079345703, "logits/rejected": 2.827303647994995, "logps/chosen": -512.135986328125, "logps/rejected": -725.8115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.909261703491211, "rewards/margins": 16.795150756835938, "rewards/rejected": -21.70441436767578, "step": 438 }, { "epoch": 0.273094867807154, "grad_norm": 0.2552092969417572, "learning_rate": 3.1166666666666668e-06, "logits/chosen": -110.59637451171875, "logits/rejected": 0.2997760772705078, "logps/chosen": -325.08319091796875, "logps/rejected": -1317.03759765625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.342226266860962, "rewards/margins": 31.533035278320312, "rewards/rejected": -34.87526321411133, "step": 439 }, { "epoch": 0.2737169517884914, "grad_norm": 0.06681760400533676, "learning_rate": 3.1111111111111116e-06, "logits/chosen": -76.43096923828125, "logits/rejected": 55.34849548339844, "logps/chosen": -481.04364013671875, "logps/rejected": -1548.072265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.8295164108276367, "rewards/margins": 26.851810455322266, "rewards/rejected": -30.681325912475586, "step": 440 }, { "epoch": 0.27433903576982893, "grad_norm": 0.03426912799477577, "learning_rate": 3.1055555555555556e-06, "logits/chosen": -148.08352661132812, "logits/rejected": -82.73184967041016, "logps/chosen": -632.0838623046875, "logps/rejected": -934.6665649414062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.879688739776611, "rewards/margins": 17.636289596557617, "rewards/rejected": -24.515975952148438, "step": 441 }, { "epoch": 0.2749611197511664, "grad_norm": 0.10970082134008408, "learning_rate": 3.1000000000000004e-06, "logits/chosen": -166.16172790527344, "logits/rejected": 47.80799102783203, "logps/chosen": -172.85546875, "logps/rejected": -519.4528198242188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.813028335571289, "rewards/margins": 14.210065841674805, "rewards/rejected": -17.023094177246094, "step": 442 }, { "epoch": 0.2755832037325039, "grad_norm": 0.0017556071979925036, "learning_rate": 3.094444444444445e-06, "logits/chosen": -106.08656311035156, "logits/rejected": -24.39427947998047, "logps/chosen": -556.9698486328125, "logps/rejected": -1666.7896728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.812121868133545, "rewards/margins": 22.71316909790039, "rewards/rejected": -26.525291442871094, "step": 443 }, { "epoch": 0.27620528771384134, "grad_norm": 0.060042910277843475, "learning_rate": 3.088888888888889e-06, "logits/chosen": -40.299835205078125, "logits/rejected": -26.960643768310547, "logps/chosen": -868.32080078125, "logps/rejected": -1039.2618408203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.772699356079102, "rewards/margins": 15.946779251098633, "rewards/rejected": -20.719478607177734, "step": 444 }, { "epoch": 0.27682737169517885, "grad_norm": 7.5279221534729, "learning_rate": 3.0833333333333336e-06, "logits/chosen": -73.40738677978516, "logits/rejected": 1.6918678283691406, "logps/chosen": -206.82574462890625, "logps/rejected": -474.97857666015625, "loss": 0.0857, "rewards/accuracies": 1.0, "rewards/chosen": -2.0310826301574707, "rewards/margins": 14.190022468566895, "rewards/rejected": -16.22110366821289, "step": 445 }, { "epoch": 0.27744945567651635, "grad_norm": 0.024084731936454773, "learning_rate": 3.077777777777778e-06, "logits/chosen": -234.2447509765625, "logits/rejected": -39.864105224609375, "logps/chosen": -316.56768798828125, "logps/rejected": -1702.8681640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.610107898712158, "rewards/margins": 21.403791427612305, "rewards/rejected": -25.013900756835938, "step": 446 }, { "epoch": 0.2780715396578538, "grad_norm": 679.0257568359375, "learning_rate": 3.0722222222222224e-06, "logits/chosen": -119.59489440917969, "logits/rejected": -41.67457580566406, "logps/chosen": -1387.164794921875, "logps/rejected": -2069.992431640625, "loss": 0.6163, "rewards/accuracies": 0.875, "rewards/chosen": -10.253739356994629, "rewards/margins": 31.145536422729492, "rewards/rejected": -41.39927291870117, "step": 447 }, { "epoch": 0.2786936236391913, "grad_norm": 0.5239561796188354, "learning_rate": 3.066666666666667e-06, "logits/chosen": -27.203866958618164, "logits/rejected": 19.18273162841797, "logps/chosen": -459.13616943359375, "logps/rejected": -672.71923828125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.401674747467041, "rewards/margins": 15.77939224243164, "rewards/rejected": -22.181068420410156, "step": 448 }, { "epoch": 0.27931570762052876, "grad_norm": 46.727783203125, "learning_rate": 3.0611111111111112e-06, "logits/chosen": -143.22959899902344, "logits/rejected": -2.729686737060547, "logps/chosen": -432.8746032714844, "logps/rejected": -685.0629272460938, "loss": 0.2572, "rewards/accuracies": 0.875, "rewards/chosen": -4.866744041442871, "rewards/margins": 13.104055404663086, "rewards/rejected": -17.970800399780273, "step": 449 }, { "epoch": 0.27993779160186627, "grad_norm": 0.007155199535191059, "learning_rate": 3.055555555555556e-06, "logits/chosen": -122.77845764160156, "logits/rejected": -22.09000015258789, "logps/chosen": -293.401611328125, "logps/rejected": -716.1458129882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5648956298828125, "rewards/margins": 15.705573081970215, "rewards/rejected": -19.270469665527344, "step": 450 }, { "epoch": 0.2805598755832037, "grad_norm": 0.0063332668505609035, "learning_rate": 3.05e-06, "logits/chosen": -206.1707000732422, "logits/rejected": -49.159324645996094, "logps/chosen": -591.9786987304688, "logps/rejected": -1175.1263427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.653476238250732, "rewards/margins": 20.61270523071289, "rewards/rejected": -25.26618194580078, "step": 451 }, { "epoch": 0.28118195956454123, "grad_norm": 0.3904751241207123, "learning_rate": 3.044444444444445e-06, "logits/chosen": -159.75433349609375, "logits/rejected": 69.57440948486328, "logps/chosen": -484.4688415527344, "logps/rejected": -796.8455200195312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.573509693145752, "rewards/margins": 18.00706672668457, "rewards/rejected": -23.580577850341797, "step": 452 }, { "epoch": 0.2818040435458787, "grad_norm": 0.27007633447647095, "learning_rate": 3.038888888888889e-06, "logits/chosen": -129.496826171875, "logits/rejected": 28.780433654785156, "logps/chosen": -360.26422119140625, "logps/rejected": -729.1710205078125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.466520309448242, "rewards/margins": 18.017093658447266, "rewards/rejected": -23.483612060546875, "step": 453 }, { "epoch": 0.2824261275272162, "grad_norm": 0.050732944160699844, "learning_rate": 3.0333333333333337e-06, "logits/chosen": -158.59771728515625, "logits/rejected": 86.22364807128906, "logps/chosen": -289.087646484375, "logps/rejected": -626.59130859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.656994342803955, "rewards/margins": 15.556425094604492, "rewards/rejected": -21.21341896057129, "step": 454 }, { "epoch": 0.28304821150855364, "grad_norm": 245.98182678222656, "learning_rate": 3.0277777777777776e-06, "logits/chosen": -152.21572875976562, "logits/rejected": 25.665081024169922, "logps/chosen": -359.6564025878906, "logps/rejected": -943.8568725585938, "loss": 0.1684, "rewards/accuracies": 0.875, "rewards/chosen": -4.5480241775512695, "rewards/margins": 16.46710205078125, "rewards/rejected": -21.01512908935547, "step": 455 }, { "epoch": 0.28367029548989114, "grad_norm": 49.8556022644043, "learning_rate": 3.0222222222222225e-06, "logits/chosen": -77.38163757324219, "logits/rejected": 45.402305603027344, "logps/chosen": -770.3060913085938, "logps/rejected": -1426.64111328125, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": -4.318437576293945, "rewards/margins": 10.741496086120605, "rewards/rejected": -15.059934616088867, "step": 456 }, { "epoch": 0.2842923794712286, "grad_norm": 729.0494384765625, "learning_rate": 3.0166666666666673e-06, "logits/chosen": -37.32524490356445, "logits/rejected": 63.467018127441406, "logps/chosen": -1300.6883544921875, "logps/rejected": -1650.085205078125, "loss": 3.7185, "rewards/accuracies": 0.875, "rewards/chosen": -5.085547924041748, "rewards/margins": 5.731017112731934, "rewards/rejected": -10.816564559936523, "step": 457 }, { "epoch": 0.2849144634525661, "grad_norm": 0.34040364623069763, "learning_rate": 3.0111111111111113e-06, "logits/chosen": -21.840044021606445, "logits/rejected": 10.436660766601562, "logps/chosen": -1363.01953125, "logps/rejected": -1025.88525390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -12.248331069946289, "rewards/margins": 16.65743637084961, "rewards/rejected": -28.905765533447266, "step": 458 }, { "epoch": 0.28553654743390355, "grad_norm": 21.645227432250977, "learning_rate": 3.005555555555556e-06, "logits/chosen": -163.7349090576172, "logits/rejected": 84.55571746826172, "logps/chosen": -249.3410186767578, "logps/rejected": -1120.706787109375, "loss": 0.1196, "rewards/accuracies": 0.875, "rewards/chosen": -4.506829261779785, "rewards/margins": 20.644468307495117, "rewards/rejected": -25.151296615600586, "step": 459 }, { "epoch": 0.28615863141524106, "grad_norm": 0.05377525836229324, "learning_rate": 3e-06, "logits/chosen": -120.49679565429688, "logits/rejected": -50.11427307128906, "logps/chosen": -479.7495422363281, "logps/rejected": -708.4085083007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.105657577514648, "rewards/margins": 19.34479522705078, "rewards/rejected": -23.45045280456543, "step": 460 }, { "epoch": 0.2867807153965785, "grad_norm": 0.07196685671806335, "learning_rate": 2.994444444444445e-06, "logits/chosen": -61.55799102783203, "logits/rejected": 75.55865478515625, "logps/chosen": -487.673095703125, "logps/rejected": -721.8910522460938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.959700345993042, "rewards/margins": 15.674489974975586, "rewards/rejected": -18.634191513061523, "step": 461 }, { "epoch": 0.287402799377916, "grad_norm": 642.327880859375, "learning_rate": 2.988888888888889e-06, "logits/chosen": -50.24076843261719, "logits/rejected": 59.18190002441406, "logps/chosen": -1208.486572265625, "logps/rejected": -1524.85302734375, "loss": 4.6993, "rewards/accuracies": 0.75, "rewards/chosen": -10.42319393157959, "rewards/margins": 5.120976448059082, "rewards/rejected": -15.544169425964355, "step": 462 }, { "epoch": 0.2880248833592535, "grad_norm": 36.3924446105957, "learning_rate": 2.9833333333333337e-06, "logits/chosen": -94.99934387207031, "logits/rejected": 0.6685028076171875, "logps/chosen": -440.3847351074219, "logps/rejected": -625.8167724609375, "loss": 0.7421, "rewards/accuracies": 0.875, "rewards/chosen": -7.878257751464844, "rewards/margins": 12.733911514282227, "rewards/rejected": -20.61216926574707, "step": 463 }, { "epoch": 0.288646967340591, "grad_norm": 0.00011843784159282222, "learning_rate": 2.9777777777777777e-06, "logits/chosen": -125.81266021728516, "logits/rejected": 9.343620300292969, "logps/chosen": -414.1218566894531, "logps/rejected": -754.7615356445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.665410041809082, "rewards/margins": 20.270339965820312, "rewards/rejected": -25.935749053955078, "step": 464 }, { "epoch": 0.2892690513219285, "grad_norm": 4.792317867279053, "learning_rate": 2.9722222222222225e-06, "logits/chosen": -55.39715576171875, "logits/rejected": 5.407726287841797, "logps/chosen": -439.4058837890625, "logps/rejected": -948.4417724609375, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -3.0605430603027344, "rewards/margins": 14.792844772338867, "rewards/rejected": -17.85338592529297, "step": 465 }, { "epoch": 0.28989113530326593, "grad_norm": 0.7684537768363953, "learning_rate": 2.9666666666666673e-06, "logits/chosen": -169.63790893554688, "logits/rejected": 25.33966827392578, "logps/chosen": -940.2826538085938, "logps/rejected": -1367.318115234375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.6616058349609375, "rewards/margins": 19.403545379638672, "rewards/rejected": -23.06515121459961, "step": 466 }, { "epoch": 0.29051321928460344, "grad_norm": 62.2896614074707, "learning_rate": 2.9611111111111113e-06, "logits/chosen": -80.41464233398438, "logits/rejected": 37.6744499206543, "logps/chosen": -1563.99853515625, "logps/rejected": -1814.621337890625, "loss": 0.3527, "rewards/accuracies": 0.75, "rewards/chosen": -6.504397392272949, "rewards/margins": 9.698775291442871, "rewards/rejected": -16.20317268371582, "step": 467 }, { "epoch": 0.2911353032659409, "grad_norm": 0.06806180626153946, "learning_rate": 2.955555555555556e-06, "logits/chosen": -112.95097351074219, "logits/rejected": 54.07991027832031, "logps/chosen": -842.3372802734375, "logps/rejected": -1629.6639404296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.871456146240234, "rewards/margins": 27.020767211914062, "rewards/rejected": -33.8922233581543, "step": 468 }, { "epoch": 0.2917573872472784, "grad_norm": 0.0015384845901280642, "learning_rate": 2.95e-06, "logits/chosen": -108.10063171386719, "logits/rejected": -45.151390075683594, "logps/chosen": -1476.0245361328125, "logps/rejected": -1512.776123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.981031894683838, "rewards/margins": 19.118215560913086, "rewards/rejected": -27.099245071411133, "step": 469 }, { "epoch": 0.29237947122861585, "grad_norm": 0.0026862381491810083, "learning_rate": 2.944444444444445e-06, "logits/chosen": -243.16441345214844, "logits/rejected": -22.439212799072266, "logps/chosen": -247.9034423828125, "logps/rejected": -1433.7177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7422385215759277, "rewards/margins": 25.459976196289062, "rewards/rejected": -28.202213287353516, "step": 470 }, { "epoch": 0.29300155520995336, "grad_norm": 80.00831604003906, "learning_rate": 2.938888888888889e-06, "logits/chosen": -138.35882568359375, "logits/rejected": -16.89215660095215, "logps/chosen": -573.883056640625, "logps/rejected": -1497.986083984375, "loss": 4.405, "rewards/accuracies": 0.875, "rewards/chosen": -3.3371615409851074, "rewards/margins": 11.74737548828125, "rewards/rejected": -15.0845365524292, "step": 471 }, { "epoch": 0.2936236391912908, "grad_norm": 0.02629709057509899, "learning_rate": 2.9333333333333338e-06, "logits/chosen": -95.78155517578125, "logits/rejected": 31.421701431274414, "logps/chosen": -436.87982177734375, "logps/rejected": -640.5528564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.0469183921813965, "rewards/margins": 14.275647163391113, "rewards/rejected": -19.32256507873535, "step": 472 }, { "epoch": 0.2942457231726283, "grad_norm": 0.015096843242645264, "learning_rate": 2.927777777777778e-06, "logits/chosen": -218.71376037597656, "logits/rejected": -81.7887191772461, "logps/chosen": -539.2918701171875, "logps/rejected": -1633.03369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.28540563583374, "rewards/margins": 32.45753479003906, "rewards/rejected": -39.74293899536133, "step": 473 }, { "epoch": 0.29486780715396577, "grad_norm": 0.37026557326316833, "learning_rate": 2.9222222222222226e-06, "logits/chosen": -91.54402160644531, "logits/rejected": -71.40422821044922, "logps/chosen": -565.3885498046875, "logps/rejected": -764.4093017578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.220095634460449, "rewards/margins": 18.664575576782227, "rewards/rejected": -23.88467025756836, "step": 474 }, { "epoch": 0.2954898911353033, "grad_norm": 0.5328233242034912, "learning_rate": 2.916666666666667e-06, "logits/chosen": -92.68084716796875, "logits/rejected": 18.117137908935547, "logps/chosen": -322.7648620605469, "logps/rejected": -603.4907836914062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.257126808166504, "rewards/margins": 19.536523818969727, "rewards/rejected": -22.793651580810547, "step": 475 }, { "epoch": 0.2961119751166407, "grad_norm": 2.082916259765625, "learning_rate": 2.9111111111111114e-06, "logits/chosen": -32.649600982666016, "logits/rejected": 53.8452033996582, "logps/chosen": -327.58843994140625, "logps/rejected": -515.9476318359375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -6.633157730102539, "rewards/margins": 12.035253524780273, "rewards/rejected": -18.668411254882812, "step": 476 }, { "epoch": 0.29673405909797823, "grad_norm": 0.006789871491491795, "learning_rate": 2.9055555555555558e-06, "logits/chosen": -155.75148010253906, "logits/rejected": -42.266353607177734, "logps/chosen": -621.5592651367188, "logps/rejected": -912.0943603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.735085487365723, "rewards/margins": 18.854005813598633, "rewards/rejected": -26.589092254638672, "step": 477 }, { "epoch": 0.2973561430793157, "grad_norm": 0.002204960910603404, "learning_rate": 2.9e-06, "logits/chosen": -73.46788024902344, "logits/rejected": 13.16627311706543, "logps/chosen": -1982.020751953125, "logps/rejected": -2671.50439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.904025077819824, "rewards/margins": 23.271419525146484, "rewards/rejected": -34.17544174194336, "step": 478 }, { "epoch": 0.2979782270606532, "grad_norm": 489.8451843261719, "learning_rate": 2.8944444444444446e-06, "logits/chosen": -76.97776794433594, "logits/rejected": 101.48400115966797, "logps/chosen": -831.5736694335938, "logps/rejected": -1267.337158203125, "loss": 0.2433, "rewards/accuracies": 0.875, "rewards/chosen": -8.081165313720703, "rewards/margins": 22.542129516601562, "rewards/rejected": -30.623294830322266, "step": 479 }, { "epoch": 0.2986003110419907, "grad_norm": 0.02077249065041542, "learning_rate": 2.888888888888889e-06, "logits/chosen": -109.78641510009766, "logits/rejected": 79.53196716308594, "logps/chosen": -347.164794921875, "logps/rejected": -679.7649536132812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.474149703979492, "rewards/margins": 20.157148361206055, "rewards/rejected": -25.631298065185547, "step": 480 }, { "epoch": 0.29922239502332815, "grad_norm": 45.295162200927734, "learning_rate": 2.8833333333333334e-06, "logits/chosen": -107.96310424804688, "logits/rejected": 117.78995513916016, "logps/chosen": -629.31005859375, "logps/rejected": -1444.530029296875, "loss": 0.1686, "rewards/accuracies": 0.875, "rewards/chosen": -11.005935668945312, "rewards/margins": 24.335113525390625, "rewards/rejected": -35.34104919433594, "step": 481 }, { "epoch": 0.29984447900466565, "grad_norm": 0.3226853907108307, "learning_rate": 2.8777777777777782e-06, "logits/chosen": -49.87257385253906, "logits/rejected": 51.80895233154297, "logps/chosen": -323.7877197265625, "logps/rejected": -646.0269165039062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.0389885902404785, "rewards/margins": 19.276966094970703, "rewards/rejected": -24.315956115722656, "step": 482 }, { "epoch": 0.3004665629860031, "grad_norm": 0.03761100396513939, "learning_rate": 2.872222222222222e-06, "logits/chosen": -96.25479125976562, "logits/rejected": 5.03964376449585, "logps/chosen": -752.3473510742188, "logps/rejected": -998.7327880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.474496841430664, "rewards/margins": 17.76065444946289, "rewards/rejected": -26.235153198242188, "step": 483 }, { "epoch": 0.3010886469673406, "grad_norm": 0.36069121956825256, "learning_rate": 2.866666666666667e-06, "logits/chosen": -173.4355010986328, "logits/rejected": 49.04792022705078, "logps/chosen": -401.3187255859375, "logps/rejected": -843.3233642578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.69771146774292, "rewards/margins": 17.023658752441406, "rewards/rejected": -24.721370697021484, "step": 484 }, { "epoch": 0.30171073094867806, "grad_norm": 28.672616958618164, "learning_rate": 2.861111111111111e-06, "logits/chosen": -34.224788665771484, "logits/rejected": 9.807415008544922, "logps/chosen": -601.130615234375, "logps/rejected": -724.80078125, "loss": 0.1442, "rewards/accuracies": 0.875, "rewards/chosen": -9.897284507751465, "rewards/margins": 20.957622528076172, "rewards/rejected": -30.85490608215332, "step": 485 }, { "epoch": 0.30233281493001557, "grad_norm": 0.00019635060743894428, "learning_rate": 2.855555555555556e-06, "logits/chosen": -84.38818359375, "logits/rejected": 2.8599853515625, "logps/chosen": -525.7063598632812, "logps/rejected": -906.6004638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.741774559020996, "rewards/margins": 25.04084587097168, "rewards/rejected": -30.78261947631836, "step": 486 }, { "epoch": 0.302954898911353, "grad_norm": 10.450629234313965, "learning_rate": 2.85e-06, "logits/chosen": -98.06412506103516, "logits/rejected": 22.83841323852539, "logps/chosen": -1183.72900390625, "logps/rejected": -1558.55419921875, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -15.557866096496582, "rewards/margins": 15.628456115722656, "rewards/rejected": -31.186323165893555, "step": 487 }, { "epoch": 0.30357698289269053, "grad_norm": 7.842443301342428e-05, "learning_rate": 2.8444444444444446e-06, "logits/chosen": -117.96109771728516, "logits/rejected": -90.42742156982422, "logps/chosen": -442.8400573730469, "logps/rejected": -645.2767333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.57356071472168, "rewards/margins": 21.432964324951172, "rewards/rejected": -28.006526947021484, "step": 488 }, { "epoch": 0.304199066874028, "grad_norm": 67.84249114990234, "learning_rate": 2.8388888888888895e-06, "logits/chosen": -210.37301635742188, "logits/rejected": -2.8459110260009766, "logps/chosen": -419.57373046875, "logps/rejected": -916.9591064453125, "loss": 0.1489, "rewards/accuracies": 0.875, "rewards/chosen": -12.453338623046875, "rewards/margins": 23.584064483642578, "rewards/rejected": -36.03740310668945, "step": 489 }, { "epoch": 0.3048211508553655, "grad_norm": 30.741262435913086, "learning_rate": 2.8333333333333335e-06, "logits/chosen": -100.96687316894531, "logits/rejected": -26.402626037597656, "logps/chosen": -272.1378173828125, "logps/rejected": -622.3171997070312, "loss": 0.2731, "rewards/accuracies": 0.875, "rewards/chosen": -9.547800064086914, "rewards/margins": 19.88826560974121, "rewards/rejected": -29.436065673828125, "step": 490 }, { "epoch": 0.30544323483670294, "grad_norm": 2685.345458984375, "learning_rate": 2.8277777777777783e-06, "logits/chosen": -253.6761932373047, "logits/rejected": 35.6867790222168, "logps/chosen": -368.1029357910156, "logps/rejected": -1673.7933349609375, "loss": 2.9421, "rewards/accuracies": 0.875, "rewards/chosen": -7.811916351318359, "rewards/margins": 18.280925750732422, "rewards/rejected": -26.09284210205078, "step": 491 }, { "epoch": 0.30606531881804044, "grad_norm": 0.5363892912864685, "learning_rate": 2.8222222222222223e-06, "logits/chosen": -235.43011474609375, "logits/rejected": -21.03951644897461, "logps/chosen": -354.26470947265625, "logps/rejected": -807.6500244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.928719997406006, "rewards/margins": 25.34630584716797, "rewards/rejected": -31.275026321411133, "step": 492 }, { "epoch": 0.3066874027993779, "grad_norm": 1.1895990371704102, "learning_rate": 2.816666666666667e-06, "logits/chosen": -12.674884796142578, "logits/rejected": -10.235252380371094, "logps/chosen": -513.9210205078125, "logps/rejected": -711.9178466796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -5.489371299743652, "rewards/margins": 25.730825424194336, "rewards/rejected": -31.220195770263672, "step": 493 }, { "epoch": 0.3073094867807154, "grad_norm": 0.9920693039894104, "learning_rate": 2.811111111111111e-06, "logits/chosen": -201.67410278320312, "logits/rejected": -9.008163452148438, "logps/chosen": -279.3111572265625, "logps/rejected": -760.4784545898438, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.135791778564453, "rewards/margins": 26.03512191772461, "rewards/rejected": -32.17091369628906, "step": 494 }, { "epoch": 0.30793157076205285, "grad_norm": 0.28792479634284973, "learning_rate": 2.805555555555556e-06, "logits/chosen": -64.67411804199219, "logits/rejected": 87.28395080566406, "logps/chosen": -476.91424560546875, "logps/rejected": -866.6495361328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.609411239624023, "rewards/margins": 19.427776336669922, "rewards/rejected": -26.037185668945312, "step": 495 }, { "epoch": 0.30855365474339036, "grad_norm": 1.2016055734420661e-05, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -69.18524169921875, "logits/rejected": 58.43632507324219, "logps/chosen": -714.4051513671875, "logps/rejected": -1082.318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.72089958190918, "rewards/margins": 24.028512954711914, "rewards/rejected": -32.749412536621094, "step": 496 }, { "epoch": 0.3091757387247278, "grad_norm": 0.003477121703326702, "learning_rate": 2.7944444444444447e-06, "logits/chosen": -151.20669555664062, "logits/rejected": 33.437286376953125, "logps/chosen": -435.5581970214844, "logps/rejected": -962.4176025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.203364372253418, "rewards/margins": 24.498659133911133, "rewards/rejected": -31.702022552490234, "step": 497 }, { "epoch": 0.3097978227060653, "grad_norm": 74.77229309082031, "learning_rate": 2.788888888888889e-06, "logits/chosen": -134.05197143554688, "logits/rejected": -82.54796600341797, "logps/chosen": -419.45721435546875, "logps/rejected": -600.8002319335938, "loss": 0.3426, "rewards/accuracies": 0.875, "rewards/chosen": -9.366911888122559, "rewards/margins": 16.563343048095703, "rewards/rejected": -25.930255889892578, "step": 498 }, { "epoch": 0.3104199066874028, "grad_norm": 5.165897846221924, "learning_rate": 2.7833333333333335e-06, "logits/chosen": -17.098461151123047, "logits/rejected": 7.432888031005859, "logps/chosen": -1204.3275146484375, "logps/rejected": -1410.51513671875, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -13.975566864013672, "rewards/margins": 19.458595275878906, "rewards/rejected": -33.43416213989258, "step": 499 }, { "epoch": 0.3110419906687403, "grad_norm": 94.65229797363281, "learning_rate": 2.7777777777777783e-06, "logits/chosen": -42.413612365722656, "logits/rejected": -18.559284210205078, "logps/chosen": -855.442626953125, "logps/rejected": -1092.0269775390625, "loss": 0.3863, "rewards/accuracies": 0.875, "rewards/chosen": -7.468043804168701, "rewards/margins": 20.67894744873047, "rewards/rejected": -28.146991729736328, "step": 500 }, { "epoch": 0.3116640746500778, "grad_norm": 36.913421630859375, "learning_rate": 2.7722222222222223e-06, "logits/chosen": -33.20619201660156, "logits/rejected": 77.44849395751953, "logps/chosen": -256.71368408203125, "logps/rejected": -500.01513671875, "loss": 0.1103, "rewards/accuracies": 0.875, "rewards/chosen": -5.874395370483398, "rewards/margins": 16.99877166748047, "rewards/rejected": -22.873165130615234, "step": 501 }, { "epoch": 0.31228615863141523, "grad_norm": 0.03802089765667915, "learning_rate": 2.766666666666667e-06, "logits/chosen": -113.8816146850586, "logits/rejected": 3.1324386596679688, "logps/chosen": -475.576904296875, "logps/rejected": -750.1541137695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.107429504394531, "rewards/margins": 20.550615310668945, "rewards/rejected": -29.658042907714844, "step": 502 }, { "epoch": 0.31290824261275274, "grad_norm": 4.517454624176025, "learning_rate": 2.761111111111111e-06, "logits/chosen": -150.9524383544922, "logits/rejected": 66.72630310058594, "logps/chosen": -1144.045166015625, "logps/rejected": -1886.173095703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -12.91429615020752, "rewards/margins": 27.82852554321289, "rewards/rejected": -40.742820739746094, "step": 503 }, { "epoch": 0.3135303265940902, "grad_norm": 0.1685783714056015, "learning_rate": 2.755555555555556e-06, "logits/chosen": -16.639257431030273, "logits/rejected": 67.06172180175781, "logps/chosen": -1208.613037109375, "logps/rejected": -1881.0006103515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -14.78108024597168, "rewards/margins": 24.708454132080078, "rewards/rejected": -39.48953628540039, "step": 504 }, { "epoch": 0.3141524105754277, "grad_norm": 8.875104904174805, "learning_rate": 2.7500000000000004e-06, "logits/chosen": -182.884033203125, "logits/rejected": -19.382522583007812, "logps/chosen": -349.4017333984375, "logps/rejected": -785.084716796875, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -8.484785079956055, "rewards/margins": 21.588932037353516, "rewards/rejected": -30.07371711730957, "step": 505 }, { "epoch": 0.31477449455676515, "grad_norm": 0.039438195526599884, "learning_rate": 2.7444444444444448e-06, "logits/chosen": -204.4589080810547, "logits/rejected": 30.468223571777344, "logps/chosen": -703.905029296875, "logps/rejected": -1545.792724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.54481315612793, "rewards/margins": 29.707618713378906, "rewards/rejected": -44.2524299621582, "step": 506 }, { "epoch": 0.31539657853810266, "grad_norm": 0.0001556405477458611, "learning_rate": 2.738888888888889e-06, "logits/chosen": -162.90982055664062, "logits/rejected": -24.905223846435547, "logps/chosen": -429.8111572265625, "logps/rejected": -690.36279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.169452667236328, "rewards/margins": 20.00811767578125, "rewards/rejected": -24.177574157714844, "step": 507 }, { "epoch": 0.3160186625194401, "grad_norm": 3.83480167388916, "learning_rate": 2.7333333333333336e-06, "logits/chosen": -106.45069885253906, "logits/rejected": 89.6026840209961, "logps/chosen": -390.6701354980469, "logps/rejected": -1056.58642578125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -6.510148525238037, "rewards/margins": 19.043895721435547, "rewards/rejected": -25.554046630859375, "step": 508 }, { "epoch": 0.3166407465007776, "grad_norm": 249.91635131835938, "learning_rate": 2.727777777777778e-06, "logits/chosen": -113.04769897460938, "logits/rejected": 52.910831451416016, "logps/chosen": -657.3864135742188, "logps/rejected": -1047.919921875, "loss": 1.1029, "rewards/accuracies": 0.875, "rewards/chosen": -10.305535316467285, "rewards/margins": 20.930843353271484, "rewards/rejected": -31.23638343811035, "step": 509 }, { "epoch": 0.31726283048211507, "grad_norm": 0.0001023645672830753, "learning_rate": 2.7222222222222224e-06, "logits/chosen": -111.2098617553711, "logits/rejected": -46.61152267456055, "logps/chosen": -332.561767578125, "logps/rejected": -2009.6092529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.912018775939941, "rewards/margins": 46.59414291381836, "rewards/rejected": -53.50615692138672, "step": 510 }, { "epoch": 0.3178849144634526, "grad_norm": 9.813766479492188, "learning_rate": 2.7166666666666668e-06, "logits/chosen": -214.36294555664062, "logits/rejected": 8.500236511230469, "logps/chosen": -484.8146667480469, "logps/rejected": -1885.5816650390625, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -10.142532348632812, "rewards/margins": 30.896425247192383, "rewards/rejected": -41.038963317871094, "step": 511 }, { "epoch": 0.31850699844479, "grad_norm": 0.00014505768194794655, "learning_rate": 2.7111111111111116e-06, "logits/chosen": -117.37469482421875, "logits/rejected": -30.81359100341797, "logps/chosen": -1240.773193359375, "logps/rejected": -1565.1182861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.835609436035156, "rewards/margins": 22.83041763305664, "rewards/rejected": -45.66603088378906, "step": 512 }, { "epoch": 0.31912908242612753, "grad_norm": 0.04484693333506584, "learning_rate": 2.7055555555555556e-06, "logits/chosen": -118.07000732421875, "logits/rejected": 19.228988647460938, "logps/chosen": -417.59564208984375, "logps/rejected": -1071.6759033203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.964608192443848, "rewards/margins": 26.42868423461914, "rewards/rejected": -34.39329147338867, "step": 513 }, { "epoch": 0.319751166407465, "grad_norm": 733.9310913085938, "learning_rate": 2.7000000000000004e-06, "logits/chosen": -135.20254516601562, "logits/rejected": 7.584394454956055, "logps/chosen": -1490.989013671875, "logps/rejected": -1831.0753173828125, "loss": 1.2205, "rewards/accuracies": 0.875, "rewards/chosen": -22.450679779052734, "rewards/margins": 17.37295150756836, "rewards/rejected": -39.823631286621094, "step": 514 }, { "epoch": 0.3203732503888025, "grad_norm": 0.00440920889377594, "learning_rate": 2.6944444444444444e-06, "logits/chosen": -183.91673278808594, "logits/rejected": -2.545504093170166, "logps/chosen": -736.22607421875, "logps/rejected": -1045.3095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.57273006439209, "rewards/margins": 21.090290069580078, "rewards/rejected": -30.66301918029785, "step": 515 }, { "epoch": 0.32099533437014, "grad_norm": 1.4607081766371266e-06, "learning_rate": 2.6888888888888892e-06, "logits/chosen": -56.57765197753906, "logits/rejected": -66.30777740478516, "logps/chosen": -634.3762817382812, "logps/rejected": -799.6243286132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.017427444458008, "rewards/margins": 24.705734252929688, "rewards/rejected": -32.72315979003906, "step": 516 }, { "epoch": 0.32161741835147745, "grad_norm": 0.0035732206888496876, "learning_rate": 2.683333333333333e-06, "logits/chosen": -136.55030822753906, "logits/rejected": 15.133339881896973, "logps/chosen": -510.66119384765625, "logps/rejected": -1623.222412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.230821132659912, "rewards/margins": 25.768672943115234, "rewards/rejected": -31.999494552612305, "step": 517 }, { "epoch": 0.32223950233281495, "grad_norm": 15.281998634338379, "learning_rate": 2.677777777777778e-06, "logits/chosen": -141.44114685058594, "logits/rejected": 37.759742736816406, "logps/chosen": -622.218017578125, "logps/rejected": -1389.559814453125, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -9.901582717895508, "rewards/margins": 24.146142959594727, "rewards/rejected": -34.047725677490234, "step": 518 }, { "epoch": 0.3228615863141524, "grad_norm": 20.5241756439209, "learning_rate": 2.672222222222223e-06, "logits/chosen": -47.92503356933594, "logits/rejected": 64.93870544433594, "logps/chosen": -380.2177734375, "logps/rejected": -690.2943115234375, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": -5.080290794372559, "rewards/margins": 19.285602569580078, "rewards/rejected": -24.365894317626953, "step": 519 }, { "epoch": 0.3234836702954899, "grad_norm": 726.5747680664062, "learning_rate": 2.666666666666667e-06, "logits/chosen": -27.112945556640625, "logits/rejected": 46.60769271850586, "logps/chosen": -1942.02099609375, "logps/rejected": -2010.9442138671875, "loss": 1.4267, "rewards/accuracies": 0.875, "rewards/chosen": -17.372493743896484, "rewards/margins": 21.80316734313965, "rewards/rejected": -39.1756591796875, "step": 520 }, { "epoch": 0.32410575427682736, "grad_norm": 0.023551391437649727, "learning_rate": 2.6611111111111117e-06, "logits/chosen": -146.9193878173828, "logits/rejected": -11.396366119384766, "logps/chosen": -564.2581787109375, "logps/rejected": -1578.962890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.082893371582031, "rewards/margins": 28.404075622558594, "rewards/rejected": -36.486968994140625, "step": 521 }, { "epoch": 0.32472783825816487, "grad_norm": 0.03202517330646515, "learning_rate": 2.6555555555555556e-06, "logits/chosen": -145.31756591796875, "logits/rejected": -69.08807373046875, "logps/chosen": -1121.3101806640625, "logps/rejected": -1333.1702880859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.528709411621094, "rewards/margins": 19.132692337036133, "rewards/rejected": -29.661401748657227, "step": 522 }, { "epoch": 0.3253499222395023, "grad_norm": 1.7982759475708008, "learning_rate": 2.6500000000000005e-06, "logits/chosen": -24.10053825378418, "logits/rejected": 47.73780822753906, "logps/chosen": -382.15966796875, "logps/rejected": -619.9219970703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.713485717773438, "rewards/margins": 19.28835678100586, "rewards/rejected": -28.001842498779297, "step": 523 }, { "epoch": 0.3259720062208398, "grad_norm": 1.1941759794353857e-06, "learning_rate": 2.6444444444444444e-06, "logits/chosen": -182.32423400878906, "logits/rejected": 30.640613555908203, "logps/chosen": -424.6380920410156, "logps/rejected": -802.3182373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.684786319732666, "rewards/margins": 27.44087791442871, "rewards/rejected": -31.12566566467285, "step": 524 }, { "epoch": 0.3265940902021773, "grad_norm": 0.012141176499426365, "learning_rate": 2.6388888888888893e-06, "logits/chosen": -79.68421173095703, "logits/rejected": -27.655445098876953, "logps/chosen": -470.4711608886719, "logps/rejected": -764.1946411132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8947296142578125, "rewards/margins": 24.341365814208984, "rewards/rejected": -30.23609161376953, "step": 525 }, { "epoch": 0.3272161741835148, "grad_norm": 2093.632080078125, "learning_rate": 2.6333333333333332e-06, "logits/chosen": -58.48688507080078, "logits/rejected": -12.876693725585938, "logps/chosen": -1797.82080078125, "logps/rejected": -2408.203857421875, "loss": 0.3546, "rewards/accuracies": 0.875, "rewards/chosen": -10.572139739990234, "rewards/margins": 20.393951416015625, "rewards/rejected": -30.966093063354492, "step": 526 }, { "epoch": 0.32783825816485224, "grad_norm": 53.939781188964844, "learning_rate": 2.627777777777778e-06, "logits/chosen": -98.4378662109375, "logits/rejected": -6.685610771179199, "logps/chosen": -902.432373046875, "logps/rejected": -1414.36962890625, "loss": 0.3548, "rewards/accuracies": 0.75, "rewards/chosen": -7.349522590637207, "rewards/margins": 19.975582122802734, "rewards/rejected": -27.325103759765625, "step": 527 }, { "epoch": 0.32846034214618974, "grad_norm": 0.00472272839397192, "learning_rate": 2.6222222222222225e-06, "logits/chosen": -72.72634887695312, "logits/rejected": 10.695472717285156, "logps/chosen": -528.1621704101562, "logps/rejected": -745.701416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.852806568145752, "rewards/margins": 20.925395965576172, "rewards/rejected": -27.7782039642334, "step": 528 }, { "epoch": 0.3290824261275272, "grad_norm": 0.9566658139228821, "learning_rate": 2.616666666666667e-06, "logits/chosen": -163.9215850830078, "logits/rejected": 29.034271240234375, "logps/chosen": -1085.3968505859375, "logps/rejected": -1475.486083984375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.542291641235352, "rewards/margins": 27.62804412841797, "rewards/rejected": -33.17034149169922, "step": 529 }, { "epoch": 0.3297045101088647, "grad_norm": 0.020136937499046326, "learning_rate": 2.6111111111111113e-06, "logits/chosen": 55.85187530517578, "logits/rejected": 120.70094299316406, "logps/chosen": -524.1187744140625, "logps/rejected": -707.9207153320312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.550209045410156, "rewards/margins": 19.35879135131836, "rewards/rejected": -25.90900230407715, "step": 530 }, { "epoch": 0.33032659409020215, "grad_norm": 25.630029678344727, "learning_rate": 2.6055555555555557e-06, "logits/chosen": -35.70895004272461, "logits/rejected": 5.138917922973633, "logps/chosen": -752.0984497070312, "logps/rejected": -1045.410888671875, "loss": 0.1424, "rewards/accuracies": 0.875, "rewards/chosen": -1.899871587753296, "rewards/margins": 17.37139129638672, "rewards/rejected": -19.271263122558594, "step": 531 }, { "epoch": 0.33094867807153966, "grad_norm": 49.42053985595703, "learning_rate": 2.6e-06, "logits/chosen": -72.38311767578125, "logits/rejected": 55.47483825683594, "logps/chosen": -821.4680786132812, "logps/rejected": -1198.586181640625, "loss": 0.145, "rewards/accuracies": 0.875, "rewards/chosen": -5.30170202255249, "rewards/margins": 17.499011993408203, "rewards/rejected": -22.80071258544922, "step": 532 }, { "epoch": 0.33157076205287717, "grad_norm": 0.00021453335648402572, "learning_rate": 2.5944444444444445e-06, "logits/chosen": -59.47480010986328, "logits/rejected": -59.50837707519531, "logps/chosen": -902.0071411132812, "logps/rejected": -1434.232177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2769646644592285, "rewards/margins": 23.48958969116211, "rewards/rejected": -27.766551971435547, "step": 533 }, { "epoch": 0.3321928460342146, "grad_norm": 0.0009830794297158718, "learning_rate": 2.5888888888888893e-06, "logits/chosen": -88.18829345703125, "logits/rejected": 75.58877563476562, "logps/chosen": -477.9635314941406, "logps/rejected": -836.441162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.839996337890625, "rewards/margins": 27.455047607421875, "rewards/rejected": -33.2950439453125, "step": 534 }, { "epoch": 0.3328149300155521, "grad_norm": 21.538833618164062, "learning_rate": 2.5833333333333337e-06, "logits/chosen": -38.95292663574219, "logits/rejected": 19.689794540405273, "logps/chosen": -502.48431396484375, "logps/rejected": -597.1568603515625, "loss": 0.099, "rewards/accuracies": 0.875, "rewards/chosen": -6.1365065574646, "rewards/margins": 12.255342483520508, "rewards/rejected": -18.391847610473633, "step": 535 }, { "epoch": 0.3334370139968896, "grad_norm": 44.25954055786133, "learning_rate": 2.577777777777778e-06, "logits/chosen": -124.04884338378906, "logits/rejected": -18.230335235595703, "logps/chosen": -408.546875, "logps/rejected": -628.8480224609375, "loss": 0.3246, "rewards/accuracies": 0.875, "rewards/chosen": -3.7298359870910645, "rewards/margins": 17.540559768676758, "rewards/rejected": -21.270395278930664, "step": 536 }, { "epoch": 0.3340590979782271, "grad_norm": 915.2777099609375, "learning_rate": 2.5722222222222225e-06, "logits/chosen": -59.6215934753418, "logits/rejected": 39.407562255859375, "logps/chosen": -732.1753540039062, "logps/rejected": -1202.220703125, "loss": 2.9061, "rewards/accuracies": 0.875, "rewards/chosen": 0.0549391508102417, "rewards/margins": 8.19179630279541, "rewards/rejected": -8.136857986450195, "step": 537 }, { "epoch": 0.33468118195956453, "grad_norm": 101.01426696777344, "learning_rate": 2.566666666666667e-06, "logits/chosen": -121.70182800292969, "logits/rejected": -88.66795349121094, "logps/chosen": -419.38751220703125, "logps/rejected": -946.19482421875, "loss": 0.9002, "rewards/accuracies": 0.875, "rewards/chosen": -7.548337936401367, "rewards/margins": 10.937995910644531, "rewards/rejected": -18.4863338470459, "step": 538 }, { "epoch": 0.33530326594090204, "grad_norm": 0.22497332096099854, "learning_rate": 2.5611111111111113e-06, "logits/chosen": -155.32852172851562, "logits/rejected": -64.15583801269531, "logps/chosen": -391.474853515625, "logps/rejected": -590.6363525390625, "loss": 0.0875, "rewards/accuracies": 0.875, "rewards/chosen": -5.654869079589844, "rewards/margins": 17.69102668762207, "rewards/rejected": -23.345895767211914, "step": 539 }, { "epoch": 0.3359253499222395, "grad_norm": 2171.966552734375, "learning_rate": 2.5555555555555557e-06, "logits/chosen": -139.00430297851562, "logits/rejected": -44.87596130371094, "logps/chosen": -1128.762939453125, "logps/rejected": -2281.35498046875, "loss": 0.3873, "rewards/accuracies": 0.75, "rewards/chosen": -7.1004557609558105, "rewards/margins": 19.601308822631836, "rewards/rejected": -26.701763153076172, "step": 540 }, { "epoch": 0.336547433903577, "grad_norm": 0.00040764911682344973, "learning_rate": 2.55e-06, "logits/chosen": -173.5714569091797, "logits/rejected": -71.816650390625, "logps/chosen": -347.21630859375, "logps/rejected": -677.5891723632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.594542980194092, "rewards/margins": 21.809738159179688, "rewards/rejected": -27.404277801513672, "step": 541 }, { "epoch": 0.33716951788491445, "grad_norm": 0.052160948514938354, "learning_rate": 2.5444444444444446e-06, "logits/chosen": -163.11651611328125, "logits/rejected": 5.506620407104492, "logps/chosen": -381.1016845703125, "logps/rejected": -777.0826416015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.960916996002197, "rewards/margins": 18.652755737304688, "rewards/rejected": -26.61367416381836, "step": 542 }, { "epoch": 0.33779160186625196, "grad_norm": 0.0027104229666292667, "learning_rate": 2.538888888888889e-06, "logits/chosen": -70.49809265136719, "logits/rejected": 96.97160339355469, "logps/chosen": -538.9619750976562, "logps/rejected": -865.5678100585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.697265148162842, "rewards/margins": 20.115009307861328, "rewards/rejected": -27.812274932861328, "step": 543 }, { "epoch": 0.3384136858475894, "grad_norm": 10.569849014282227, "learning_rate": 2.5333333333333338e-06, "logits/chosen": -158.86380004882812, "logits/rejected": -6.6270647048950195, "logps/chosen": -347.7818603515625, "logps/rejected": -673.0206909179688, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": -7.693136692047119, "rewards/margins": 14.407018661499023, "rewards/rejected": -22.100154876708984, "step": 544 }, { "epoch": 0.3390357698289269, "grad_norm": 38.356014251708984, "learning_rate": 2.5277777777777778e-06, "logits/chosen": -138.15548706054688, "logits/rejected": 1.8432598114013672, "logps/chosen": -557.3701171875, "logps/rejected": -1114.21142578125, "loss": 0.2129, "rewards/accuracies": 0.875, "rewards/chosen": -6.736522674560547, "rewards/margins": 9.910958290100098, "rewards/rejected": -16.647480010986328, "step": 545 }, { "epoch": 0.33965785381026437, "grad_norm": 3.2684230973245576e-05, "learning_rate": 2.5222222222222226e-06, "logits/chosen": -77.82260131835938, "logits/rejected": 51.82418441772461, "logps/chosen": -380.8198547363281, "logps/rejected": -799.3794555664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.664532661437988, "rewards/margins": 22.91758155822754, "rewards/rejected": -27.582111358642578, "step": 546 }, { "epoch": 0.34027993779160187, "grad_norm": 62.43033981323242, "learning_rate": 2.5166666666666666e-06, "logits/chosen": -155.48291015625, "logits/rejected": 63.98257064819336, "logps/chosen": -406.412353515625, "logps/rejected": -869.9842529296875, "loss": 0.2666, "rewards/accuracies": 0.875, "rewards/chosen": -6.313762187957764, "rewards/margins": 21.309492111206055, "rewards/rejected": -27.623252868652344, "step": 547 }, { "epoch": 0.3409020217729393, "grad_norm": 0.01893557608127594, "learning_rate": 2.5111111111111114e-06, "logits/chosen": -173.40585327148438, "logits/rejected": 59.75921630859375, "logps/chosen": -518.6378173828125, "logps/rejected": -1227.51611328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.921689510345459, "rewards/margins": 24.799715042114258, "rewards/rejected": -29.721405029296875, "step": 548 }, { "epoch": 0.34152410575427683, "grad_norm": 0.03524330258369446, "learning_rate": 2.5055555555555554e-06, "logits/chosen": -81.65140533447266, "logits/rejected": 53.29224395751953, "logps/chosen": -419.2182922363281, "logps/rejected": -788.4107666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.967625617980957, "rewards/margins": 27.95848846435547, "rewards/rejected": -33.926116943359375, "step": 549 }, { "epoch": 0.3421461897356143, "grad_norm": 0.0030301192309707403, "learning_rate": 2.5e-06, "logits/chosen": -162.14901733398438, "logits/rejected": 37.351524353027344, "logps/chosen": -290.3460998535156, "logps/rejected": -780.3597412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.041409492492676, "rewards/margins": 25.58060646057129, "rewards/rejected": -30.622013092041016, "step": 550 }, { "epoch": 0.3427682737169518, "grad_norm": 0.0007868031389079988, "learning_rate": 2.4944444444444446e-06, "logits/chosen": -89.37284851074219, "logits/rejected": 76.03825378417969, "logps/chosen": -408.15582275390625, "logps/rejected": -778.2498168945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.955649375915527, "rewards/margins": 24.252300262451172, "rewards/rejected": -30.207950592041016, "step": 551 }, { "epoch": 0.3433903576982893, "grad_norm": 7.3237810134887695, "learning_rate": 2.488888888888889e-06, "logits/chosen": -113.70440673828125, "logits/rejected": -4.633126258850098, "logps/chosen": -516.0413208007812, "logps/rejected": -706.09375, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -7.577106475830078, "rewards/margins": 19.43143653869629, "rewards/rejected": -27.008543014526367, "step": 552 }, { "epoch": 0.34401244167962675, "grad_norm": 27.165157318115234, "learning_rate": 2.4833333333333334e-06, "logits/chosen": -70.55792236328125, "logits/rejected": -84.22840881347656, "logps/chosen": -518.1121826171875, "logps/rejected": -664.818603515625, "loss": 0.1745, "rewards/accuracies": 0.875, "rewards/chosen": -6.33168888092041, "rewards/margins": 19.870664596557617, "rewards/rejected": -26.202354431152344, "step": 553 }, { "epoch": 0.34463452566096425, "grad_norm": 13.65674877166748, "learning_rate": 2.4777777777777782e-06, "logits/chosen": 15.586252212524414, "logits/rejected": -40.6330680847168, "logps/chosen": -575.465087890625, "logps/rejected": -600.142822265625, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -8.002599716186523, "rewards/margins": 13.05514907836914, "rewards/rejected": -21.05774688720703, "step": 554 }, { "epoch": 0.3452566096423017, "grad_norm": 0.1597648411989212, "learning_rate": 2.4722222222222226e-06, "logits/chosen": -57.72572708129883, "logits/rejected": 35.889373779296875, "logps/chosen": -1080.215087890625, "logps/rejected": -2058.431884765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.29849624633789, "rewards/margins": 43.9632453918457, "rewards/rejected": -59.26173782348633, "step": 555 }, { "epoch": 0.3458786936236392, "grad_norm": 0.0048780241049826145, "learning_rate": 2.466666666666667e-06, "logits/chosen": -50.6446418762207, "logits/rejected": 114.20291137695312, "logps/chosen": -210.42401123046875, "logps/rejected": -608.5009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4228029251098633, "rewards/margins": 24.626663208007812, "rewards/rejected": -27.049467086791992, "step": 556 }, { "epoch": 0.34650077760497666, "grad_norm": 9.669543942436576e-06, "learning_rate": 2.4611111111111115e-06, "logits/chosen": -121.74302673339844, "logits/rejected": -27.763591766357422, "logps/chosen": -682.2377319335938, "logps/rejected": -947.9501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.418315410614014, "rewards/margins": 23.888389587402344, "rewards/rejected": -31.306703567504883, "step": 557 }, { "epoch": 0.34712286158631417, "grad_norm": 15.246994018554688, "learning_rate": 2.455555555555556e-06, "logits/chosen": -132.05062866210938, "logits/rejected": -26.71329689025879, "logps/chosen": -547.3456420898438, "logps/rejected": -746.6658935546875, "loss": 0.0961, "rewards/accuracies": 0.875, "rewards/chosen": -8.82697868347168, "rewards/margins": 16.472761154174805, "rewards/rejected": -25.299739837646484, "step": 558 }, { "epoch": 0.3477449455676516, "grad_norm": 0.10348263382911682, "learning_rate": 2.4500000000000003e-06, "logits/chosen": 6.919384002685547, "logits/rejected": 16.04297637939453, "logps/chosen": -508.27850341796875, "logps/rejected": -700.031982421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.918520927429199, "rewards/margins": 19.172508239746094, "rewards/rejected": -25.091028213500977, "step": 559 }, { "epoch": 0.3483670295489891, "grad_norm": 4.3703443225240335e-05, "learning_rate": 2.4444444444444447e-06, "logits/chosen": -192.80165100097656, "logits/rejected": 33.61063766479492, "logps/chosen": -692.0377807617188, "logps/rejected": -1948.145263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.256573677062988, "rewards/margins": 38.18522644042969, "rewards/rejected": -50.44179916381836, "step": 560 }, { "epoch": 0.3489891135303266, "grad_norm": 0.004655397031456232, "learning_rate": 2.438888888888889e-06, "logits/chosen": -50.00129699707031, "logits/rejected": 110.15692901611328, "logps/chosen": -423.329345703125, "logps/rejected": -774.76806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.33040714263916, "rewards/margins": 24.559856414794922, "rewards/rejected": -28.890262603759766, "step": 561 }, { "epoch": 0.3496111975116641, "grad_norm": 392.4862365722656, "learning_rate": 2.4333333333333335e-06, "logits/chosen": -73.5555419921875, "logits/rejected": -56.832275390625, "logps/chosen": -1104.6937255859375, "logps/rejected": -1062.54541015625, "loss": 1.7689, "rewards/accuracies": 0.875, "rewards/chosen": -16.892208099365234, "rewards/margins": 18.787551879882812, "rewards/rejected": -35.67975616455078, "step": 562 }, { "epoch": 0.35023328149300154, "grad_norm": 2.4955488697742112e-05, "learning_rate": 2.427777777777778e-06, "logits/chosen": -125.34312438964844, "logits/rejected": 8.37929916381836, "logps/chosen": -211.44158935546875, "logps/rejected": -513.5240478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3684237003326416, "rewards/margins": 20.99271583557129, "rewards/rejected": -24.36113739013672, "step": 563 }, { "epoch": 0.35085536547433904, "grad_norm": 2.8383233547210693, "learning_rate": 2.4222222222222223e-06, "logits/chosen": 19.221206665039062, "logits/rejected": 93.83993530273438, "logps/chosen": -603.9064331054688, "logps/rejected": -813.9634399414062, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -10.701355934143066, "rewards/margins": 17.144973754882812, "rewards/rejected": -27.846328735351562, "step": 564 }, { "epoch": 0.3514774494556765, "grad_norm": 7.670842023799196e-07, "learning_rate": 2.4166666666666667e-06, "logits/chosen": -207.89566040039062, "logits/rejected": 31.57199478149414, "logps/chosen": -784.5374755859375, "logps/rejected": -2618.91162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.443561553955078, "rewards/margins": 62.17121124267578, "rewards/rejected": -74.61477661132812, "step": 565 }, { "epoch": 0.352099533437014, "grad_norm": 0.0008148363558575511, "learning_rate": 2.411111111111111e-06, "logits/chosen": -211.11575317382812, "logits/rejected": -18.84640884399414, "logps/chosen": -757.9737548828125, "logps/rejected": -2311.94189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.249677658081055, "rewards/margins": 46.82902145385742, "rewards/rejected": -64.07870483398438, "step": 566 }, { "epoch": 0.35272161741835145, "grad_norm": 0.14958453178405762, "learning_rate": 2.4055555555555555e-06, "logits/chosen": -68.69004821777344, "logits/rejected": 34.978755950927734, "logps/chosen": -418.025146484375, "logps/rejected": -691.0222778320312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.099625587463379, "rewards/margins": 20.950443267822266, "rewards/rejected": -27.05006980895996, "step": 567 }, { "epoch": 0.35334370139968896, "grad_norm": 0.20123806595802307, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -34.08427810668945, "logits/rejected": 44.57142639160156, "logps/chosen": -331.96337890625, "logps/rejected": -675.78515625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.940258502960205, "rewards/margins": 24.700469970703125, "rewards/rejected": -29.640727996826172, "step": 568 }, { "epoch": 0.35396578538102647, "grad_norm": 7.6890482902526855, "learning_rate": 2.3944444444444447e-06, "logits/chosen": -132.9723358154297, "logits/rejected": -24.86490249633789, "logps/chosen": -1485.1414794921875, "logps/rejected": -1868.260986328125, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": -20.06203269958496, "rewards/margins": 20.29733657836914, "rewards/rejected": -40.359371185302734, "step": 569 }, { "epoch": 0.3545878693623639, "grad_norm": 2.7767045497894287, "learning_rate": 2.388888888888889e-06, "logits/chosen": -16.59646987915039, "logits/rejected": -36.00761795043945, "logps/chosen": -806.3593139648438, "logps/rejected": -843.9041748046875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -4.745519638061523, "rewards/margins": 25.288211822509766, "rewards/rejected": -30.033733367919922, "step": 570 }, { "epoch": 0.3552099533437014, "grad_norm": 28.454931259155273, "learning_rate": 2.3833333333333335e-06, "logits/chosen": -49.27771759033203, "logits/rejected": 10.047475814819336, "logps/chosen": -504.58355712890625, "logps/rejected": -630.89013671875, "loss": 0.2616, "rewards/accuracies": 0.875, "rewards/chosen": -9.547435760498047, "rewards/margins": 10.821661949157715, "rewards/rejected": -20.369096755981445, "step": 571 }, { "epoch": 0.3558320373250389, "grad_norm": 3.039405601157341e-05, "learning_rate": 2.377777777777778e-06, "logits/chosen": -233.16433715820312, "logits/rejected": 39.46758270263672, "logps/chosen": -1209.5201416015625, "logps/rejected": -2428.56103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.46906089782715, "rewards/margins": 49.1942138671875, "rewards/rejected": -66.66327667236328, "step": 572 }, { "epoch": 0.3564541213063764, "grad_norm": 1334.62255859375, "learning_rate": 2.3722222222222223e-06, "logits/chosen": -36.86346435546875, "logits/rejected": 30.234554290771484, "logps/chosen": -1288.766845703125, "logps/rejected": -945.4560546875, "loss": 12.5678, "rewards/accuracies": 0.75, "rewards/chosen": -24.924528121948242, "rewards/margins": 0.2439953088760376, "rewards/rejected": -25.168521881103516, "step": 573 }, { "epoch": 0.35707620528771383, "grad_norm": 0.1380101442337036, "learning_rate": 2.3666666666666667e-06, "logits/chosen": -137.03709411621094, "logits/rejected": -69.22855377197266, "logps/chosen": -394.98443603515625, "logps/rejected": -586.0848999023438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.399110794067383, "rewards/margins": 14.193290710449219, "rewards/rejected": -20.592403411865234, "step": 574 }, { "epoch": 0.35769828926905134, "grad_norm": 0.0052089206874370575, "learning_rate": 2.361111111111111e-06, "logits/chosen": -164.9602813720703, "logits/rejected": 52.3336296081543, "logps/chosen": -391.48114013671875, "logps/rejected": -734.2557373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.019348621368408, "rewards/margins": 21.936927795410156, "rewards/rejected": -26.956275939941406, "step": 575 }, { "epoch": 0.3583203732503888, "grad_norm": 7.298961008928018e-06, "learning_rate": 2.3555555555555555e-06, "logits/chosen": -185.95272827148438, "logits/rejected": -2.403752326965332, "logps/chosen": -440.54486083984375, "logps/rejected": -1613.17236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6661062240600586, "rewards/margins": 28.380462646484375, "rewards/rejected": -32.04656982421875, "step": 576 }, { "epoch": 0.3589424572317263, "grad_norm": 0.0014296313747763634, "learning_rate": 2.35e-06, "logits/chosen": -52.272216796875, "logits/rejected": 67.93026733398438, "logps/chosen": -256.752685546875, "logps/rejected": -596.4014892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7895524501800537, "rewards/margins": 19.99996566772461, "rewards/rejected": -23.789520263671875, "step": 577 }, { "epoch": 0.35956454121306375, "grad_norm": 43.864864349365234, "learning_rate": 2.3444444444444448e-06, "logits/chosen": -113.0282211303711, "logits/rejected": 92.70934295654297, "logps/chosen": -597.6167602539062, "logps/rejected": -895.5853881835938, "loss": 0.2357, "rewards/accuracies": 0.875, "rewards/chosen": -5.9196319580078125, "rewards/margins": 17.71734619140625, "rewards/rejected": -23.63697624206543, "step": 578 }, { "epoch": 0.36018662519440126, "grad_norm": 1.5760114192962646, "learning_rate": 2.338888888888889e-06, "logits/chosen": -9.568958282470703, "logits/rejected": -17.735870361328125, "logps/chosen": -445.1705322265625, "logps/rejected": -614.3236083984375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -6.3857421875, "rewards/margins": 17.206356048583984, "rewards/rejected": -23.592100143432617, "step": 579 }, { "epoch": 0.3608087091757387, "grad_norm": 531.9171142578125, "learning_rate": 2.3333333333333336e-06, "logits/chosen": -81.9645767211914, "logits/rejected": -9.224945068359375, "logps/chosen": -835.51806640625, "logps/rejected": -922.8350830078125, "loss": 2.2872, "rewards/accuracies": 0.875, "rewards/chosen": -10.700523376464844, "rewards/margins": 20.093875885009766, "rewards/rejected": -30.79439926147461, "step": 580 }, { "epoch": 0.3614307931570762, "grad_norm": 861.4806518554688, "learning_rate": 2.327777777777778e-06, "logits/chosen": -83.49053192138672, "logits/rejected": -36.894691467285156, "logps/chosen": -1544.520263671875, "logps/rejected": -1797.271240234375, "loss": 0.3248, "rewards/accuracies": 0.875, "rewards/chosen": -20.73171043395996, "rewards/margins": 18.095443725585938, "rewards/rejected": -38.827152252197266, "step": 581 }, { "epoch": 0.36205287713841366, "grad_norm": 0.31022652983665466, "learning_rate": 2.3222222222222224e-06, "logits/chosen": -172.88304138183594, "logits/rejected": -40.24371337890625, "logps/chosen": -865.566650390625, "logps/rejected": -1898.8275146484375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.167162895202637, "rewards/margins": 31.16229820251465, "rewards/rejected": -39.32946014404297, "step": 582 }, { "epoch": 0.36267496111975117, "grad_norm": 78.37903594970703, "learning_rate": 2.316666666666667e-06, "logits/chosen": -99.82331848144531, "logits/rejected": -54.77117156982422, "logps/chosen": -580.80419921875, "logps/rejected": -711.6333618164062, "loss": 1.0649, "rewards/accuracies": 0.75, "rewards/chosen": -10.620256423950195, "rewards/margins": 14.304161071777344, "rewards/rejected": -24.92441749572754, "step": 583 }, { "epoch": 0.3632970451010886, "grad_norm": 9.525142669677734, "learning_rate": 2.311111111111111e-06, "logits/chosen": -53.69776916503906, "logits/rejected": 46.772151947021484, "logps/chosen": -765.6139526367188, "logps/rejected": -1288.7596435546875, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -9.688950538635254, "rewards/margins": 27.56381607055664, "rewards/rejected": -37.252769470214844, "step": 584 }, { "epoch": 0.36391912908242613, "grad_norm": 85.4580078125, "learning_rate": 2.305555555555556e-06, "logits/chosen": -119.37943267822266, "logits/rejected": 7.402251243591309, "logps/chosen": -667.3710327148438, "logps/rejected": -924.570556640625, "loss": 0.8335, "rewards/accuracies": 0.75, "rewards/chosen": -10.36562442779541, "rewards/margins": 18.250911712646484, "rewards/rejected": -28.616535186767578, "step": 585 }, { "epoch": 0.3645412130637636, "grad_norm": 0.3050767779350281, "learning_rate": 2.3000000000000004e-06, "logits/chosen": -82.48968505859375, "logits/rejected": -55.58576202392578, "logps/chosen": -483.3197937011719, "logps/rejected": -705.854736328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.277300834655762, "rewards/margins": 20.907207489013672, "rewards/rejected": -27.18450927734375, "step": 586 }, { "epoch": 0.3651632970451011, "grad_norm": 0.07923314720392227, "learning_rate": 2.294444444444445e-06, "logits/chosen": -145.64170837402344, "logits/rejected": 12.160036087036133, "logps/chosen": -368.4640197753906, "logps/rejected": -723.4093017578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.83080005645752, "rewards/margins": 21.79155921936035, "rewards/rejected": -30.622356414794922, "step": 587 }, { "epoch": 0.3657853810264386, "grad_norm": 0.09495560824871063, "learning_rate": 2.2888888888888892e-06, "logits/chosen": -51.720211029052734, "logits/rejected": 66.5475845336914, "logps/chosen": -390.1048583984375, "logps/rejected": -673.8599243164062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.884439468383789, "rewards/margins": 15.129334449768066, "rewards/rejected": -20.013774871826172, "step": 588 }, { "epoch": 0.36640746500777605, "grad_norm": 3.2170259952545166, "learning_rate": 2.2833333333333336e-06, "logits/chosen": -167.39239501953125, "logits/rejected": -11.971790313720703, "logps/chosen": -314.90203857421875, "logps/rejected": -718.659912109375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -6.959827899932861, "rewards/margins": 17.658077239990234, "rewards/rejected": -24.617904663085938, "step": 589 }, { "epoch": 0.36702954898911355, "grad_norm": 2.8344414234161377, "learning_rate": 2.277777777777778e-06, "logits/chosen": -87.245849609375, "logits/rejected": 2.2337474822998047, "logps/chosen": -542.1461181640625, "logps/rejected": -900.9862670898438, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -7.6205949783325195, "rewards/margins": 21.75766944885254, "rewards/rejected": -29.378265380859375, "step": 590 }, { "epoch": 0.367651632970451, "grad_norm": 0.04511585831642151, "learning_rate": 2.2722222222222224e-06, "logits/chosen": -114.44702911376953, "logits/rejected": 11.931731224060059, "logps/chosen": -730.4141235351562, "logps/rejected": -1668.9658203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.166756629943848, "rewards/margins": 33.38457489013672, "rewards/rejected": -39.55133056640625, "step": 591 }, { "epoch": 0.3682737169517885, "grad_norm": 37.129783630371094, "learning_rate": 2.266666666666667e-06, "logits/chosen": -156.915771484375, "logits/rejected": 28.41141700744629, "logps/chosen": -263.21044921875, "logps/rejected": -612.0789794921875, "loss": 0.1681, "rewards/accuracies": 0.875, "rewards/chosen": -7.897441864013672, "rewards/margins": 17.343170166015625, "rewards/rejected": -25.240612030029297, "step": 592 }, { "epoch": 0.36889580093312596, "grad_norm": 0.058156587183475494, "learning_rate": 2.2611111111111112e-06, "logits/chosen": -121.58743286132812, "logits/rejected": -34.45641326904297, "logps/chosen": -521.7720947265625, "logps/rejected": -745.9910888671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.344429016113281, "rewards/margins": 18.742578506469727, "rewards/rejected": -30.087007522583008, "step": 593 }, { "epoch": 0.36951788491446347, "grad_norm": 0.48914235830307007, "learning_rate": 2.2555555555555557e-06, "logits/chosen": -92.32192993164062, "logits/rejected": 67.59513854980469, "logps/chosen": -427.814697265625, "logps/rejected": -809.5601806640625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.8305888175964355, "rewards/margins": 22.327367782592773, "rewards/rejected": -28.157955169677734, "step": 594 }, { "epoch": 0.3701399688958009, "grad_norm": 0.0020152556244283915, "learning_rate": 2.25e-06, "logits/chosen": -64.32755279541016, "logits/rejected": -29.465993881225586, "logps/chosen": -523.4403076171875, "logps/rejected": -665.192138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.535962104797363, "rewards/margins": 19.718692779541016, "rewards/rejected": -26.254655838012695, "step": 595 }, { "epoch": 0.3707620528771384, "grad_norm": 0.09995375573635101, "learning_rate": 2.2444444444444445e-06, "logits/chosen": -112.38836669921875, "logits/rejected": 3.6374130249023438, "logps/chosen": -1004.1110229492188, "logps/rejected": -1621.420654296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.10578727722168, "rewards/margins": 30.56065559387207, "rewards/rejected": -40.66644287109375, "step": 596 }, { "epoch": 0.3713841368584759, "grad_norm": 5.144113063812256, "learning_rate": 2.238888888888889e-06, "logits/chosen": -27.603708267211914, "logits/rejected": -50.764190673828125, "logps/chosen": -980.617431640625, "logps/rejected": -1070.885498046875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -11.478384971618652, "rewards/margins": 19.908456802368164, "rewards/rejected": -31.3868408203125, "step": 597 }, { "epoch": 0.3720062208398134, "grad_norm": 8.989690059024724e-07, "learning_rate": 2.2333333333333333e-06, "logits/chosen": -143.34988403320312, "logits/rejected": -11.692249298095703, "logps/chosen": -332.810302734375, "logps/rejected": -664.9310913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.168551921844482, "rewards/margins": 25.125680923461914, "rewards/rejected": -31.294233322143555, "step": 598 }, { "epoch": 0.37262830482115084, "grad_norm": 1.042702206177637e-05, "learning_rate": 2.2277777777777777e-06, "logits/chosen": -174.20375061035156, "logits/rejected": -88.45613861083984, "logps/chosen": -431.92108154296875, "logps/rejected": -1329.857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.093454837799072, "rewards/margins": 28.173410415649414, "rewards/rejected": -34.26686477661133, "step": 599 }, { "epoch": 0.37325038880248834, "grad_norm": 0.0029989455360919237, "learning_rate": 2.222222222222222e-06, "logits/chosen": -16.385250091552734, "logits/rejected": 74.99683380126953, "logps/chosen": -494.3900146484375, "logps/rejected": -895.1744384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.749133586883545, "rewards/margins": 30.88367462158203, "rewards/rejected": -36.632808685302734, "step": 600 }, { "epoch": 0.3738724727838258, "grad_norm": 9.404666900634766, "learning_rate": 2.216666666666667e-06, "logits/chosen": -66.52561950683594, "logits/rejected": -31.62841796875, "logps/chosen": -1326.646728515625, "logps/rejected": -1213.0557861328125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -3.0750322341918945, "rewards/margins": 15.154412269592285, "rewards/rejected": -18.22944450378418, "step": 601 }, { "epoch": 0.3744945567651633, "grad_norm": 8.219594955444336, "learning_rate": 2.2111111111111113e-06, "logits/chosen": -142.11624145507812, "logits/rejected": 30.811321258544922, "logps/chosen": -574.8021240234375, "logps/rejected": -885.902099609375, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -7.985462665557861, "rewards/margins": 21.239744186401367, "rewards/rejected": -29.22520637512207, "step": 602 }, { "epoch": 0.37511664074650075, "grad_norm": 0.11127887666225433, "learning_rate": 2.2055555555555557e-06, "logits/chosen": -193.1872100830078, "logits/rejected": 21.30368995666504, "logps/chosen": -334.204833984375, "logps/rejected": -727.332763671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.521048545837402, "rewards/margins": 20.15285873413086, "rewards/rejected": -27.673908233642578, "step": 603 }, { "epoch": 0.37573872472783826, "grad_norm": 56.09501266479492, "learning_rate": 2.2e-06, "logits/chosen": -17.52008819580078, "logits/rejected": 1.2058982849121094, "logps/chosen": -466.9716491699219, "logps/rejected": -721.3602294921875, "loss": 0.4439, "rewards/accuracies": 0.875, "rewards/chosen": -8.79868221282959, "rewards/margins": 21.904537200927734, "rewards/rejected": -30.703218460083008, "step": 604 }, { "epoch": 0.37636080870917576, "grad_norm": 0.30513161420822144, "learning_rate": 2.1944444444444445e-06, "logits/chosen": -37.364097595214844, "logits/rejected": -51.587738037109375, "logps/chosen": -843.2265625, "logps/rejected": -1285.77490234375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.73627758026123, "rewards/margins": 13.51424503326416, "rewards/rejected": -22.25052261352539, "step": 605 }, { "epoch": 0.3769828926905132, "grad_norm": 1454.0654296875, "learning_rate": 2.188888888888889e-06, "logits/chosen": -71.0340576171875, "logits/rejected": -12.663619995117188, "logps/chosen": -898.2349853515625, "logps/rejected": -2114.133544921875, "loss": 0.2007, "rewards/accuracies": 0.875, "rewards/chosen": -2.4543628692626953, "rewards/margins": 15.316060066223145, "rewards/rejected": -17.770421981811523, "step": 606 }, { "epoch": 0.3776049766718507, "grad_norm": 0.8054802417755127, "learning_rate": 2.1833333333333333e-06, "logits/chosen": -133.4380645751953, "logits/rejected": -31.200092315673828, "logps/chosen": -373.4833679199219, "logps/rejected": -668.4326171875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -7.956595420837402, "rewards/margins": 18.575931549072266, "rewards/rejected": -26.53252601623535, "step": 607 }, { "epoch": 0.3782270606531882, "grad_norm": 397.105224609375, "learning_rate": 2.1777777777777777e-06, "logits/chosen": -70.80426025390625, "logits/rejected": -14.209436416625977, "logps/chosen": -1917.397216796875, "logps/rejected": -2349.953857421875, "loss": 0.1157, "rewards/accuracies": 0.875, "rewards/chosen": -4.041793346405029, "rewards/margins": 17.697311401367188, "rewards/rejected": -21.739105224609375, "step": 608 }, { "epoch": 0.3788491446345257, "grad_norm": 0.014335371553897858, "learning_rate": 2.1722222222222226e-06, "logits/chosen": -187.15628051757812, "logits/rejected": 6.733484268188477, "logps/chosen": -473.24017333984375, "logps/rejected": -950.6836547851562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.5804243087768555, "rewards/margins": 27.375957489013672, "rewards/rejected": -31.956382751464844, "step": 609 }, { "epoch": 0.37947122861586313, "grad_norm": 0.01722325012087822, "learning_rate": 2.166666666666667e-06, "logits/chosen": -114.68191528320312, "logits/rejected": 74.52108764648438, "logps/chosen": -709.3165283203125, "logps/rejected": -1398.282958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.265692710876465, "rewards/margins": 25.487876892089844, "rewards/rejected": -31.753572463989258, "step": 610 }, { "epoch": 0.38009331259720064, "grad_norm": 0.0009549583191983402, "learning_rate": 2.1611111111111114e-06, "logits/chosen": -81.60078430175781, "logits/rejected": -15.650975227355957, "logps/chosen": -239.14053344726562, "logps/rejected": -521.9845581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.845150947570801, "rewards/margins": 20.330509185791016, "rewards/rejected": -25.175662994384766, "step": 611 }, { "epoch": 0.3807153965785381, "grad_norm": 0.0001866208913270384, "learning_rate": 2.1555555555555558e-06, "logits/chosen": -194.98324584960938, "logits/rejected": -38.205413818359375, "logps/chosen": -423.7889709472656, "logps/rejected": -803.6785888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.382149696350098, "rewards/margins": 24.69231414794922, "rewards/rejected": -35.074462890625, "step": 612 }, { "epoch": 0.3813374805598756, "grad_norm": 0.5984663367271423, "learning_rate": 2.15e-06, "logits/chosen": -92.76805114746094, "logits/rejected": 46.405643463134766, "logps/chosen": -407.82763671875, "logps/rejected": -812.7186889648438, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -11.976581573486328, "rewards/margins": 24.423805236816406, "rewards/rejected": -36.400390625, "step": 613 }, { "epoch": 0.38195956454121305, "grad_norm": 2.985876562888734e-06, "learning_rate": 2.1444444444444446e-06, "logits/chosen": -112.91573333740234, "logits/rejected": 53.55718231201172, "logps/chosen": -309.55010986328125, "logps/rejected": -768.39306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.51696491241455, "rewards/margins": 24.15925407409668, "rewards/rejected": -33.67621994018555, "step": 614 }, { "epoch": 0.38258164852255055, "grad_norm": 0.7454193234443665, "learning_rate": 2.138888888888889e-06, "logits/chosen": -132.85391235351562, "logits/rejected": -6.822943687438965, "logps/chosen": -511.02093505859375, "logps/rejected": -758.37548828125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.259361267089844, "rewards/margins": 20.60987091064453, "rewards/rejected": -29.869230270385742, "step": 615 }, { "epoch": 0.383203732503888, "grad_norm": 0.2601203918457031, "learning_rate": 2.133333333333334e-06, "logits/chosen": -28.975269317626953, "logits/rejected": 86.46131134033203, "logps/chosen": -564.5445556640625, "logps/rejected": -1432.8876953125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.950493335723877, "rewards/margins": 18.61968421936035, "rewards/rejected": -25.57017707824707, "step": 616 }, { "epoch": 0.3838258164852255, "grad_norm": 4.089287176611833e-05, "learning_rate": 2.127777777777778e-06, "logits/chosen": -51.21128463745117, "logits/rejected": -28.334491729736328, "logps/chosen": -1088.528564453125, "logps/rejected": -1536.84619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.921379089355469, "rewards/margins": 25.620052337646484, "rewards/rejected": -41.54143142700195, "step": 617 }, { "epoch": 0.38444790046656296, "grad_norm": 14.667560577392578, "learning_rate": 2.1222222222222226e-06, "logits/chosen": -106.6644058227539, "logits/rejected": 17.593385696411133, "logps/chosen": -942.4255981445312, "logps/rejected": -1868.6220703125, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -8.998981475830078, "rewards/margins": 24.860937118530273, "rewards/rejected": -33.85991668701172, "step": 618 }, { "epoch": 0.38506998444790047, "grad_norm": 46.89704513549805, "learning_rate": 2.116666666666667e-06, "logits/chosen": -42.04169845581055, "logits/rejected": 60.90462112426758, "logps/chosen": -747.0496215820312, "logps/rejected": -1519.3692626953125, "loss": 0.4358, "rewards/accuracies": 0.875, "rewards/chosen": -7.780097007751465, "rewards/margins": 26.65879249572754, "rewards/rejected": -34.43888854980469, "step": 619 }, { "epoch": 0.3856920684292379, "grad_norm": 25.33188819885254, "learning_rate": 2.1111111111111114e-06, "logits/chosen": -133.68812561035156, "logits/rejected": 9.434602737426758, "logps/chosen": -372.1322021484375, "logps/rejected": -1462.08056640625, "loss": 0.1411, "rewards/accuracies": 0.875, "rewards/chosen": -9.200334548950195, "rewards/margins": 28.982280731201172, "rewards/rejected": -38.1826171875, "step": 620 }, { "epoch": 0.38631415241057543, "grad_norm": 0.0002119099663104862, "learning_rate": 2.105555555555556e-06, "logits/chosen": -129.2259979248047, "logits/rejected": 105.20929718017578, "logps/chosen": -274.5806884765625, "logps/rejected": -758.4302978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.094328880310059, "rewards/margins": 22.93634796142578, "rewards/rejected": -29.03067398071289, "step": 621 }, { "epoch": 0.38693623639191294, "grad_norm": 0.1862388402223587, "learning_rate": 2.1000000000000002e-06, "logits/chosen": -163.81405639648438, "logits/rejected": 28.384748458862305, "logps/chosen": -571.4024658203125, "logps/rejected": -995.376220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.451725006103516, "rewards/margins": 27.335735321044922, "rewards/rejected": -37.78745651245117, "step": 622 }, { "epoch": 0.3875583203732504, "grad_norm": 0.39005598425865173, "learning_rate": 2.0944444444444446e-06, "logits/chosen": -101.69953918457031, "logits/rejected": -81.94253540039062, "logps/chosen": -594.2868041992188, "logps/rejected": -855.71435546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.625045776367188, "rewards/margins": 23.445173263549805, "rewards/rejected": -32.07021713256836, "step": 623 }, { "epoch": 0.3881804043545879, "grad_norm": 0.12650153040885925, "learning_rate": 2.088888888888889e-06, "logits/chosen": -169.94459533691406, "logits/rejected": -4.104953765869141, "logps/chosen": -266.11480712890625, "logps/rejected": -643.60888671875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.39999532699585, "rewards/margins": 21.298187255859375, "rewards/rejected": -26.698183059692383, "step": 624 }, { "epoch": 0.38880248833592534, "grad_norm": 0.5524210333824158, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -135.94186401367188, "logits/rejected": 12.509674072265625, "logps/chosen": -453.7965087890625, "logps/rejected": -804.5779418945312, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.588127136230469, "rewards/margins": 25.513492584228516, "rewards/rejected": -33.101619720458984, "step": 625 }, { "epoch": 0.38942457231726285, "grad_norm": 0.26446598768234253, "learning_rate": 2.077777777777778e-06, "logits/chosen": -161.56442260742188, "logits/rejected": -0.44899463653564453, "logps/chosen": -354.1595764160156, "logps/rejected": -817.5521240234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.972655296325684, "rewards/margins": 26.82430076599121, "rewards/rejected": -34.796958923339844, "step": 626 }, { "epoch": 0.3900466562986003, "grad_norm": 0.0017269800882786512, "learning_rate": 2.0722222222222222e-06, "logits/chosen": 34.18122100830078, "logits/rejected": 53.35512161254883, "logps/chosen": -642.6044921875, "logps/rejected": -931.87939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.293821334838867, "rewards/margins": 27.351917266845703, "rewards/rejected": -37.6457405090332, "step": 627 }, { "epoch": 0.3906687402799378, "grad_norm": 0.10534560680389404, "learning_rate": 2.0666666666666666e-06, "logits/chosen": -146.35899353027344, "logits/rejected": -20.856992721557617, "logps/chosen": -514.1100463867188, "logps/rejected": -787.3795166015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.936281204223633, "rewards/margins": 21.363506317138672, "rewards/rejected": -34.29978942871094, "step": 628 }, { "epoch": 0.39129082426127526, "grad_norm": 42.68144226074219, "learning_rate": 2.061111111111111e-06, "logits/chosen": -199.16421508789062, "logits/rejected": 7.059661865234375, "logps/chosen": -311.6868896484375, "logps/rejected": -1079.119384765625, "loss": 0.3537, "rewards/accuracies": 0.875, "rewards/chosen": -6.248361110687256, "rewards/margins": 28.20779037475586, "rewards/rejected": -34.456153869628906, "step": 629 }, { "epoch": 0.39191290824261277, "grad_norm": 767.7793579101562, "learning_rate": 2.0555555555555555e-06, "logits/chosen": -78.69597625732422, "logits/rejected": 52.3643913269043, "logps/chosen": -1084.8243408203125, "logps/rejected": -1700.9754638671875, "loss": 2.9738, "rewards/accuracies": 0.875, "rewards/chosen": -11.405563354492188, "rewards/margins": 26.698448181152344, "rewards/rejected": -38.10401153564453, "step": 630 }, { "epoch": 0.3925349922239502, "grad_norm": 0.406760036945343, "learning_rate": 2.05e-06, "logits/chosen": -84.87580871582031, "logits/rejected": 25.160152435302734, "logps/chosen": -349.681396484375, "logps/rejected": -677.6912841796875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -5.005940914154053, "rewards/margins": 25.5880126953125, "rewards/rejected": -30.593955993652344, "step": 631 }, { "epoch": 0.3931570762052877, "grad_norm": 0.022369414567947388, "learning_rate": 2.0444444444444447e-06, "logits/chosen": -152.44244384765625, "logits/rejected": 9.739723205566406, "logps/chosen": -524.9042358398438, "logps/rejected": -858.0196533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.992812633514404, "rewards/margins": 21.894689559936523, "rewards/rejected": -26.88750457763672, "step": 632 }, { "epoch": 0.3937791601866252, "grad_norm": 877.775390625, "learning_rate": 2.038888888888889e-06, "logits/chosen": -144.58380126953125, "logits/rejected": -43.58372116088867, "logps/chosen": -1413.16455078125, "logps/rejected": -896.6156005859375, "loss": 6.6883, "rewards/accuracies": 0.75, "rewards/chosen": -14.016287803649902, "rewards/margins": 8.289559364318848, "rewards/rejected": -22.30584716796875, "step": 633 }, { "epoch": 0.3944012441679627, "grad_norm": 299.9452819824219, "learning_rate": 2.0333333333333335e-06, "logits/chosen": -25.185104370117188, "logits/rejected": 17.668020248413086, "logps/chosen": -611.83837890625, "logps/rejected": -990.292236328125, "loss": 0.5059, "rewards/accuracies": 0.875, "rewards/chosen": -6.49157190322876, "rewards/margins": 20.811405181884766, "rewards/rejected": -27.302978515625, "step": 634 }, { "epoch": 0.39502332814930013, "grad_norm": 0.0001227120083058253, "learning_rate": 2.027777777777778e-06, "logits/chosen": -89.3143081665039, "logits/rejected": 68.9333724975586, "logps/chosen": -364.33453369140625, "logps/rejected": -760.9310302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.161453723907471, "rewards/margins": 31.042577743530273, "rewards/rejected": -37.20403289794922, "step": 635 }, { "epoch": 0.39564541213063764, "grad_norm": 2.9567978344857693e-05, "learning_rate": 2.0222222222222223e-06, "logits/chosen": -94.53516387939453, "logits/rejected": 103.82453155517578, "logps/chosen": -360.36920166015625, "logps/rejected": -831.7943115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.316364288330078, "rewards/margins": 31.500768661499023, "rewards/rejected": -39.81713104248047, "step": 636 }, { "epoch": 0.3962674961119751, "grad_norm": 212.62533569335938, "learning_rate": 2.0166666666666667e-06, "logits/chosen": -32.55740737915039, "logits/rejected": -7.809814453125, "logps/chosen": -496.3062438964844, "logps/rejected": -927.9765625, "loss": 0.6006, "rewards/accuracies": 0.875, "rewards/chosen": -7.686923980712891, "rewards/margins": 16.08425521850586, "rewards/rejected": -23.77117919921875, "step": 637 }, { "epoch": 0.3968895800933126, "grad_norm": 86.22675323486328, "learning_rate": 2.011111111111111e-06, "logits/chosen": -99.4027099609375, "logits/rejected": 10.640411376953125, "logps/chosen": -885.7298583984375, "logps/rejected": -1405.05615234375, "loss": 2.7366, "rewards/accuracies": 0.875, "rewards/chosen": -11.96135425567627, "rewards/margins": 25.107240676879883, "rewards/rejected": -37.06859588623047, "step": 638 }, { "epoch": 0.39751166407465005, "grad_norm": 100.03654479980469, "learning_rate": 2.0055555555555555e-06, "logits/chosen": -82.22471618652344, "logits/rejected": 60.92601776123047, "logps/chosen": -481.6571350097656, "logps/rejected": -864.4533081054688, "loss": 0.3789, "rewards/accuracies": 0.875, "rewards/chosen": -5.588788986206055, "rewards/margins": 26.069128036499023, "rewards/rejected": -31.657917022705078, "step": 639 }, { "epoch": 0.39813374805598756, "grad_norm": 2.553518772125244, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -109.14447021484375, "logits/rejected": 63.156063079833984, "logps/chosen": -484.38323974609375, "logps/rejected": -835.3967895507812, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -6.852179527282715, "rewards/margins": 23.01146125793457, "rewards/rejected": -29.86363983154297, "step": 640 }, { "epoch": 0.39875583203732506, "grad_norm": 3.4517709082138026e-06, "learning_rate": 1.9944444444444447e-06, "logits/chosen": -16.416790008544922, "logits/rejected": 132.94276428222656, "logps/chosen": -510.31768798828125, "logps/rejected": -903.2525634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.151825904846191, "rewards/margins": 28.988466262817383, "rewards/rejected": -41.140289306640625, "step": 641 }, { "epoch": 0.3993779160186625, "grad_norm": 0.004211324267089367, "learning_rate": 1.988888888888889e-06, "logits/chosen": -152.37393188476562, "logits/rejected": -13.606033325195312, "logps/chosen": -409.6802978515625, "logps/rejected": -792.8511352539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.504545211791992, "rewards/margins": 24.69369125366211, "rewards/rejected": -34.19823455810547, "step": 642 }, { "epoch": 0.4, "grad_norm": 0.17244529724121094, "learning_rate": 1.9833333333333335e-06, "logits/chosen": -108.60623168945312, "logits/rejected": 50.22208786010742, "logps/chosen": -363.12939453125, "logps/rejected": -812.1402587890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.111898899078369, "rewards/margins": 25.02814483642578, "rewards/rejected": -31.140043258666992, "step": 643 }, { "epoch": 0.4006220839813375, "grad_norm": 0.00034380503348074853, "learning_rate": 1.977777777777778e-06, "logits/chosen": -108.44320678710938, "logits/rejected": 45.1851692199707, "logps/chosen": -421.476318359375, "logps/rejected": -1163.824462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.222896099090576, "rewards/margins": 33.559814453125, "rewards/rejected": -37.782711029052734, "step": 644 }, { "epoch": 0.401244167962675, "grad_norm": 0.0049946000799536705, "learning_rate": 1.9722222222222224e-06, "logits/chosen": -97.761474609375, "logits/rejected": 4.405635833740234, "logps/chosen": -408.9325256347656, "logps/rejected": -1494.8212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.6619462966918945, "rewards/margins": 34.33782958984375, "rewards/rejected": -41.99977111816406, "step": 645 }, { "epoch": 0.40186625194401243, "grad_norm": 0.013779985718429089, "learning_rate": 1.9666666666666668e-06, "logits/chosen": -186.8841552734375, "logits/rejected": 44.72999572753906, "logps/chosen": -380.64971923828125, "logps/rejected": -814.0526123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.534713745117188, "rewards/margins": 25.545106887817383, "rewards/rejected": -34.07981872558594, "step": 646 }, { "epoch": 0.40248833592534994, "grad_norm": 7.518558979034424, "learning_rate": 1.9611111111111116e-06, "logits/chosen": -133.4121551513672, "logits/rejected": 19.714656829833984, "logps/chosen": -651.8238525390625, "logps/rejected": -2336.634521484375, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -8.024836540222168, "rewards/margins": 24.043197631835938, "rewards/rejected": -32.06803512573242, "step": 647 }, { "epoch": 0.4031104199066874, "grad_norm": 0.0001080551664927043, "learning_rate": 1.955555555555556e-06, "logits/chosen": -122.52125549316406, "logits/rejected": 51.81283950805664, "logps/chosen": -414.55255126953125, "logps/rejected": -1342.5406494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.878800392150879, "rewards/margins": 29.33898162841797, "rewards/rejected": -35.21778106689453, "step": 648 }, { "epoch": 0.4037325038880249, "grad_norm": 0.0004664478765334934, "learning_rate": 1.9500000000000004e-06, "logits/chosen": -78.87663269042969, "logits/rejected": 70.33039855957031, "logps/chosen": -1363.2222900390625, "logps/rejected": -1906.75341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.070587158203125, "rewards/margins": 28.2924861907959, "rewards/rejected": -46.363075256347656, "step": 649 }, { "epoch": 0.40435458786936235, "grad_norm": 52.658958435058594, "learning_rate": 1.944444444444445e-06, "logits/chosen": -6.126832962036133, "logits/rejected": 7.629493713378906, "logps/chosen": -504.5369873046875, "logps/rejected": -697.6328125, "loss": 0.4755, "rewards/accuracies": 0.875, "rewards/chosen": -9.267518997192383, "rewards/margins": 23.434417724609375, "rewards/rejected": -32.701934814453125, "step": 650 }, { "epoch": 0.40497667185069985, "grad_norm": 0.5209815502166748, "learning_rate": 1.938888888888889e-06, "logits/chosen": -121.06184387207031, "logits/rejected": 23.749065399169922, "logps/chosen": -380.817138671875, "logps/rejected": -807.7526245117188, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -11.429553985595703, "rewards/margins": 30.674301147460938, "rewards/rejected": -42.10385513305664, "step": 651 }, { "epoch": 0.4055987558320373, "grad_norm": 60.56729507446289, "learning_rate": 1.9333333333333336e-06, "logits/chosen": 19.702457427978516, "logits/rejected": 110.11468505859375, "logps/chosen": -371.5089416503906, "logps/rejected": -658.2398681640625, "loss": 0.1856, "rewards/accuracies": 0.875, "rewards/chosen": -10.270659446716309, "rewards/margins": 20.151500701904297, "rewards/rejected": -30.422161102294922, "step": 652 }, { "epoch": 0.4062208398133748, "grad_norm": 2.3228983879089355, "learning_rate": 1.927777777777778e-06, "logits/chosen": -151.02757263183594, "logits/rejected": -8.273486137390137, "logps/chosen": -548.06982421875, "logps/rejected": -788.784912109375, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -9.983125686645508, "rewards/margins": 18.303466796875, "rewards/rejected": -28.286590576171875, "step": 653 }, { "epoch": 0.40684292379471226, "grad_norm": 12.760083198547363, "learning_rate": 1.9222222222222224e-06, "logits/chosen": -141.07003784179688, "logits/rejected": -41.08721160888672, "logps/chosen": -475.43267822265625, "logps/rejected": -822.8726806640625, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -5.151067733764648, "rewards/margins": 28.453012466430664, "rewards/rejected": -33.60408020019531, "step": 654 }, { "epoch": 0.40746500777604977, "grad_norm": 2.503775119781494, "learning_rate": 1.916666666666667e-06, "logits/chosen": 48.98674774169922, "logits/rejected": 52.28057861328125, "logps/chosen": -1297.3560791015625, "logps/rejected": -1273.932861328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -15.868106842041016, "rewards/margins": 18.652477264404297, "rewards/rejected": -34.52058410644531, "step": 655 }, { "epoch": 0.4080870917573872, "grad_norm": 0.014395114034414291, "learning_rate": 1.9111111111111112e-06, "logits/chosen": -92.49978637695312, "logits/rejected": -2.600125312805176, "logps/chosen": -262.1980895996094, "logps/rejected": -632.6068115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.913809299468994, "rewards/margins": 26.556716918945312, "rewards/rejected": -33.47052764892578, "step": 656 }, { "epoch": 0.40870917573872473, "grad_norm": 0.4067128896713257, "learning_rate": 1.9055555555555558e-06, "logits/chosen": -36.86445617675781, "logits/rejected": -18.49867820739746, "logps/chosen": -574.934326171875, "logps/rejected": -875.3681640625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -14.891278266906738, "rewards/margins": 24.616798400878906, "rewards/rejected": -39.508079528808594, "step": 657 }, { "epoch": 0.40933125972006223, "grad_norm": 0.03244220092892647, "learning_rate": 1.9000000000000002e-06, "logits/chosen": -128.7218780517578, "logits/rejected": 53.10542297363281, "logps/chosen": -311.9930114746094, "logps/rejected": -659.8009643554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.43799352645874, "rewards/margins": 19.681873321533203, "rewards/rejected": -27.119869232177734, "step": 658 }, { "epoch": 0.4099533437013997, "grad_norm": 0.9425063133239746, "learning_rate": 1.8944444444444446e-06, "logits/chosen": -24.558128356933594, "logits/rejected": 148.08770751953125, "logps/chosen": -518.3831787109375, "logps/rejected": -864.6934204101562, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -8.965529441833496, "rewards/margins": 22.957069396972656, "rewards/rejected": -31.92259979248047, "step": 659 }, { "epoch": 0.4105754276827372, "grad_norm": 0.00014337169704958797, "learning_rate": 1.888888888888889e-06, "logits/chosen": -145.640869140625, "logits/rejected": 37.357322692871094, "logps/chosen": -442.11566162109375, "logps/rejected": -868.1012573242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.173210144042969, "rewards/margins": 22.604381561279297, "rewards/rejected": -33.777591705322266, "step": 660 }, { "epoch": 0.41119751166407464, "grad_norm": 0.03413870185613632, "learning_rate": 1.8833333333333334e-06, "logits/chosen": -164.5783233642578, "logits/rejected": 10.176637649536133, "logps/chosen": -444.73834228515625, "logps/rejected": -2449.811767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.66865873336792, "rewards/margins": 30.468387603759766, "rewards/rejected": -36.137046813964844, "step": 661 }, { "epoch": 0.41181959564541215, "grad_norm": 486.0137634277344, "learning_rate": 1.8777777777777778e-06, "logits/chosen": -51.05937576293945, "logits/rejected": 22.034088134765625, "logps/chosen": -2208.51708984375, "logps/rejected": -2034.6881103515625, "loss": 3.052, "rewards/accuracies": 0.875, "rewards/chosen": -10.770562171936035, "rewards/margins": 19.41939353942871, "rewards/rejected": -30.189958572387695, "step": 662 }, { "epoch": 0.4124416796267496, "grad_norm": 47.00955581665039, "learning_rate": 1.8722222222222225e-06, "logits/chosen": -14.684051513671875, "logits/rejected": 8.471969604492188, "logps/chosen": -449.7879943847656, "logps/rejected": -594.7542114257812, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": -9.380927085876465, "rewards/margins": 17.106979370117188, "rewards/rejected": -26.487905502319336, "step": 663 }, { "epoch": 0.4130637636080871, "grad_norm": 6.235434055328369, "learning_rate": 1.8666666666666669e-06, "logits/chosen": -85.09210968017578, "logits/rejected": -0.3258938789367676, "logps/chosen": -519.4788818359375, "logps/rejected": -793.679443359375, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -12.450668334960938, "rewards/margins": 23.570507049560547, "rewards/rejected": -36.02117156982422, "step": 664 }, { "epoch": 0.41368584758942456, "grad_norm": 24.491966247558594, "learning_rate": 1.8611111111111113e-06, "logits/chosen": -151.94400024414062, "logits/rejected": -76.71399688720703, "logps/chosen": -217.75730895996094, "logps/rejected": -535.067138671875, "loss": 0.1757, "rewards/accuracies": 0.875, "rewards/chosen": -3.097677230834961, "rewards/margins": 21.261213302612305, "rewards/rejected": -24.358890533447266, "step": 665 }, { "epoch": 0.41430793157076207, "grad_norm": 0.598987877368927, "learning_rate": 1.8555555555555557e-06, "logits/chosen": -119.86331176757812, "logits/rejected": 33.68067169189453, "logps/chosen": -1065.724365234375, "logps/rejected": -1624.5645751953125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.903199195861816, "rewards/margins": 25.08647918701172, "rewards/rejected": -30.98967742919922, "step": 666 }, { "epoch": 0.4149300155520995, "grad_norm": 38.270931243896484, "learning_rate": 1.85e-06, "logits/chosen": -27.1868839263916, "logits/rejected": 7.609567642211914, "logps/chosen": -373.2843322753906, "logps/rejected": -768.737548828125, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": -7.881779670715332, "rewards/margins": 26.574831008911133, "rewards/rejected": -34.456607818603516, "step": 667 }, { "epoch": 0.415552099533437, "grad_norm": 0.02267165295779705, "learning_rate": 1.8444444444444445e-06, "logits/chosen": -112.08218383789062, "logits/rejected": 27.71953582763672, "logps/chosen": -1042.07275390625, "logps/rejected": -1129.5087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.801518440246582, "rewards/margins": 19.48831558227539, "rewards/rejected": -34.289833068847656, "step": 668 }, { "epoch": 0.4161741835147745, "grad_norm": 1.666570454972316e-07, "learning_rate": 1.8388888888888889e-06, "logits/chosen": -161.90579223632812, "logits/rejected": 10.375336647033691, "logps/chosen": -373.11322021484375, "logps/rejected": -872.2646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.761898040771484, "rewards/margins": 34.57844543457031, "rewards/rejected": -40.34034729003906, "step": 669 }, { "epoch": 0.416796267496112, "grad_norm": 1224.45068359375, "learning_rate": 1.8333333333333333e-06, "logits/chosen": -110.13895416259766, "logits/rejected": 23.377351760864258, "logps/chosen": -686.072509765625, "logps/rejected": -1578.144775390625, "loss": 2.7911, "rewards/accuracies": 0.875, "rewards/chosen": -4.076228141784668, "rewards/margins": 22.31646728515625, "rewards/rejected": -26.392696380615234, "step": 670 }, { "epoch": 0.41741835147744943, "grad_norm": 125.89705657958984, "learning_rate": 1.8277777777777781e-06, "logits/chosen": -67.06732177734375, "logits/rejected": 13.651888847351074, "logps/chosen": -340.56256103515625, "logps/rejected": -630.1422729492188, "loss": 1.2385, "rewards/accuracies": 0.875, "rewards/chosen": -7.8657026290893555, "rewards/margins": 24.01036262512207, "rewards/rejected": -31.87606430053711, "step": 671 }, { "epoch": 0.41804043545878694, "grad_norm": 0.3765208423137665, "learning_rate": 1.8222222222222225e-06, "logits/chosen": -100.89735412597656, "logits/rejected": 32.53485870361328, "logps/chosen": -515.4548950195312, "logps/rejected": -804.4092407226562, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.394279479980469, "rewards/margins": 23.351259231567383, "rewards/rejected": -28.74553680419922, "step": 672 }, { "epoch": 0.4186625194401244, "grad_norm": 0.043067559599876404, "learning_rate": 1.816666666666667e-06, "logits/chosen": -251.77734375, "logits/rejected": -21.537763595581055, "logps/chosen": -203.25003051757812, "logps/rejected": -753.7747802734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.019883155822754, "rewards/margins": 28.371501922607422, "rewards/rejected": -34.391387939453125, "step": 673 }, { "epoch": 0.4192846034214619, "grad_norm": 43.743896484375, "learning_rate": 1.8111111111111113e-06, "logits/chosen": -110.60884094238281, "logits/rejected": -11.425204277038574, "logps/chosen": -523.369873046875, "logps/rejected": -809.807861328125, "loss": 0.2399, "rewards/accuracies": 0.875, "rewards/chosen": -10.413145065307617, "rewards/margins": 23.758867263793945, "rewards/rejected": -34.17201232910156, "step": 674 }, { "epoch": 0.4199066874027994, "grad_norm": 37.91448211669922, "learning_rate": 1.8055555555555557e-06, "logits/chosen": -63.768798828125, "logits/rejected": 61.90055847167969, "logps/chosen": -486.90380859375, "logps/rejected": -935.5989990234375, "loss": 0.1493, "rewards/accuracies": 0.875, "rewards/chosen": -4.215798377990723, "rewards/margins": 22.192340850830078, "rewards/rejected": -26.408138275146484, "step": 675 }, { "epoch": 0.42052877138413686, "grad_norm": 84.39241027832031, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -122.06509399414062, "logits/rejected": 32.813720703125, "logps/chosen": -533.413330078125, "logps/rejected": -821.0714111328125, "loss": 0.6924, "rewards/accuracies": 0.75, "rewards/chosen": -8.633810997009277, "rewards/margins": 20.466468811035156, "rewards/rejected": -29.100278854370117, "step": 676 }, { "epoch": 0.42115085536547436, "grad_norm": 37.38468551635742, "learning_rate": 1.7944444444444445e-06, "logits/chosen": -36.77057647705078, "logits/rejected": 19.19281005859375, "logps/chosen": -451.4635009765625, "logps/rejected": -651.059326171875, "loss": 0.4474, "rewards/accuracies": 0.875, "rewards/chosen": -10.318145751953125, "rewards/margins": 14.281937599182129, "rewards/rejected": -24.60008430480957, "step": 677 }, { "epoch": 0.4217729393468118, "grad_norm": 0.008393422700464725, "learning_rate": 1.788888888888889e-06, "logits/chosen": -136.81988525390625, "logits/rejected": 11.300825119018555, "logps/chosen": -274.952392578125, "logps/rejected": -607.551513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.044919967651367, "rewards/margins": 20.219356536865234, "rewards/rejected": -25.2642765045166, "step": 678 }, { "epoch": 0.4223950233281493, "grad_norm": 1.795506068447139e-05, "learning_rate": 1.7833333333333336e-06, "logits/chosen": -184.8307647705078, "logits/rejected": 30.00881576538086, "logps/chosen": -906.047607421875, "logps/rejected": -2296.1875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.545950889587402, "rewards/margins": 28.065431594848633, "rewards/rejected": -36.611385345458984, "step": 679 }, { "epoch": 0.4230171073094868, "grad_norm": 0.03637096658349037, "learning_rate": 1.777777777777778e-06, "logits/chosen": -116.86735534667969, "logits/rejected": 82.61153411865234, "logps/chosen": -328.0142822265625, "logps/rejected": -900.8728637695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.515879154205322, "rewards/margins": 19.080745697021484, "rewards/rejected": -25.59662437438965, "step": 680 }, { "epoch": 0.4236391912908243, "grad_norm": 5.365406607893419e-08, "learning_rate": 1.7722222222222224e-06, "logits/chosen": -80.28112030029297, "logits/rejected": 24.580060958862305, "logps/chosen": -1502.774658203125, "logps/rejected": -1987.80029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.475521087646484, "rewards/margins": 28.35834503173828, "rewards/rejected": -41.8338623046875, "step": 681 }, { "epoch": 0.42426127527216173, "grad_norm": 0.23309074342250824, "learning_rate": 1.7666666666666668e-06, "logits/chosen": -25.303722381591797, "logits/rejected": -22.517988204956055, "logps/chosen": -324.21014404296875, "logps/rejected": -517.8095703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.970341682434082, "rewards/margins": 20.696395874023438, "rewards/rejected": -25.666736602783203, "step": 682 }, { "epoch": 0.42488335925349924, "grad_norm": 6.045684131095186e-05, "learning_rate": 1.7611111111111112e-06, "logits/chosen": -173.89187622070312, "logits/rejected": -44.92247772216797, "logps/chosen": -511.6766357421875, "logps/rejected": -787.9351196289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2953310012817383, "rewards/margins": 23.62456703186035, "rewards/rejected": -26.919898986816406, "step": 683 }, { "epoch": 0.4255054432348367, "grad_norm": 4.7863588406471536e-05, "learning_rate": 1.7555555555555556e-06, "logits/chosen": -191.41952514648438, "logits/rejected": 43.73417663574219, "logps/chosen": -336.21435546875, "logps/rejected": -816.8866577148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.471943378448486, "rewards/margins": 28.435672760009766, "rewards/rejected": -33.907615661621094, "step": 684 }, { "epoch": 0.4261275272161742, "grad_norm": 12.498409271240234, "learning_rate": 1.75e-06, "logits/chosen": -86.87174987792969, "logits/rejected": 46.648521423339844, "logps/chosen": -415.39862060546875, "logps/rejected": -680.6744384765625, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -6.712294578552246, "rewards/margins": 19.301076889038086, "rewards/rejected": -26.01337432861328, "step": 685 }, { "epoch": 0.42674961119751165, "grad_norm": 6.335072976071388e-05, "learning_rate": 1.7444444444444448e-06, "logits/chosen": -122.83106231689453, "logits/rejected": 68.56353759765625, "logps/chosen": -354.56005859375, "logps/rejected": -747.1412353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.719512939453125, "rewards/margins": 21.17591094970703, "rewards/rejected": -27.895423889160156, "step": 686 }, { "epoch": 0.42737169517884915, "grad_norm": 92.6241226196289, "learning_rate": 1.7388888888888892e-06, "logits/chosen": -98.55459594726562, "logits/rejected": 37.2609748840332, "logps/chosen": -434.0101623535156, "logps/rejected": -709.33935546875, "loss": 2.4705, "rewards/accuracies": 0.875, "rewards/chosen": -8.763077735900879, "rewards/margins": 16.424898147583008, "rewards/rejected": -25.187976837158203, "step": 687 }, { "epoch": 0.4279937791601866, "grad_norm": 2.7207908630371094, "learning_rate": 1.7333333333333336e-06, "logits/chosen": -197.36843872070312, "logits/rejected": 10.147337913513184, "logps/chosen": -365.2086181640625, "logps/rejected": -741.259765625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -8.264455795288086, "rewards/margins": 19.000343322753906, "rewards/rejected": -27.264799118041992, "step": 688 }, { "epoch": 0.4286158631415241, "grad_norm": 0.002626116154715419, "learning_rate": 1.727777777777778e-06, "logits/chosen": -119.55145263671875, "logits/rejected": 55.331912994384766, "logps/chosen": -481.59521484375, "logps/rejected": -1353.3203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.29876708984375, "rewards/margins": 20.46005630493164, "rewards/rejected": -26.75882339477539, "step": 689 }, { "epoch": 0.42923794712286156, "grad_norm": 22.57261085510254, "learning_rate": 1.7222222222222224e-06, "logits/chosen": -173.70523071289062, "logits/rejected": -51.59834289550781, "logps/chosen": -849.1958618164062, "logps/rejected": -1357.558837890625, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -8.69334602355957, "rewards/margins": 16.638965606689453, "rewards/rejected": -25.332311630249023, "step": 690 }, { "epoch": 0.42986003110419907, "grad_norm": 99.55349731445312, "learning_rate": 1.7166666666666668e-06, "logits/chosen": -112.88703918457031, "logits/rejected": -17.740304946899414, "logps/chosen": -424.35748291015625, "logps/rejected": -711.3453369140625, "loss": 1.1824, "rewards/accuracies": 0.875, "rewards/chosen": -6.148022651672363, "rewards/margins": 12.532917976379395, "rewards/rejected": -18.68094253540039, "step": 691 }, { "epoch": 0.4304821150855365, "grad_norm": 33.83268356323242, "learning_rate": 1.7111111111111112e-06, "logits/chosen": -131.600341796875, "logits/rejected": -16.9444580078125, "logps/chosen": -441.9706726074219, "logps/rejected": -958.176513671875, "loss": 0.1516, "rewards/accuracies": 0.875, "rewards/chosen": -8.11992359161377, "rewards/margins": 15.563799858093262, "rewards/rejected": -23.68372344970703, "step": 692 }, { "epoch": 0.431104199066874, "grad_norm": 0.3491835594177246, "learning_rate": 1.7055555555555556e-06, "logits/chosen": -116.65411376953125, "logits/rejected": 6.314001083374023, "logps/chosen": -1973.502197265625, "logps/rejected": -3368.041259765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -14.595005989074707, "rewards/margins": 24.281784057617188, "rewards/rejected": -38.876792907714844, "step": 693 }, { "epoch": 0.43172628304821153, "grad_norm": 0.4353962242603302, "learning_rate": 1.7000000000000002e-06, "logits/chosen": -67.74949645996094, "logits/rejected": -82.28294372558594, "logps/chosen": -430.0411376953125, "logps/rejected": -618.2429809570312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.494540691375732, "rewards/margins": 19.550899505615234, "rewards/rejected": -25.045440673828125, "step": 694 }, { "epoch": 0.432348367029549, "grad_norm": 0.01532625313848257, "learning_rate": 1.6944444444444446e-06, "logits/chosen": -103.94966888427734, "logits/rejected": 11.359590530395508, "logps/chosen": -343.24884033203125, "logps/rejected": -658.8260498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.142261981964111, "rewards/margins": 23.5025634765625, "rewards/rejected": -29.644824981689453, "step": 695 }, { "epoch": 0.4329704510108865, "grad_norm": 0.010948620736598969, "learning_rate": 1.688888888888889e-06, "logits/chosen": -170.687744140625, "logits/rejected": 63.31150436401367, "logps/chosen": -383.587890625, "logps/rejected": -905.762939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.83194637298584, "rewards/margins": 26.36358642578125, "rewards/rejected": -31.195533752441406, "step": 696 }, { "epoch": 0.43359253499222394, "grad_norm": 422.81329345703125, "learning_rate": 1.6833333333333335e-06, "logits/chosen": -22.62029457092285, "logits/rejected": 76.77090454101562, "logps/chosen": -920.0440673828125, "logps/rejected": -1116.374755859375, "loss": 0.503, "rewards/accuracies": 0.875, "rewards/chosen": -10.86777114868164, "rewards/margins": 17.08026123046875, "rewards/rejected": -27.94803237915039, "step": 697 }, { "epoch": 0.43421461897356145, "grad_norm": 881.690185546875, "learning_rate": 1.6777777777777779e-06, "logits/chosen": -117.57392120361328, "logits/rejected": -129.6474151611328, "logps/chosen": -1602.8016357421875, "logps/rejected": -818.41064453125, "loss": 5.7195, "rewards/accuracies": 0.875, "rewards/chosen": -14.428415298461914, "rewards/margins": 13.144752502441406, "rewards/rejected": -27.573169708251953, "step": 698 }, { "epoch": 0.4348367029548989, "grad_norm": 2.5813629627227783, "learning_rate": 1.6722222222222223e-06, "logits/chosen": -131.7569580078125, "logits/rejected": 41.19657897949219, "logps/chosen": -290.69879150390625, "logps/rejected": -624.9320068359375, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -2.3801627159118652, "rewards/margins": 19.26412582397461, "rewards/rejected": -21.644289016723633, "step": 699 }, { "epoch": 0.4354587869362364, "grad_norm": 0.00012219342170283198, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -152.80975341796875, "logits/rejected": 76.1199951171875, "logps/chosen": -300.38592529296875, "logps/rejected": -740.673583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.567030906677246, "rewards/margins": 24.559507369995117, "rewards/rejected": -30.126537322998047, "step": 700 }, { "epoch": 0.43608087091757386, "grad_norm": 0.6013535857200623, "learning_rate": 1.661111111111111e-06, "logits/chosen": -180.16830444335938, "logits/rejected": 39.80121612548828, "logps/chosen": -330.1705322265625, "logps/rejected": -733.29052734375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -9.535659790039062, "rewards/margins": 18.36993980407715, "rewards/rejected": -27.905601501464844, "step": 701 }, { "epoch": 0.43670295489891137, "grad_norm": 0.32515305280685425, "learning_rate": 1.6555555555555559e-06, "logits/chosen": -89.5015869140625, "logits/rejected": 58.43971252441406, "logps/chosen": -450.43743896484375, "logps/rejected": -1560.7330322265625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.511563777923584, "rewards/margins": 21.634267807006836, "rewards/rejected": -29.14583396911621, "step": 702 }, { "epoch": 0.4373250388802488, "grad_norm": 0.00518906069919467, "learning_rate": 1.6500000000000003e-06, "logits/chosen": -109.431640625, "logits/rejected": 17.594715118408203, "logps/chosen": -324.4794921875, "logps/rejected": -666.993408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.536863088607788, "rewards/margins": 25.65342140197754, "rewards/rejected": -28.190284729003906, "step": 703 }, { "epoch": 0.4379471228615863, "grad_norm": 0.002745087491348386, "learning_rate": 1.6444444444444447e-06, "logits/chosen": -90.78618621826172, "logits/rejected": -57.6749382019043, "logps/chosen": -922.083984375, "logps/rejected": -1474.892333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.238462448120117, "rewards/margins": 21.818592071533203, "rewards/rejected": -30.05705451965332, "step": 704 }, { "epoch": 0.4385692068429238, "grad_norm": 0.7870026230812073, "learning_rate": 1.638888888888889e-06, "logits/chosen": -83.08509826660156, "logits/rejected": 69.55027770996094, "logps/chosen": -407.1316223144531, "logps/rejected": -760.4619750976562, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -9.063232421875, "rewards/margins": 23.576244354248047, "rewards/rejected": -32.63947677612305, "step": 705 }, { "epoch": 0.4391912908242613, "grad_norm": 2.4516847133636475, "learning_rate": 1.6333333333333335e-06, "logits/chosen": -46.775718688964844, "logits/rejected": 11.706619262695312, "logps/chosen": -411.1054382324219, "logps/rejected": -555.9168090820312, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -6.680307388305664, "rewards/margins": 15.830259323120117, "rewards/rejected": -22.51056671142578, "step": 706 }, { "epoch": 0.43981337480559873, "grad_norm": 0.00012583508214447647, "learning_rate": 1.627777777777778e-06, "logits/chosen": -89.5656509399414, "logits/rejected": 43.06319808959961, "logps/chosen": -1230.562744140625, "logps/rejected": -1636.2696533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.338455200195312, "rewards/margins": 23.808509826660156, "rewards/rejected": -40.14696502685547, "step": 707 }, { "epoch": 0.44043545878693624, "grad_norm": 0.0003049974038731307, "learning_rate": 1.6222222222222223e-06, "logits/chosen": -152.36488342285156, "logits/rejected": -22.832372665405273, "logps/chosen": -256.43695068359375, "logps/rejected": -741.229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.301939010620117, "rewards/margins": 25.906457901000977, "rewards/rejected": -32.208396911621094, "step": 708 }, { "epoch": 0.4410575427682737, "grad_norm": 0.08373872935771942, "learning_rate": 1.6166666666666667e-06, "logits/chosen": -217.28805541992188, "logits/rejected": -23.98189926147461, "logps/chosen": -557.675537109375, "logps/rejected": -1621.658935546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.968478202819824, "rewards/margins": 32.71379089355469, "rewards/rejected": -40.682273864746094, "step": 709 }, { "epoch": 0.4416796267496112, "grad_norm": 0.022391460835933685, "learning_rate": 1.6111111111111113e-06, "logits/chosen": -54.736759185791016, "logits/rejected": -45.23244094848633, "logps/chosen": -646.8934936523438, "logps/rejected": -778.4488525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.978015422821045, "rewards/margins": 24.43364143371582, "rewards/rejected": -29.411657333374023, "step": 710 }, { "epoch": 0.4423017107309487, "grad_norm": 0.0011658166768029332, "learning_rate": 1.6055555555555557e-06, "logits/chosen": -82.49189758300781, "logits/rejected": 23.794401168823242, "logps/chosen": -471.4813537597656, "logps/rejected": -814.3196411132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.69105339050293, "rewards/margins": 24.478273391723633, "rewards/rejected": -33.16932678222656, "step": 711 }, { "epoch": 0.44292379471228616, "grad_norm": 0.00012946366041433066, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -48.673927307128906, "logits/rejected": 47.28028869628906, "logps/chosen": -599.9083862304688, "logps/rejected": -986.0660400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.328247547149658, "rewards/margins": 24.408180236816406, "rewards/rejected": -31.736427307128906, "step": 712 }, { "epoch": 0.44354587869362366, "grad_norm": 5.540538040804677e-05, "learning_rate": 1.5944444444444445e-06, "logits/chosen": -232.19285583496094, "logits/rejected": -55.66661071777344, "logps/chosen": -302.42041015625, "logps/rejected": -783.9266357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.718314170837402, "rewards/margins": 29.975704193115234, "rewards/rejected": -34.69401931762695, "step": 713 }, { "epoch": 0.4441679626749611, "grad_norm": 0.1854691356420517, "learning_rate": 1.588888888888889e-06, "logits/chosen": -175.55657958984375, "logits/rejected": -23.822433471679688, "logps/chosen": -276.94049072265625, "logps/rejected": -640.519287109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.260311126708984, "rewards/margins": 21.577255249023438, "rewards/rejected": -27.837566375732422, "step": 714 }, { "epoch": 0.4447900466562986, "grad_norm": 0.0026221410371363163, "learning_rate": 1.5833333333333333e-06, "logits/chosen": -148.6116943359375, "logits/rejected": -48.037445068359375, "logps/chosen": -425.61553955078125, "logps/rejected": -697.444091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.479765892028809, "rewards/margins": 20.465469360351562, "rewards/rejected": -28.945234298706055, "step": 715 }, { "epoch": 0.4454121306376361, "grad_norm": 0.0129752391949296, "learning_rate": 1.5777777777777778e-06, "logits/chosen": -83.60792541503906, "logits/rejected": 36.99983596801758, "logps/chosen": -476.87060546875, "logps/rejected": -726.6820068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.37528133392334, "rewards/margins": 18.513668060302734, "rewards/rejected": -27.888948440551758, "step": 716 }, { "epoch": 0.4460342146189736, "grad_norm": 1.8560853277449496e-05, "learning_rate": 1.5722222222222226e-06, "logits/chosen": -144.66384887695312, "logits/rejected": -17.073589324951172, "logps/chosen": -326.673583984375, "logps/rejected": -747.9263916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.300039291381836, "rewards/margins": 27.49146270751953, "rewards/rejected": -32.79150390625, "step": 717 }, { "epoch": 0.44665629860031103, "grad_norm": 425.8340148925781, "learning_rate": 1.566666666666667e-06, "logits/chosen": -91.33794403076172, "logits/rejected": 82.35475158691406, "logps/chosen": -985.9503173828125, "logps/rejected": -1376.75341796875, "loss": 2.705, "rewards/accuracies": 0.875, "rewards/chosen": -8.864580154418945, "rewards/margins": 11.041722297668457, "rewards/rejected": -19.90630340576172, "step": 718 }, { "epoch": 0.44727838258164854, "grad_norm": 0.0247510839253664, "learning_rate": 1.5611111111111114e-06, "logits/chosen": -153.80862426757812, "logits/rejected": 38.85824203491211, "logps/chosen": -596.953857421875, "logps/rejected": -1564.6199951171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.512521266937256, "rewards/margins": 28.546722412109375, "rewards/rejected": -35.059242248535156, "step": 719 }, { "epoch": 0.447900466562986, "grad_norm": 3.2243831157684326, "learning_rate": 1.5555555555555558e-06, "logits/chosen": -77.09352111816406, "logits/rejected": 31.358675003051758, "logps/chosen": -1020.8194580078125, "logps/rejected": -1578.967529296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.632713317871094, "rewards/margins": 20.82750129699707, "rewards/rejected": -28.46021270751953, "step": 720 }, { "epoch": 0.4485225505443235, "grad_norm": 1625.1943359375, "learning_rate": 1.5500000000000002e-06, "logits/chosen": -56.491458892822266, "logits/rejected": -45.39542770385742, "logps/chosen": -1749.9366455078125, "logps/rejected": -2600.73095703125, "loss": 2.7209, "rewards/accuracies": 0.75, "rewards/chosen": -6.2524871826171875, "rewards/margins": 14.677700996398926, "rewards/rejected": -20.93018913269043, "step": 721 }, { "epoch": 0.44914463452566095, "grad_norm": 537.3250122070312, "learning_rate": 1.5444444444444446e-06, "logits/chosen": -63.97138977050781, "logits/rejected": -65.99876403808594, "logps/chosen": -1162.6993408203125, "logps/rejected": -1281.4927978515625, "loss": 1.5185, "rewards/accuracies": 0.875, "rewards/chosen": -12.767265319824219, "rewards/margins": 16.384214401245117, "rewards/rejected": -29.15148162841797, "step": 722 }, { "epoch": 0.44976671850699845, "grad_norm": 0.6008815765380859, "learning_rate": 1.538888888888889e-06, "logits/chosen": 17.265945434570312, "logits/rejected": 111.18235778808594, "logps/chosen": -918.56689453125, "logps/rejected": -1524.6748046875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -7.4128570556640625, "rewards/margins": 20.329118728637695, "rewards/rejected": -27.74197769165039, "step": 723 }, { "epoch": 0.4503888024883359, "grad_norm": 60.88777542114258, "learning_rate": 1.5333333333333334e-06, "logits/chosen": -23.412349700927734, "logits/rejected": 61.14636993408203, "logps/chosen": -574.0431518554688, "logps/rejected": -896.8446655273438, "loss": 0.8084, "rewards/accuracies": 0.875, "rewards/chosen": -7.080666542053223, "rewards/margins": 24.18720054626465, "rewards/rejected": -31.267868041992188, "step": 724 }, { "epoch": 0.4510108864696734, "grad_norm": 2.4099934101104736, "learning_rate": 1.527777777777778e-06, "logits/chosen": -56.02541732788086, "logits/rejected": 25.27204132080078, "logps/chosen": -717.0059814453125, "logps/rejected": -957.6932983398438, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -7.515704154968262, "rewards/margins": 26.55235481262207, "rewards/rejected": -34.06806182861328, "step": 725 }, { "epoch": 0.45163297045101086, "grad_norm": 0.0029469302389770746, "learning_rate": 1.5222222222222224e-06, "logits/chosen": -187.469970703125, "logits/rejected": -1.767843246459961, "logps/chosen": -366.51458740234375, "logps/rejected": -734.5069580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.107301712036133, "rewards/margins": 23.879457473754883, "rewards/rejected": -31.98676109313965, "step": 726 }, { "epoch": 0.45225505443234837, "grad_norm": 439.93585205078125, "learning_rate": 1.5166666666666668e-06, "logits/chosen": 13.853029251098633, "logits/rejected": 69.74406433105469, "logps/chosen": -2082.96240234375, "logps/rejected": -2330.923583984375, "loss": 1.4754, "rewards/accuracies": 0.875, "rewards/chosen": -11.89030647277832, "rewards/margins": 19.74787139892578, "rewards/rejected": -31.6381778717041, "step": 727 }, { "epoch": 0.4528771384136858, "grad_norm": 21.285776138305664, "learning_rate": 1.5111111111111112e-06, "logits/chosen": -100.06267547607422, "logits/rejected": 29.468894958496094, "logps/chosen": -1290.3515625, "logps/rejected": -2020.12744140625, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": -15.826516151428223, "rewards/margins": 28.93760108947754, "rewards/rejected": -44.76411437988281, "step": 728 }, { "epoch": 0.4534992223950233, "grad_norm": 0.38643619418144226, "learning_rate": 1.5055555555555556e-06, "logits/chosen": -80.3835678100586, "logits/rejected": -33.72454833984375, "logps/chosen": -498.37811279296875, "logps/rejected": -728.838623046875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -6.160611152648926, "rewards/margins": 17.963254928588867, "rewards/rejected": -24.123865127563477, "step": 729 }, { "epoch": 0.45412130637636083, "grad_norm": 0.0001543891994515434, "learning_rate": 1.5e-06, "logits/chosen": -200.746826171875, "logits/rejected": 41.371612548828125, "logps/chosen": -414.5986022949219, "logps/rejected": -873.7646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.050432205200195, "rewards/margins": 26.36678695678711, "rewards/rejected": -34.41722106933594, "step": 730 }, { "epoch": 0.4547433903576983, "grad_norm": 0.12354336678981781, "learning_rate": 1.4944444444444444e-06, "logits/chosen": -173.765380859375, "logits/rejected": 71.8283462524414, "logps/chosen": -631.5, "logps/rejected": -1091.5753173828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.629332542419434, "rewards/margins": 23.01896858215332, "rewards/rejected": -30.64830207824707, "step": 731 }, { "epoch": 0.4553654743390358, "grad_norm": 9.605352533981204e-05, "learning_rate": 1.4888888888888888e-06, "logits/chosen": -174.76846313476562, "logits/rejected": 0.5329093933105469, "logps/chosen": -245.07998657226562, "logps/rejected": -757.9222412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.093783378601074, "rewards/margins": 27.14044189453125, "rewards/rejected": -33.234222412109375, "step": 732 }, { "epoch": 0.45598755832037324, "grad_norm": 561.3857421875, "learning_rate": 1.4833333333333337e-06, "logits/chosen": -86.11457824707031, "logits/rejected": -7.002838134765625, "logps/chosen": -820.6993408203125, "logps/rejected": -1385.594482421875, "loss": 0.2919, "rewards/accuracies": 0.875, "rewards/chosen": -8.811335563659668, "rewards/margins": 21.74298095703125, "rewards/rejected": -30.554317474365234, "step": 733 }, { "epoch": 0.45660964230171075, "grad_norm": 320.7572937011719, "learning_rate": 1.477777777777778e-06, "logits/chosen": -154.40225219726562, "logits/rejected": 42.933372497558594, "logps/chosen": -974.4981079101562, "logps/rejected": -1639.80322265625, "loss": 1.5032, "rewards/accuracies": 0.875, "rewards/chosen": -11.078603744506836, "rewards/margins": 20.87938690185547, "rewards/rejected": -31.957992553710938, "step": 734 }, { "epoch": 0.4572317262830482, "grad_norm": 0.003680742811411619, "learning_rate": 1.4722222222222225e-06, "logits/chosen": -99.3466567993164, "logits/rejected": -22.38395118713379, "logps/chosen": -244.99826049804688, "logps/rejected": -1113.9642333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.941980838775635, "rewards/margins": 25.03716278076172, "rewards/rejected": -29.979141235351562, "step": 735 }, { "epoch": 0.4578538102643857, "grad_norm": 4.534273147583008, "learning_rate": 1.4666666666666669e-06, "logits/chosen": -148.7191619873047, "logits/rejected": 0.7306156158447266, "logps/chosen": -380.2203063964844, "logps/rejected": -704.259033203125, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -7.613857746124268, "rewards/margins": 20.20241928100586, "rewards/rejected": -27.81627655029297, "step": 736 }, { "epoch": 0.45847589424572316, "grad_norm": 0.0020630231592804193, "learning_rate": 1.4611111111111113e-06, "logits/chosen": -75.61015319824219, "logits/rejected": 11.637721061706543, "logps/chosen": -417.73260498046875, "logps/rejected": -835.2186279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.667624473571777, "rewards/margins": 27.696279525756836, "rewards/rejected": -37.36390686035156, "step": 737 }, { "epoch": 0.45909797822706067, "grad_norm": 7.302381992340088, "learning_rate": 1.4555555555555557e-06, "logits/chosen": -247.57852172851562, "logits/rejected": -2.2930078506469727, "logps/chosen": -360.5993347167969, "logps/rejected": -1785.2296142578125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -7.260935306549072, "rewards/margins": 25.027515411376953, "rewards/rejected": -32.2884521484375, "step": 738 }, { "epoch": 0.4597200622083981, "grad_norm": 1.0254557132720947, "learning_rate": 1.45e-06, "logits/chosen": -163.68707275390625, "logits/rejected": -42.059242248535156, "logps/chosen": -378.3367919921875, "logps/rejected": -618.7022705078125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -6.056993007659912, "rewards/margins": 20.296926498413086, "rewards/rejected": -26.353919982910156, "step": 739 }, { "epoch": 0.4603421461897356, "grad_norm": 3.5925612449645996, "learning_rate": 1.4444444444444445e-06, "logits/chosen": -21.96143341064453, "logits/rejected": 18.220735549926758, "logps/chosen": -477.6463928222656, "logps/rejected": -681.0731811523438, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -9.698896408081055, "rewards/margins": 18.886022567749023, "rewards/rejected": -28.584922790527344, "step": 740 }, { "epoch": 0.4609642301710731, "grad_norm": 5.92808078181406e-07, "learning_rate": 1.4388888888888891e-06, "logits/chosen": -243.30874633789062, "logits/rejected": 49.264305114746094, "logps/chosen": -255.92181396484375, "logps/rejected": -917.1226806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.625999450683594, "rewards/margins": 34.23487091064453, "rewards/rejected": -39.860870361328125, "step": 741 }, { "epoch": 0.4615863141524106, "grad_norm": 0.025632508099079132, "learning_rate": 1.4333333333333335e-06, "logits/chosen": -84.78192901611328, "logits/rejected": 36.935935974121094, "logps/chosen": -384.52703857421875, "logps/rejected": -785.9841918945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.376428604125977, "rewards/margins": 23.34320831298828, "rewards/rejected": -28.719635009765625, "step": 742 }, { "epoch": 0.46220839813374803, "grad_norm": 1.129104733467102, "learning_rate": 1.427777777777778e-06, "logits/chosen": -111.55496978759766, "logits/rejected": 19.67584800720215, "logps/chosen": -424.76422119140625, "logps/rejected": -721.418701171875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -6.438197135925293, "rewards/margins": 20.46205711364746, "rewards/rejected": -26.900253295898438, "step": 743 }, { "epoch": 0.46283048211508554, "grad_norm": 0.08946477621793747, "learning_rate": 1.4222222222222223e-06, "logits/chosen": -97.6273193359375, "logits/rejected": 56.71605682373047, "logps/chosen": -394.1695861816406, "logps/rejected": -775.4356689453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.113058090209961, "rewards/margins": 28.39914894104004, "rewards/rejected": -35.51220703125, "step": 744 }, { "epoch": 0.463452566096423, "grad_norm": 40.99982452392578, "learning_rate": 1.4166666666666667e-06, "logits/chosen": -135.53250122070312, "logits/rejected": 32.255592346191406, "logps/chosen": -320.7776794433594, "logps/rejected": -918.77294921875, "loss": 0.2762, "rewards/accuracies": 0.875, "rewards/chosen": -6.600131034851074, "rewards/margins": 28.73556137084961, "rewards/rejected": -35.335693359375, "step": 745 }, { "epoch": 0.4640746500777605, "grad_norm": 0.46382442116737366, "learning_rate": 1.4111111111111111e-06, "logits/chosen": -227.12213134765625, "logits/rejected": -33.051700592041016, "logps/chosen": -571.0344848632812, "logps/rejected": -1574.5330810546875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -9.682711601257324, "rewards/margins": 24.866924285888672, "rewards/rejected": -34.54963684082031, "step": 746 }, { "epoch": 0.464696734059098, "grad_norm": 0.00010395878052804619, "learning_rate": 1.4055555555555555e-06, "logits/chosen": -94.17485809326172, "logits/rejected": -16.49654197692871, "logps/chosen": -270.4022521972656, "logps/rejected": -636.9591674804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.409465789794922, "rewards/margins": 24.12567901611328, "rewards/rejected": -31.535144805908203, "step": 747 }, { "epoch": 0.46531881804043546, "grad_norm": 6.685013795504346e-05, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -136.83506774902344, "logits/rejected": -50.00811004638672, "logps/chosen": -504.32598876953125, "logps/rejected": -1696.194580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.5828094482421875, "rewards/margins": 28.545276641845703, "rewards/rejected": -35.12808609008789, "step": 748 }, { "epoch": 0.46594090202177296, "grad_norm": 1.4022375345230103, "learning_rate": 1.3944444444444446e-06, "logits/chosen": -65.96943664550781, "logits/rejected": 25.33393669128418, "logps/chosen": -523.046875, "logps/rejected": -824.787841796875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -12.712835311889648, "rewards/margins": 23.786640167236328, "rewards/rejected": -36.499473571777344, "step": 749 }, { "epoch": 0.4665629860031104, "grad_norm": 68.12742614746094, "learning_rate": 1.3888888888888892e-06, "logits/chosen": -46.880035400390625, "logits/rejected": 61.094573974609375, "logps/chosen": -613.8743896484375, "logps/rejected": -813.2623291015625, "loss": 0.4711, "rewards/accuracies": 0.875, "rewards/chosen": -10.169315338134766, "rewards/margins": 16.221559524536133, "rewards/rejected": -26.39087677001953, "step": 750 }, { "epoch": 0.4671850699844479, "grad_norm": 0.06876806169748306, "learning_rate": 1.3833333333333336e-06, "logits/chosen": -4.13031005859375, "logits/rejected": -0.9877738952636719, "logps/chosen": -497.4647521972656, "logps/rejected": -1297.579345703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.237387657165527, "rewards/margins": 29.56896209716797, "rewards/rejected": -37.80635070800781, "step": 751 }, { "epoch": 0.46780715396578537, "grad_norm": 424.5680236816406, "learning_rate": 1.377777777777778e-06, "logits/chosen": -29.843873977661133, "logits/rejected": 16.447145462036133, "logps/chosen": -1066.9786376953125, "logps/rejected": -1151.446533203125, "loss": 0.7259, "rewards/accuracies": 0.875, "rewards/chosen": -13.052844047546387, "rewards/margins": 18.665910720825195, "rewards/rejected": -31.718753814697266, "step": 752 }, { "epoch": 0.4684292379471229, "grad_norm": 838.212890625, "learning_rate": 1.3722222222222224e-06, "logits/chosen": -133.55035400390625, "logits/rejected": 26.87261199951172, "logps/chosen": -1351.7154541015625, "logps/rejected": -2557.692138671875, "loss": 6.455, "rewards/accuracies": 0.75, "rewards/chosen": -5.447998046875, "rewards/margins": 16.497989654541016, "rewards/rejected": -21.945987701416016, "step": 753 }, { "epoch": 0.46905132192846033, "grad_norm": 8.603736877441406, "learning_rate": 1.3666666666666668e-06, "logits/chosen": -79.95362854003906, "logits/rejected": -2.2068262100219727, "logps/chosen": -409.2478942871094, "logps/rejected": -685.8031616210938, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -5.802241802215576, "rewards/margins": 22.848920822143555, "rewards/rejected": -28.651161193847656, "step": 754 }, { "epoch": 0.46967340590979784, "grad_norm": 0.04553980007767677, "learning_rate": 1.3611111111111112e-06, "logits/chosen": -121.27899169921875, "logits/rejected": -33.0942497253418, "logps/chosen": -621.1445922851562, "logps/rejected": -1797.6920166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.575942993164062, "rewards/margins": 29.115339279174805, "rewards/rejected": -41.6912841796875, "step": 755 }, { "epoch": 0.4702954898911353, "grad_norm": 1450.013671875, "learning_rate": 1.3555555555555558e-06, "logits/chosen": -129.9134979248047, "logits/rejected": -16.832881927490234, "logps/chosen": -1316.44482421875, "logps/rejected": -2375.8232421875, "loss": 1.5964, "rewards/accuracies": 0.625, "rewards/chosen": -14.721901893615723, "rewards/margins": 25.352672576904297, "rewards/rejected": -40.0745735168457, "step": 756 }, { "epoch": 0.4709175738724728, "grad_norm": 32.29878234863281, "learning_rate": 1.3500000000000002e-06, "logits/chosen": -92.97866821289062, "logits/rejected": -41.528175354003906, "logps/chosen": -1106.26123046875, "logps/rejected": -1738.5997314453125, "loss": 0.1306, "rewards/accuracies": 0.875, "rewards/chosen": -16.135082244873047, "rewards/margins": 23.714895248413086, "rewards/rejected": -39.8499755859375, "step": 757 }, { "epoch": 0.47153965785381025, "grad_norm": 0.32500284910202026, "learning_rate": 1.3444444444444446e-06, "logits/chosen": -86.3247299194336, "logits/rejected": -13.790433883666992, "logps/chosen": -1255.379150390625, "logps/rejected": -1665.0185546875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -16.367998123168945, "rewards/margins": 24.38388442993164, "rewards/rejected": -40.75188064575195, "step": 758 }, { "epoch": 0.47216174183514775, "grad_norm": 110.28009796142578, "learning_rate": 1.338888888888889e-06, "logits/chosen": -218.4553680419922, "logits/rejected": -30.096960067749023, "logps/chosen": -697.8259887695312, "logps/rejected": -1183.4180908203125, "loss": 0.8171, "rewards/accuracies": 0.875, "rewards/chosen": -13.15618896484375, "rewards/margins": 16.802265167236328, "rewards/rejected": -29.958454132080078, "step": 759 }, { "epoch": 0.4727838258164852, "grad_norm": 6.281070709228516, "learning_rate": 1.3333333333333334e-06, "logits/chosen": -57.84153747558594, "logits/rejected": 4.39607048034668, "logps/chosen": -520.5064697265625, "logps/rejected": -748.1658935546875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -9.815181732177734, "rewards/margins": 19.05870246887207, "rewards/rejected": -28.873882293701172, "step": 760 }, { "epoch": 0.4734059097978227, "grad_norm": 103.11515045166016, "learning_rate": 1.3277777777777778e-06, "logits/chosen": -83.13993835449219, "logits/rejected": -26.825349807739258, "logps/chosen": -587.3551025390625, "logps/rejected": -732.6553344726562, "loss": 1.8947, "rewards/accuracies": 0.75, "rewards/chosen": -8.774739265441895, "rewards/margins": 15.77462387084961, "rewards/rejected": -24.54936408996582, "step": 761 }, { "epoch": 0.47402799377916016, "grad_norm": 50.580352783203125, "learning_rate": 1.3222222222222222e-06, "logits/chosen": -26.291950225830078, "logits/rejected": 45.624488830566406, "logps/chosen": -378.6702880859375, "logps/rejected": -603.4419555664062, "loss": 0.5575, "rewards/accuracies": 0.875, "rewards/chosen": -9.212278366088867, "rewards/margins": 14.647682189941406, "rewards/rejected": -23.859960556030273, "step": 762 }, { "epoch": 0.47465007776049767, "grad_norm": 8.859475997269328e-07, "learning_rate": 1.3166666666666666e-06, "logits/chosen": -75.36311340332031, "logits/rejected": 53.971961975097656, "logps/chosen": -462.31756591796875, "logps/rejected": -878.4935302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.667603015899658, "rewards/margins": 28.9129638671875, "rewards/rejected": -36.58056640625, "step": 763 }, { "epoch": 0.4752721617418352, "grad_norm": 4.507965564727783, "learning_rate": 1.3111111111111112e-06, "logits/chosen": -85.80006408691406, "logits/rejected": -85.2442855834961, "logps/chosen": -599.5420532226562, "logps/rejected": -820.9864501953125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -7.97761344909668, "rewards/margins": 21.923080444335938, "rewards/rejected": -29.90069580078125, "step": 764 }, { "epoch": 0.4758942457231726, "grad_norm": 21.018447875976562, "learning_rate": 1.3055555555555556e-06, "logits/chosen": -95.10114288330078, "logits/rejected": -29.717361450195312, "logps/chosen": -704.1224975585938, "logps/rejected": -1353.95263671875, "loss": 0.3271, "rewards/accuracies": 0.875, "rewards/chosen": -13.786711692810059, "rewards/margins": 19.059911727905273, "rewards/rejected": -32.846622467041016, "step": 765 }, { "epoch": 0.47651632970451013, "grad_norm": 4.399100303649902, "learning_rate": 1.3e-06, "logits/chosen": -79.9168701171875, "logits/rejected": 45.26768112182617, "logps/chosen": -653.638916015625, "logps/rejected": -931.0980224609375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.225135326385498, "rewards/margins": 21.91374969482422, "rewards/rejected": -27.138885498046875, "step": 766 }, { "epoch": 0.4771384136858476, "grad_norm": 9.789610862731934, "learning_rate": 1.2944444444444447e-06, "logits/chosen": -19.181392669677734, "logits/rejected": 9.625414848327637, "logps/chosen": -515.9251708984375, "logps/rejected": -650.49853515625, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -5.497054100036621, "rewards/margins": 17.973054885864258, "rewards/rejected": -23.470108032226562, "step": 767 }, { "epoch": 0.4777604976671851, "grad_norm": 0.0003901986638084054, "learning_rate": 1.288888888888889e-06, "logits/chosen": -17.775306701660156, "logits/rejected": 62.65926742553711, "logps/chosen": -562.2040405273438, "logps/rejected": -1835.795166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.555234909057617, "rewards/margins": 29.840126037597656, "rewards/rejected": -34.39535903930664, "step": 768 }, { "epoch": 0.47838258164852254, "grad_norm": 0.13331423699855804, "learning_rate": 1.2833333333333335e-06, "logits/chosen": -84.9162368774414, "logits/rejected": -89.1533203125, "logps/chosen": -389.9031982421875, "logps/rejected": -566.2091064453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.103680610656738, "rewards/margins": 21.367023468017578, "rewards/rejected": -27.470703125, "step": 769 }, { "epoch": 0.47900466562986005, "grad_norm": 76.48078918457031, "learning_rate": 1.2777777777777779e-06, "logits/chosen": -132.94378662109375, "logits/rejected": -71.70551300048828, "logps/chosen": -682.2783203125, "logps/rejected": -698.073974609375, "loss": 0.64, "rewards/accuracies": 0.875, "rewards/chosen": -8.276190757751465, "rewards/margins": 14.120307922363281, "rewards/rejected": -22.39649772644043, "step": 770 }, { "epoch": 0.4796267496111975, "grad_norm": 0.03500295802950859, "learning_rate": 1.2722222222222223e-06, "logits/chosen": -154.8763427734375, "logits/rejected": -20.12619400024414, "logps/chosen": -495.89312744140625, "logps/rejected": -834.1507568359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.446554660797119, "rewards/margins": 23.833330154418945, "rewards/rejected": -31.279882431030273, "step": 771 }, { "epoch": 0.480248833592535, "grad_norm": 1.477426528930664, "learning_rate": 1.2666666666666669e-06, "logits/chosen": -63.224124908447266, "logits/rejected": 76.65580749511719, "logps/chosen": -685.2578125, "logps/rejected": -1371.101806640625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -11.197524070739746, "rewards/margins": 27.853137969970703, "rewards/rejected": -39.05066680908203, "step": 772 }, { "epoch": 0.48087091757387246, "grad_norm": 0.0004227452154736966, "learning_rate": 1.2611111111111113e-06, "logits/chosen": -79.38337707519531, "logits/rejected": 52.92335510253906, "logps/chosen": -504.8081970214844, "logps/rejected": -759.9849243164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.118171691894531, "rewards/margins": 21.612485885620117, "rewards/rejected": -27.73065757751465, "step": 773 }, { "epoch": 0.48149300155520997, "grad_norm": 0.003543645842000842, "learning_rate": 1.2555555555555557e-06, "logits/chosen": -33.06328582763672, "logits/rejected": 14.608135223388672, "logps/chosen": -550.0364990234375, "logps/rejected": -893.1881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.755828380584717, "rewards/margins": 28.648494720458984, "rewards/rejected": -33.404327392578125, "step": 774 }, { "epoch": 0.4821150855365474, "grad_norm": 0.03421971946954727, "learning_rate": 1.25e-06, "logits/chosen": -134.6974334716797, "logits/rejected": 23.91486358642578, "logps/chosen": -783.36962890625, "logps/rejected": -1381.6165771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.948232650756836, "rewards/margins": 20.673641204833984, "rewards/rejected": -29.621871948242188, "step": 775 }, { "epoch": 0.4827371695178849, "grad_norm": 1290.2222900390625, "learning_rate": 1.2444444444444445e-06, "logits/chosen": -85.71219635009766, "logits/rejected": -43.05216979980469, "logps/chosen": -1520.309326171875, "logps/rejected": -1170.6256103515625, "loss": 3.4457, "rewards/accuracies": 0.875, "rewards/chosen": -14.69404411315918, "rewards/margins": 18.86469268798828, "rewards/rejected": -33.558738708496094, "step": 776 }, { "epoch": 0.4833592534992224, "grad_norm": 60.80632019042969, "learning_rate": 1.2388888888888891e-06, "logits/chosen": -135.27468872070312, "logits/rejected": 64.14369201660156, "logps/chosen": -502.1170349121094, "logps/rejected": -911.0506591796875, "loss": 0.2513, "rewards/accuracies": 0.875, "rewards/chosen": -6.582192420959473, "rewards/margins": 22.399246215820312, "rewards/rejected": -28.9814395904541, "step": 777 }, { "epoch": 0.4839813374805599, "grad_norm": 3.3139188289642334, "learning_rate": 1.2333333333333335e-06, "logits/chosen": -133.04454040527344, "logits/rejected": -98.82015228271484, "logps/chosen": -1223.7120361328125, "logps/rejected": -1419.457275390625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -12.170188903808594, "rewards/margins": 22.006811141967773, "rewards/rejected": -34.177001953125, "step": 778 }, { "epoch": 0.48460342146189733, "grad_norm": 1.7188092726883042e-08, "learning_rate": 1.227777777777778e-06, "logits/chosen": -169.68699645996094, "logits/rejected": 28.896669387817383, "logps/chosen": -416.91839599609375, "logps/rejected": -933.982666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.363171577453613, "rewards/margins": 33.29971694946289, "rewards/rejected": -41.66288757324219, "step": 779 }, { "epoch": 0.48522550544323484, "grad_norm": 0.019180424511432648, "learning_rate": 1.2222222222222223e-06, "logits/chosen": -112.29360961914062, "logits/rejected": 43.95820617675781, "logps/chosen": -306.61151123046875, "logps/rejected": -752.361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.311319828033447, "rewards/margins": 25.710323333740234, "rewards/rejected": -32.021644592285156, "step": 780 }, { "epoch": 0.4858475894245723, "grad_norm": 7.801623344421387, "learning_rate": 1.2166666666666667e-06, "logits/chosen": -167.13601684570312, "logits/rejected": -100.21235656738281, "logps/chosen": -414.0590515136719, "logps/rejected": -718.2651977539062, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -5.062987327575684, "rewards/margins": 22.653976440429688, "rewards/rejected": -27.716962814331055, "step": 781 }, { "epoch": 0.4864696734059098, "grad_norm": 0.00885347742587328, "learning_rate": 1.2111111111111111e-06, "logits/chosen": -67.37416076660156, "logits/rejected": -43.40993881225586, "logps/chosen": -635.25341796875, "logps/rejected": -768.30810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.608406066894531, "rewards/margins": 20.203596115112305, "rewards/rejected": -32.8120002746582, "step": 782 }, { "epoch": 0.4870917573872473, "grad_norm": 0.001759762060828507, "learning_rate": 1.2055555555555555e-06, "logits/chosen": -117.10198211669922, "logits/rejected": 5.605515003204346, "logps/chosen": -443.94775390625, "logps/rejected": -828.2411499023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.242298126220703, "rewards/margins": 25.7496395111084, "rewards/rejected": -33.991939544677734, "step": 783 }, { "epoch": 0.48771384136858476, "grad_norm": 2.2811286449432373, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -25.989660263061523, "logits/rejected": -1.2958402633666992, "logps/chosen": -512.8759765625, "logps/rejected": -670.24560546875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -6.108600616455078, "rewards/margins": 15.675848007202148, "rewards/rejected": -21.784448623657227, "step": 784 }, { "epoch": 0.48833592534992226, "grad_norm": 31.097185134887695, "learning_rate": 1.1944444444444446e-06, "logits/chosen": -124.17618560791016, "logits/rejected": 36.79144287109375, "logps/chosen": -364.1361083984375, "logps/rejected": -703.2439575195312, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": -6.542037010192871, "rewards/margins": 18.263164520263672, "rewards/rejected": -24.805198669433594, "step": 785 }, { "epoch": 0.4889580093312597, "grad_norm": 0.1221480593085289, "learning_rate": 1.188888888888889e-06, "logits/chosen": -86.44905090332031, "logits/rejected": 17.79490852355957, "logps/chosen": -604.401611328125, "logps/rejected": -959.8953247070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.946455955505371, "rewards/margins": 20.450824737548828, "rewards/rejected": -29.397281646728516, "step": 786 }, { "epoch": 0.4895800933125972, "grad_norm": 23.349821090698242, "learning_rate": 1.1833333333333334e-06, "logits/chosen": -128.18051147460938, "logits/rejected": 74.09746551513672, "logps/chosen": -306.3685302734375, "logps/rejected": -703.7098388671875, "loss": 0.0978, "rewards/accuracies": 0.875, "rewards/chosen": -6.383405685424805, "rewards/margins": 21.267290115356445, "rewards/rejected": -27.650693893432617, "step": 787 }, { "epoch": 0.49020217729393467, "grad_norm": 12.782303810119629, "learning_rate": 1.1777777777777778e-06, "logits/chosen": -6.366929531097412, "logits/rejected": -4.293282508850098, "logps/chosen": -629.8514404296875, "logps/rejected": -808.7570190429688, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": -6.965999603271484, "rewards/margins": 20.24810791015625, "rewards/rejected": -27.214109420776367, "step": 788 }, { "epoch": 0.4908242612752722, "grad_norm": 0.050425127148628235, "learning_rate": 1.1722222222222224e-06, "logits/chosen": -101.6484375, "logits/rejected": 58.47187042236328, "logps/chosen": -412.6858215332031, "logps/rejected": -811.67578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.240036010742188, "rewards/margins": 16.556400299072266, "rewards/rejected": -25.796436309814453, "step": 789 }, { "epoch": 0.49144634525660963, "grad_norm": 7.510043144226074, "learning_rate": 1.1666666666666668e-06, "logits/chosen": -85.51889038085938, "logits/rejected": 26.52862548828125, "logps/chosen": -400.0337829589844, "logps/rejected": -665.0091552734375, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -7.367213249206543, "rewards/margins": 14.574678421020508, "rewards/rejected": -21.941890716552734, "step": 790 }, { "epoch": 0.49206842923794714, "grad_norm": 0.00031603348907083273, "learning_rate": 1.1611111111111112e-06, "logits/chosen": -104.32347106933594, "logits/rejected": 87.80584716796875, "logps/chosen": -540.6323852539062, "logps/rejected": -1335.2225341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.491456985473633, "rewards/margins": 27.664722442626953, "rewards/rejected": -37.15618133544922, "step": 791 }, { "epoch": 0.4926905132192846, "grad_norm": 0.15867695212364197, "learning_rate": 1.1555555555555556e-06, "logits/chosen": -112.59764099121094, "logits/rejected": 65.92097473144531, "logps/chosen": -577.6868286132812, "logps/rejected": -1369.201171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.941761016845703, "rewards/margins": 25.734575271606445, "rewards/rejected": -34.676334381103516, "step": 792 }, { "epoch": 0.4933125972006221, "grad_norm": 0.005466988310217857, "learning_rate": 1.1500000000000002e-06, "logits/chosen": -188.63864135742188, "logits/rejected": -23.10389518737793, "logps/chosen": -447.15484619140625, "logps/rejected": -857.9431762695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.378690719604492, "rewards/margins": 27.409475326538086, "rewards/rejected": -34.788169860839844, "step": 793 }, { "epoch": 0.49393468118195955, "grad_norm": 1.4321177005767822, "learning_rate": 1.1444444444444446e-06, "logits/chosen": -58.74419021606445, "logits/rejected": -6.3274688720703125, "logps/chosen": -811.5924072265625, "logps/rejected": -1340.5816650390625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -10.812429428100586, "rewards/margins": 27.774620056152344, "rewards/rejected": -38.58705139160156, "step": 794 }, { "epoch": 0.49455676516329705, "grad_norm": 0.22366230189800262, "learning_rate": 1.138888888888889e-06, "logits/chosen": -78.07588195800781, "logits/rejected": 48.15463638305664, "logps/chosen": -606.1942138671875, "logps/rejected": -800.5628662109375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.144711971282959, "rewards/margins": 21.434534072875977, "rewards/rejected": -28.579248428344727, "step": 795 }, { "epoch": 0.4951788491446345, "grad_norm": 34.753318786621094, "learning_rate": 1.1333333333333334e-06, "logits/chosen": -202.78797912597656, "logits/rejected": -52.16399383544922, "logps/chosen": -1104.3931884765625, "logps/rejected": -2984.791748046875, "loss": 0.1265, "rewards/accuracies": 0.875, "rewards/chosen": -13.148391723632812, "rewards/margins": 24.268150329589844, "rewards/rejected": -37.416542053222656, "step": 796 }, { "epoch": 0.495800933125972, "grad_norm": 0.5012848973274231, "learning_rate": 1.1277777777777778e-06, "logits/chosen": -112.75540161132812, "logits/rejected": -2.0931739807128906, "logps/chosen": -470.1435852050781, "logps/rejected": -807.0820922851562, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.223106384277344, "rewards/margins": 22.956302642822266, "rewards/rejected": -31.17940902709961, "step": 797 }, { "epoch": 0.49642301710730946, "grad_norm": 0.001359703135676682, "learning_rate": 1.1222222222222222e-06, "logits/chosen": -192.61326599121094, "logits/rejected": -77.79004669189453, "logps/chosen": -712.4236450195312, "logps/rejected": -1396.9161376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.636786460876465, "rewards/margins": 23.335918426513672, "rewards/rejected": -31.972702026367188, "step": 798 }, { "epoch": 0.49704510108864697, "grad_norm": 0.0007623012643307447, "learning_rate": 1.1166666666666666e-06, "logits/chosen": -122.147216796875, "logits/rejected": 27.71405792236328, "logps/chosen": -423.3140869140625, "logps/rejected": -1568.11767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.942974090576172, "rewards/margins": 30.311073303222656, "rewards/rejected": -38.254051208496094, "step": 799 }, { "epoch": 0.4976671850699845, "grad_norm": 0.6951321959495544, "learning_rate": 1.111111111111111e-06, "logits/chosen": -19.4531307220459, "logits/rejected": 59.53974151611328, "logps/chosen": -720.5052490234375, "logps/rejected": -857.0462036132812, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.503777027130127, "rewards/margins": 17.832021713256836, "rewards/rejected": -25.335798263549805, "step": 800 }, { "epoch": 0.4982892690513219, "grad_norm": 14.06302547454834, "learning_rate": 1.1055555555555557e-06, "logits/chosen": 9.543481826782227, "logits/rejected": 26.585514068603516, "logps/chosen": -820.0592041015625, "logps/rejected": -981.0501708984375, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -8.32360553741455, "rewards/margins": 21.351694107055664, "rewards/rejected": -29.6752986907959, "step": 801 }, { "epoch": 0.49891135303265943, "grad_norm": 0.8960484862327576, "learning_rate": 1.1e-06, "logits/chosen": -73.817138671875, "logits/rejected": -22.463111877441406, "logps/chosen": -726.0693969726562, "logps/rejected": -812.912109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.350183486938477, "rewards/margins": 16.799339294433594, "rewards/rejected": -25.14952278137207, "step": 802 }, { "epoch": 0.4995334370139969, "grad_norm": 0.007091051433235407, "learning_rate": 1.0944444444444445e-06, "logits/chosen": -126.402099609375, "logits/rejected": 63.41026306152344, "logps/chosen": -354.9360656738281, "logps/rejected": -742.4476318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.198775291442871, "rewards/margins": 20.73122787475586, "rewards/rejected": -25.930002212524414, "step": 803 }, { "epoch": 0.5001555209953343, "grad_norm": 0.035982389003038406, "learning_rate": 1.0888888888888889e-06, "logits/chosen": -158.6231689453125, "logits/rejected": 42.4395751953125, "logps/chosen": -447.70953369140625, "logps/rejected": -931.365966796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.468390464782715, "rewards/margins": 23.573461532592773, "rewards/rejected": -33.04185485839844, "step": 804 }, { "epoch": 0.5007776049766719, "grad_norm": 54.38862609863281, "learning_rate": 1.0833333333333335e-06, "logits/chosen": -39.203147888183594, "logits/rejected": -42.8327751159668, "logps/chosen": -497.3260803222656, "logps/rejected": -579.7037963867188, "loss": 0.6785, "rewards/accuracies": 0.875, "rewards/chosen": -6.13219690322876, "rewards/margins": 15.116905212402344, "rewards/rejected": -21.249099731445312, "step": 805 }, { "epoch": 0.5013996889580093, "grad_norm": 0.46212929487228394, "learning_rate": 1.0777777777777779e-06, "logits/chosen": -122.7110824584961, "logits/rejected": -56.752845764160156, "logps/chosen": -562.4315185546875, "logps/rejected": -1572.6318359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -9.006181716918945, "rewards/margins": 25.918704986572266, "rewards/rejected": -34.92488479614258, "step": 806 }, { "epoch": 0.5020217729393468, "grad_norm": 0.15322664380073547, "learning_rate": 1.0722222222222223e-06, "logits/chosen": -2.593364715576172, "logits/rejected": 63.01699447631836, "logps/chosen": -618.697998046875, "logps/rejected": -775.5316162109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.904258728027344, "rewards/margins": 19.612674713134766, "rewards/rejected": -25.516935348510742, "step": 807 }, { "epoch": 0.5026438569206843, "grad_norm": 3.0954246520996094, "learning_rate": 1.066666666666667e-06, "logits/chosen": -58.32223129272461, "logits/rejected": -23.86005210876465, "logps/chosen": -1088.94970703125, "logps/rejected": -1719.3585205078125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -12.135305404663086, "rewards/margins": 20.116741180419922, "rewards/rejected": -32.25204849243164, "step": 808 }, { "epoch": 0.5032659409020218, "grad_norm": 5.97275447845459, "learning_rate": 1.0611111111111113e-06, "logits/chosen": -163.36102294921875, "logits/rejected": -6.6377668380737305, "logps/chosen": -320.9810485839844, "logps/rejected": -768.1694946289062, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -4.2564191818237305, "rewards/margins": 21.35448455810547, "rewards/rejected": -25.610902786254883, "step": 809 }, { "epoch": 0.5038880248833593, "grad_norm": 0.00038198623224161565, "learning_rate": 1.0555555555555557e-06, "logits/chosen": -20.06270408630371, "logits/rejected": 0.06752777099609375, "logps/chosen": -404.18621826171875, "logps/rejected": -561.1058349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.348189353942871, "rewards/margins": 19.517799377441406, "rewards/rejected": -26.865989685058594, "step": 810 }, { "epoch": 0.5045101088646967, "grad_norm": 0.8940173387527466, "learning_rate": 1.0500000000000001e-06, "logits/chosen": -194.52886962890625, "logits/rejected": 14.64038372039795, "logps/chosen": -1233.7379150390625, "logps/rejected": -2348.429443359375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -6.674137592315674, "rewards/margins": 26.68243980407715, "rewards/rejected": -33.35657501220703, "step": 811 }, { "epoch": 0.5051321928460342, "grad_norm": 6.097475124988705e-05, "learning_rate": 1.0444444444444445e-06, "logits/chosen": -182.1772918701172, "logits/rejected": 53.819923400878906, "logps/chosen": -745.404541015625, "logps/rejected": -1805.2493896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.672194480895996, "rewards/margins": 26.31649398803711, "rewards/rejected": -35.988685607910156, "step": 812 }, { "epoch": 0.5057542768273717, "grad_norm": 0.30417296290397644, "learning_rate": 1.038888888888889e-06, "logits/chosen": -144.7532958984375, "logits/rejected": 32.75605392456055, "logps/chosen": -1093.7080078125, "logps/rejected": -1695.1502685546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -12.306402206420898, "rewards/margins": 28.240373611450195, "rewards/rejected": -40.546775817871094, "step": 813 }, { "epoch": 0.5063763608087092, "grad_norm": 0.10419566184282303, "learning_rate": 1.0333333333333333e-06, "logits/chosen": -107.64749145507812, "logits/rejected": 4.129255294799805, "logps/chosen": -354.80487060546875, "logps/rejected": -627.330810546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.342581748962402, "rewards/margins": 22.532852172851562, "rewards/rejected": -28.87543487548828, "step": 814 }, { "epoch": 0.5069984447900466, "grad_norm": 0.5217145681381226, "learning_rate": 1.0277777777777777e-06, "logits/chosen": -122.59197235107422, "logits/rejected": 38.18445587158203, "logps/chosen": -1092.721923828125, "logps/rejected": -1855.8404541015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -13.067987442016602, "rewards/margins": 23.710399627685547, "rewards/rejected": -36.77838897705078, "step": 815 }, { "epoch": 0.5076205287713841, "grad_norm": 0.015259141102433205, "learning_rate": 1.0222222222222223e-06, "logits/chosen": -111.74264526367188, "logits/rejected": 18.99979019165039, "logps/chosen": -574.563232421875, "logps/rejected": -1126.4327392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.97230863571167, "rewards/margins": 29.848556518554688, "rewards/rejected": -36.820865631103516, "step": 816 }, { "epoch": 0.5082426127527216, "grad_norm": 42.426761627197266, "learning_rate": 1.0166666666666667e-06, "logits/chosen": -203.6204071044922, "logits/rejected": -19.469696044921875, "logps/chosen": -607.760009765625, "logps/rejected": -1923.6600341796875, "loss": 0.1534, "rewards/accuracies": 0.875, "rewards/chosen": -9.396963119506836, "rewards/margins": 23.123554229736328, "rewards/rejected": -32.52051544189453, "step": 817 }, { "epoch": 0.5088646967340591, "grad_norm": 26.991989135742188, "learning_rate": 1.0111111111111111e-06, "logits/chosen": -92.52302551269531, "logits/rejected": 9.58810806274414, "logps/chosen": -499.6994323730469, "logps/rejected": -820.214599609375, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -7.87128210067749, "rewards/margins": 22.458473205566406, "rewards/rejected": -30.329753875732422, "step": 818 }, { "epoch": 0.5094867807153965, "grad_norm": 0.426338791847229, "learning_rate": 1.0055555555555556e-06, "logits/chosen": -47.71516418457031, "logits/rejected": 39.30681610107422, "logps/chosen": -559.4285888671875, "logps/rejected": -799.8426513671875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.889528274536133, "rewards/margins": 24.206363677978516, "rewards/rejected": -33.09589385986328, "step": 819 }, { "epoch": 0.5101088646967341, "grad_norm": 8.074851393757854e-06, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -139.68301391601562, "logits/rejected": 36.518898010253906, "logps/chosen": -721.7512817382812, "logps/rejected": -1113.0599365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.689058303833008, "rewards/margins": 28.095703125, "rewards/rejected": -34.784759521484375, "step": 820 }, { "epoch": 0.5107309486780716, "grad_norm": 0.0676528587937355, "learning_rate": 9.944444444444446e-07, "logits/chosen": -58.00905227661133, "logits/rejected": 31.57887077331543, "logps/chosen": -487.1253662109375, "logps/rejected": -760.5587768554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.476115226745605, "rewards/margins": 19.65747833251953, "rewards/rejected": -30.133594512939453, "step": 821 }, { "epoch": 0.511353032659409, "grad_norm": 0.0006774406647309661, "learning_rate": 9.88888888888889e-07, "logits/chosen": -186.0316619873047, "logits/rejected": -30.9036865234375, "logps/chosen": -297.0524597167969, "logps/rejected": -718.5314331054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.740857124328613, "rewards/margins": 25.053611755371094, "rewards/rejected": -31.79446792602539, "step": 822 }, { "epoch": 0.5119751166407465, "grad_norm": 1.2454219131541322e-06, "learning_rate": 9.833333333333334e-07, "logits/chosen": -185.17919921875, "logits/rejected": -43.8787956237793, "logps/chosen": -480.292236328125, "logps/rejected": -789.608154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.746007442474365, "rewards/margins": 25.33332061767578, "rewards/rejected": -33.07933044433594, "step": 823 }, { "epoch": 0.512597200622084, "grad_norm": 0.0032215684186667204, "learning_rate": 9.77777777777778e-07, "logits/chosen": -12.684924125671387, "logits/rejected": 53.901981353759766, "logps/chosen": -628.0204467773438, "logps/rejected": -846.1293334960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.768143653869629, "rewards/margins": 20.50605010986328, "rewards/rejected": -27.274194717407227, "step": 824 }, { "epoch": 0.5132192846034215, "grad_norm": 44.66653823852539, "learning_rate": 9.722222222222224e-07, "logits/chosen": -54.31118392944336, "logits/rejected": -32.13478469848633, "logps/chosen": -722.8442993164062, "logps/rejected": -1421.018310546875, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": -4.981404781341553, "rewards/margins": 18.17508316040039, "rewards/rejected": -23.15648651123047, "step": 825 }, { "epoch": 0.5138413685847589, "grad_norm": 0.008648249320685863, "learning_rate": 9.666666666666668e-07, "logits/chosen": -83.87012481689453, "logits/rejected": 63.499755859375, "logps/chosen": -420.0633850097656, "logps/rejected": -833.63720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.89228630065918, "rewards/margins": 23.61211395263672, "rewards/rejected": -30.5044002532959, "step": 826 }, { "epoch": 0.5144634525660964, "grad_norm": 30.128643035888672, "learning_rate": 9.611111111111112e-07, "logits/chosen": -115.29667663574219, "logits/rejected": -14.685375213623047, "logps/chosen": -417.88677978515625, "logps/rejected": -683.2611694335938, "loss": 0.1903, "rewards/accuracies": 0.875, "rewards/chosen": -3.896798610687256, "rewards/margins": 21.491680145263672, "rewards/rejected": -25.38848114013672, "step": 827 }, { "epoch": 0.5150855365474339, "grad_norm": 5.245591455604881e-05, "learning_rate": 9.555555555555556e-07, "logits/chosen": -119.66450500488281, "logits/rejected": 44.030418395996094, "logps/chosen": -382.2896423339844, "logps/rejected": -767.4990844726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.766641616821289, "rewards/margins": 25.389842987060547, "rewards/rejected": -33.15648651123047, "step": 828 }, { "epoch": 0.5157076205287714, "grad_norm": 0.0001896356843644753, "learning_rate": 9.500000000000001e-07, "logits/chosen": -87.0384750366211, "logits/rejected": 23.808502197265625, "logps/chosen": -388.2333068847656, "logps/rejected": -647.1336669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.834542274475098, "rewards/margins": 20.444068908691406, "rewards/rejected": -30.278610229492188, "step": 829 }, { "epoch": 0.5163297045101088, "grad_norm": 5.341070175170898, "learning_rate": 9.444444444444445e-07, "logits/chosen": -146.3845977783203, "logits/rejected": 2.0931625366210938, "logps/chosen": -1073.31298828125, "logps/rejected": -2127.587158203125, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -10.206274032592773, "rewards/margins": 20.0421142578125, "rewards/rejected": -30.24838638305664, "step": 830 }, { "epoch": 0.5169517884914463, "grad_norm": 4.2202437100513635e-08, "learning_rate": 9.388888888888889e-07, "logits/chosen": -207.63766479492188, "logits/rejected": -60.26597213745117, "logps/chosen": -368.68975830078125, "logps/rejected": -1205.767333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.202880859375, "rewards/margins": 33.91260528564453, "rewards/rejected": -40.11548614501953, "step": 831 }, { "epoch": 0.5175738724727839, "grad_norm": 0.27999940514564514, "learning_rate": 9.333333333333334e-07, "logits/chosen": -50.21611022949219, "logits/rejected": 56.904510498046875, "logps/chosen": -603.9766845703125, "logps/rejected": -1339.534912109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.407368659973145, "rewards/margins": 25.60379409790039, "rewards/rejected": -34.011165618896484, "step": 832 }, { "epoch": 0.5181959564541213, "grad_norm": 0.002325457753613591, "learning_rate": 9.277777777777778e-07, "logits/chosen": -48.065696716308594, "logits/rejected": -40.092498779296875, "logps/chosen": -587.875, "logps/rejected": -795.5709228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.386090278625488, "rewards/margins": 24.233123779296875, "rewards/rejected": -31.619216918945312, "step": 833 }, { "epoch": 0.5188180404354588, "grad_norm": 0.039517052471637726, "learning_rate": 9.222222222222222e-07, "logits/chosen": -186.51185607910156, "logits/rejected": -23.94443702697754, "logps/chosen": -503.7452392578125, "logps/rejected": -2663.202392578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.813905239105225, "rewards/margins": 26.897171020507812, "rewards/rejected": -33.71107482910156, "step": 834 }, { "epoch": 0.5194401244167963, "grad_norm": 0.0031487091910094023, "learning_rate": 9.166666666666666e-07, "logits/chosen": -80.9673080444336, "logits/rejected": 43.110939025878906, "logps/chosen": -286.49609375, "logps/rejected": -555.93310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.663183212280273, "rewards/margins": 18.23088836669922, "rewards/rejected": -26.894073486328125, "step": 835 }, { "epoch": 0.5200622083981338, "grad_norm": 0.0442747138440609, "learning_rate": 9.111111111111113e-07, "logits/chosen": -59.27135467529297, "logits/rejected": -50.51622009277344, "logps/chosen": -582.3428955078125, "logps/rejected": -731.079833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.876436233520508, "rewards/margins": 24.229711532592773, "rewards/rejected": -34.10614776611328, "step": 836 }, { "epoch": 0.5206842923794712, "grad_norm": 3.294527778052725e-05, "learning_rate": 9.055555555555557e-07, "logits/chosen": -141.17420959472656, "logits/rejected": -28.406482696533203, "logps/chosen": -442.17327880859375, "logps/rejected": -798.8004150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.553028106689453, "rewards/margins": 28.370576858520508, "rewards/rejected": -36.923606872558594, "step": 837 }, { "epoch": 0.5213063763608087, "grad_norm": 0.5813305377960205, "learning_rate": 9.000000000000001e-07, "logits/chosen": -109.40097045898438, "logits/rejected": -4.245128631591797, "logps/chosen": -1078.3134765625, "logps/rejected": -1591.6990966796875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -19.99920082092285, "rewards/margins": 23.410926818847656, "rewards/rejected": -43.410125732421875, "step": 838 }, { "epoch": 0.5219284603421462, "grad_norm": 1.7486151415724294e-09, "learning_rate": 8.944444444444445e-07, "logits/chosen": -129.99496459960938, "logits/rejected": 142.2227020263672, "logps/chosen": -322.6527099609375, "logps/rejected": -926.0189819335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.830894470214844, "rewards/margins": 39.06279754638672, "rewards/rejected": -45.89369201660156, "step": 839 }, { "epoch": 0.5225505443234837, "grad_norm": 1.2666715383529663, "learning_rate": 8.88888888888889e-07, "logits/chosen": -89.93159484863281, "logits/rejected": 83.07142639160156, "logps/chosen": -1148.3018798828125, "logps/rejected": -1829.3731689453125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -14.328964233398438, "rewards/margins": 26.85198974609375, "rewards/rejected": -41.18095397949219, "step": 840 }, { "epoch": 0.5231726283048211, "grad_norm": 43.10531997680664, "learning_rate": 8.833333333333334e-07, "logits/chosen": -49.95688247680664, "logits/rejected": -15.388423919677734, "logps/chosen": -372.6175537109375, "logps/rejected": -541.112548828125, "loss": 0.3581, "rewards/accuracies": 0.875, "rewards/chosen": -7.981208801269531, "rewards/margins": 17.061294555664062, "rewards/rejected": -25.04250144958496, "step": 841 }, { "epoch": 0.5237947122861586, "grad_norm": 0.2844661772251129, "learning_rate": 8.777777777777778e-07, "logits/chosen": -109.6478500366211, "logits/rejected": -41.96812438964844, "logps/chosen": -636.424072265625, "logps/rejected": -836.0592041015625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.304250717163086, "rewards/margins": 21.51742172241211, "rewards/rejected": -29.821670532226562, "step": 842 }, { "epoch": 0.5244167962674962, "grad_norm": 0.010013116523623466, "learning_rate": 8.722222222222224e-07, "logits/chosen": -16.94856071472168, "logits/rejected": 11.071537017822266, "logps/chosen": -879.642578125, "logps/rejected": -1319.298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.528738021850586, "rewards/margins": 21.012815475463867, "rewards/rejected": -33.54155349731445, "step": 843 }, { "epoch": 0.5250388802488336, "grad_norm": 2.3913791179656982, "learning_rate": 8.666666666666668e-07, "logits/chosen": -86.56632232666016, "logits/rejected": -19.607769012451172, "logps/chosen": -975.7464599609375, "logps/rejected": -2160.980712890625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -9.544780731201172, "rewards/margins": 27.01382064819336, "rewards/rejected": -36.55860137939453, "step": 844 }, { "epoch": 0.5256609642301711, "grad_norm": 12.165240287780762, "learning_rate": 8.611111111111112e-07, "logits/chosen": -90.56256866455078, "logits/rejected": -4.555442810058594, "logps/chosen": -580.6461181640625, "logps/rejected": -704.717529296875, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -9.008340835571289, "rewards/margins": 12.787553787231445, "rewards/rejected": -21.795894622802734, "step": 845 }, { "epoch": 0.5262830482115085, "grad_norm": 6.134949217084795e-05, "learning_rate": 8.555555555555556e-07, "logits/chosen": -145.3144073486328, "logits/rejected": -74.45709991455078, "logps/chosen": -353.292724609375, "logps/rejected": -719.39697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8193817138671875, "rewards/margins": 27.45532989501953, "rewards/rejected": -33.27471160888672, "step": 846 }, { "epoch": 0.5269051321928461, "grad_norm": 0.26117029786109924, "learning_rate": 8.500000000000001e-07, "logits/chosen": -126.07553100585938, "logits/rejected": -39.950294494628906, "logps/chosen": -446.1632995605469, "logps/rejected": -756.325439453125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -12.564342498779297, "rewards/margins": 19.859272003173828, "rewards/rejected": -32.423614501953125, "step": 847 }, { "epoch": 0.5275272161741835, "grad_norm": 5.370923645386938e-06, "learning_rate": 8.444444444444445e-07, "logits/chosen": -146.56402587890625, "logits/rejected": 15.39506721496582, "logps/chosen": -394.5847473144531, "logps/rejected": -933.09228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.998437881469727, "rewards/margins": 29.90552520751953, "rewards/rejected": -37.903961181640625, "step": 848 }, { "epoch": 0.528149300155521, "grad_norm": 910.6976318359375, "learning_rate": 8.388888888888889e-07, "logits/chosen": -138.39822387695312, "logits/rejected": -58.659366607666016, "logps/chosen": -1132.43017578125, "logps/rejected": -1245.0130615234375, "loss": 2.3638, "rewards/accuracies": 0.875, "rewards/chosen": -20.2799015045166, "rewards/margins": 14.926755905151367, "rewards/rejected": -35.20665740966797, "step": 849 }, { "epoch": 0.5287713841368584, "grad_norm": 0.00024127349024638534, "learning_rate": 8.333333333333333e-07, "logits/chosen": -235.1168212890625, "logits/rejected": 55.60449981689453, "logps/chosen": -264.3034362792969, "logps/rejected": -761.09619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.371866226196289, "rewards/margins": 23.470722198486328, "rewards/rejected": -31.842586517333984, "step": 850 }, { "epoch": 0.529393468118196, "grad_norm": 259.7936096191406, "learning_rate": 8.277777777777779e-07, "logits/chosen": 8.482486724853516, "logits/rejected": 28.25489616394043, "logps/chosen": -681.7047119140625, "logps/rejected": -1236.337158203125, "loss": 0.0956, "rewards/accuracies": 0.875, "rewards/chosen": -10.538414001464844, "rewards/margins": 20.94775390625, "rewards/rejected": -31.486167907714844, "step": 851 }, { "epoch": 0.5300155520995334, "grad_norm": 261.53350830078125, "learning_rate": 8.222222222222223e-07, "logits/chosen": -84.49862670898438, "logits/rejected": -24.465980529785156, "logps/chosen": -767.8336791992188, "logps/rejected": -817.7130126953125, "loss": 0.3914, "rewards/accuracies": 0.875, "rewards/chosen": -12.387303352355957, "rewards/margins": 16.945751190185547, "rewards/rejected": -29.33305549621582, "step": 852 }, { "epoch": 0.5306376360808709, "grad_norm": 15.506037712097168, "learning_rate": 8.166666666666668e-07, "logits/chosen": -127.98323059082031, "logits/rejected": 28.72386360168457, "logps/chosen": -419.86505126953125, "logps/rejected": -755.94091796875, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -6.950892448425293, "rewards/margins": 25.021717071533203, "rewards/rejected": -31.972606658935547, "step": 853 }, { "epoch": 0.5312597200622085, "grad_norm": 7.774425506591797, "learning_rate": 8.111111111111112e-07, "logits/chosen": -189.38522338867188, "logits/rejected": 29.426782608032227, "logps/chosen": -306.844970703125, "logps/rejected": -728.2288208007812, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -9.713464736938477, "rewards/margins": 23.127696990966797, "rewards/rejected": -32.84115982055664, "step": 854 }, { "epoch": 0.5318818040435459, "grad_norm": 0.0008791831787675619, "learning_rate": 8.055555555555557e-07, "logits/chosen": -36.51375961303711, "logits/rejected": 35.5898551940918, "logps/chosen": -599.6082153320312, "logps/rejected": -922.4703979492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.662460327148438, "rewards/margins": 24.500829696655273, "rewards/rejected": -36.16328811645508, "step": 855 }, { "epoch": 0.5325038880248834, "grad_norm": 0.20585988461971283, "learning_rate": 8.000000000000001e-07, "logits/chosen": -114.61103057861328, "logits/rejected": -4.071637153625488, "logps/chosen": -406.62713623046875, "logps/rejected": -774.7154541015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.121740341186523, "rewards/margins": 28.484004974365234, "rewards/rejected": -34.605743408203125, "step": 856 }, { "epoch": 0.5331259720062208, "grad_norm": 3.589526750147343e-05, "learning_rate": 7.944444444444445e-07, "logits/chosen": -159.44439697265625, "logits/rejected": -7.972356796264648, "logps/chosen": -405.5602722167969, "logps/rejected": -794.030517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.591901779174805, "rewards/margins": 24.136524200439453, "rewards/rejected": -34.728424072265625, "step": 857 }, { "epoch": 0.5337480559875584, "grad_norm": 61.47627258300781, "learning_rate": 7.888888888888889e-07, "logits/chosen": -105.62098693847656, "logits/rejected": -1.2084236145019531, "logps/chosen": -409.990234375, "logps/rejected": -1650.571533203125, "loss": 0.6165, "rewards/accuracies": 0.75, "rewards/chosen": -11.533965110778809, "rewards/margins": 23.72869873046875, "rewards/rejected": -35.262664794921875, "step": 858 }, { "epoch": 0.5343701399688958, "grad_norm": 623.6025390625, "learning_rate": 7.833333333333335e-07, "logits/chosen": -136.017578125, "logits/rejected": -28.613344192504883, "logps/chosen": -1098.1689453125, "logps/rejected": -1068.584716796875, "loss": 1.5422, "rewards/accuracies": 0.875, "rewards/chosen": -14.555343627929688, "rewards/margins": 23.741247177124023, "rewards/rejected": -38.296592712402344, "step": 859 }, { "epoch": 0.5349922239502333, "grad_norm": 78.34994506835938, "learning_rate": 7.777777777777779e-07, "logits/chosen": -112.71089172363281, "logits/rejected": -60.49833679199219, "logps/chosen": -602.4278564453125, "logps/rejected": -1385.170654296875, "loss": 0.3646, "rewards/accuracies": 0.875, "rewards/chosen": -8.029106140136719, "rewards/margins": 22.138999938964844, "rewards/rejected": -30.168106079101562, "step": 860 }, { "epoch": 0.5356143079315707, "grad_norm": 0.2625846862792969, "learning_rate": 7.722222222222223e-07, "logits/chosen": -45.29359436035156, "logits/rejected": -5.414055824279785, "logps/chosen": -583.503662109375, "logps/rejected": -757.3583374023438, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -10.39488697052002, "rewards/margins": 17.386009216308594, "rewards/rejected": -27.78089714050293, "step": 861 }, { "epoch": 0.5362363919129083, "grad_norm": 3.701441528392024e-05, "learning_rate": 7.666666666666667e-07, "logits/chosen": -103.17862701416016, "logits/rejected": -53.896400451660156, "logps/chosen": -763.0181884765625, "logps/rejected": -1675.606201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.108431816101074, "rewards/margins": 35.056732177734375, "rewards/rejected": -43.165164947509766, "step": 862 }, { "epoch": 0.5368584758942457, "grad_norm": 0.00015114927373360842, "learning_rate": 7.611111111111112e-07, "logits/chosen": -189.86087036132812, "logits/rejected": 66.08692169189453, "logps/chosen": -427.2649230957031, "logps/rejected": -883.819091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.595136642456055, "rewards/margins": 25.899477005004883, "rewards/rejected": -36.49461364746094, "step": 863 }, { "epoch": 0.5374805598755832, "grad_norm": 0.12338165938854218, "learning_rate": 7.555555555555556e-07, "logits/chosen": -98.48855590820312, "logits/rejected": 30.035297393798828, "logps/chosen": -441.1501770019531, "logps/rejected": -745.9420166015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.13723087310791, "rewards/margins": 20.437583923339844, "rewards/rejected": -32.57481384277344, "step": 864 }, { "epoch": 0.5381026438569206, "grad_norm": 0.013184742070734501, "learning_rate": 7.5e-07, "logits/chosen": -163.33673095703125, "logits/rejected": -22.73147964477539, "logps/chosen": -479.5675048828125, "logps/rejected": -815.1461181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.198512077331543, "rewards/margins": 23.010210037231445, "rewards/rejected": -30.208721160888672, "step": 865 }, { "epoch": 0.5387247278382582, "grad_norm": 0.2516389489173889, "learning_rate": 7.444444444444444e-07, "logits/chosen": -92.22774505615234, "logits/rejected": 38.381324768066406, "logps/chosen": -796.99609375, "logps/rejected": -1124.147216796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.516542434692383, "rewards/margins": 24.317276000976562, "rewards/rejected": -34.83381652832031, "step": 866 }, { "epoch": 0.5393468118195957, "grad_norm": 8.88383510755375e-05, "learning_rate": 7.38888888888889e-07, "logits/chosen": -171.62026977539062, "logits/rejected": 55.91582107543945, "logps/chosen": -500.8038635253906, "logps/rejected": -953.6299438476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.28930950164795, "rewards/margins": 26.94706153869629, "rewards/rejected": -37.23637008666992, "step": 867 }, { "epoch": 0.5399688958009331, "grad_norm": 1.3378676176071167, "learning_rate": 7.333333333333334e-07, "logits/chosen": -83.50889587402344, "logits/rejected": -123.29743957519531, "logps/chosen": -1460.8887939453125, "logps/rejected": -1291.987548828125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -7.702480792999268, "rewards/margins": 23.44473648071289, "rewards/rejected": -31.147218704223633, "step": 868 }, { "epoch": 0.5405909797822706, "grad_norm": 49.51680374145508, "learning_rate": 7.277777777777778e-07, "logits/chosen": -62.35301208496094, "logits/rejected": -19.25672721862793, "logps/chosen": -1026.770751953125, "logps/rejected": -1635.1710205078125, "loss": 0.2427, "rewards/accuracies": 0.875, "rewards/chosen": -8.677632331848145, "rewards/margins": 17.6263484954834, "rewards/rejected": -26.303979873657227, "step": 869 }, { "epoch": 0.5412130637636081, "grad_norm": 35.43935775756836, "learning_rate": 7.222222222222222e-07, "logits/chosen": -106.6770248413086, "logits/rejected": 10.777801513671875, "logps/chosen": -409.45721435546875, "logps/rejected": -927.302001953125, "loss": 0.1258, "rewards/accuracies": 0.875, "rewards/chosen": -6.5649824142456055, "rewards/margins": 23.832622528076172, "rewards/rejected": -30.397607803344727, "step": 870 }, { "epoch": 0.5418351477449456, "grad_norm": 27.153738021850586, "learning_rate": 7.166666666666668e-07, "logits/chosen": -38.62207794189453, "logits/rejected": 59.163726806640625, "logps/chosen": -585.1434326171875, "logps/rejected": -894.4569702148438, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": -13.120367050170898, "rewards/margins": 23.192588806152344, "rewards/rejected": -36.312957763671875, "step": 871 }, { "epoch": 0.542457231726283, "grad_norm": 21.4934024810791, "learning_rate": 7.111111111111112e-07, "logits/chosen": -51.06928634643555, "logits/rejected": 23.516983032226562, "logps/chosen": -700.455078125, "logps/rejected": -1192.09033203125, "loss": 0.1716, "rewards/accuracies": 0.875, "rewards/chosen": -14.23714828491211, "rewards/margins": 26.093929290771484, "rewards/rejected": -40.331077575683594, "step": 872 }, { "epoch": 0.5430793157076206, "grad_norm": 0.08452106267213821, "learning_rate": 7.055555555555556e-07, "logits/chosen": -179.34121704101562, "logits/rejected": -14.55868148803711, "logps/chosen": -251.3523712158203, "logps/rejected": -695.3548583984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.102681636810303, "rewards/margins": 25.201805114746094, "rewards/rejected": -31.304485321044922, "step": 873 }, { "epoch": 0.543701399688958, "grad_norm": 0.03789067640900612, "learning_rate": 7.000000000000001e-07, "logits/chosen": -138.58985900878906, "logits/rejected": -13.961944580078125, "logps/chosen": -462.5423278808594, "logps/rejected": -837.7919921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.471914291381836, "rewards/margins": 25.22199821472168, "rewards/rejected": -31.693912506103516, "step": 874 }, { "epoch": 0.5443234836702955, "grad_norm": 0.31519633531570435, "learning_rate": 6.944444444444446e-07, "logits/chosen": -114.78215026855469, "logits/rejected": -43.2697868347168, "logps/chosen": -650.9859008789062, "logps/rejected": -1927.809326171875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.380867004394531, "rewards/margins": 22.756309509277344, "rewards/rejected": -30.137176513671875, "step": 875 }, { "epoch": 0.5449455676516329, "grad_norm": 75.33787536621094, "learning_rate": 6.88888888888889e-07, "logits/chosen": -124.58892059326172, "logits/rejected": 15.276653289794922, "logps/chosen": -1428.8154296875, "logps/rejected": -2245.8876953125, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -13.573984146118164, "rewards/margins": 26.55134391784668, "rewards/rejected": -40.125328063964844, "step": 876 }, { "epoch": 0.5455676516329705, "grad_norm": 2.258311951663927e-06, "learning_rate": 6.833333333333334e-07, "logits/chosen": -81.523681640625, "logits/rejected": -9.222400665283203, "logps/chosen": -749.4202270507812, "logps/rejected": -1102.953857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.937610149383545, "rewards/margins": 29.364364624023438, "rewards/rejected": -36.301971435546875, "step": 877 }, { "epoch": 0.546189735614308, "grad_norm": 2.849450453368263e-09, "learning_rate": 6.777777777777779e-07, "logits/chosen": -210.6507568359375, "logits/rejected": -18.14429473876953, "logps/chosen": -387.59326171875, "logps/rejected": -961.952392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.600461959838867, "rewards/margins": 37.50526809692383, "rewards/rejected": -47.10573196411133, "step": 878 }, { "epoch": 0.5468118195956454, "grad_norm": 45.974788665771484, "learning_rate": 6.722222222222223e-07, "logits/chosen": -71.80035400390625, "logits/rejected": 36.47096252441406, "logps/chosen": -593.3605346679688, "logps/rejected": -899.4530029296875, "loss": 0.259, "rewards/accuracies": 0.875, "rewards/chosen": -10.03809928894043, "rewards/margins": 25.24864959716797, "rewards/rejected": -35.286746978759766, "step": 879 }, { "epoch": 0.5474339035769828, "grad_norm": 0.00043019073200412095, "learning_rate": 6.666666666666667e-07, "logits/chosen": -97.2318115234375, "logits/rejected": -43.58901596069336, "logps/chosen": -670.2297973632812, "logps/rejected": -1032.017333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.023365020751953, "rewards/margins": 24.331575393676758, "rewards/rejected": -33.35493850708008, "step": 880 }, { "epoch": 0.5480559875583204, "grad_norm": 23.392335891723633, "learning_rate": 6.611111111111111e-07, "logits/chosen": -156.52621459960938, "logits/rejected": -44.803470611572266, "logps/chosen": -1457.2607421875, "logps/rejected": -1976.0267333984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -16.032331466674805, "rewards/margins": 24.539155960083008, "rewards/rejected": -40.57148742675781, "step": 881 }, { "epoch": 0.5486780715396579, "grad_norm": 1.2681506872177124, "learning_rate": 6.555555555555556e-07, "logits/chosen": -79.65126037597656, "logits/rejected": -16.458913803100586, "logps/chosen": -417.6703796386719, "logps/rejected": -617.889892578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.498174667358398, "rewards/margins": 15.998775482177734, "rewards/rejected": -24.496950149536133, "step": 882 }, { "epoch": 0.5493001555209953, "grad_norm": 0.2813882827758789, "learning_rate": 6.5e-07, "logits/chosen": -100.19284057617188, "logits/rejected": -26.876911163330078, "logps/chosen": -685.9544677734375, "logps/rejected": -864.5910034179688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.291885375976562, "rewards/margins": 26.230627059936523, "rewards/rejected": -34.52251434326172, "step": 883 }, { "epoch": 0.5499222395023328, "grad_norm": 40.99506759643555, "learning_rate": 6.444444444444445e-07, "logits/chosen": -95.4145278930664, "logits/rejected": 46.00867462158203, "logps/chosen": -665.7019653320312, "logps/rejected": -969.52587890625, "loss": 0.171, "rewards/accuracies": 0.875, "rewards/chosen": -7.031434535980225, "rewards/margins": 29.162317276000977, "rewards/rejected": -36.193748474121094, "step": 884 }, { "epoch": 0.5505443234836703, "grad_norm": 1.882736325263977, "learning_rate": 6.388888888888889e-07, "logits/chosen": -74.63520812988281, "logits/rejected": 17.476755142211914, "logps/chosen": -448.08349609375, "logps/rejected": -868.2949829101562, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -6.384781360626221, "rewards/margins": 28.33990478515625, "rewards/rejected": -34.72468566894531, "step": 885 }, { "epoch": 0.5511664074650078, "grad_norm": 0.00016385865455958992, "learning_rate": 6.333333333333334e-07, "logits/chosen": -158.45333862304688, "logits/rejected": 108.83325958251953, "logps/chosen": -357.3680725097656, "logps/rejected": -1039.1295166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.407934188842773, "rewards/margins": 35.541282653808594, "rewards/rejected": -44.94921875, "step": 886 }, { "epoch": 0.5517884914463452, "grad_norm": 6.816873073577881, "learning_rate": 6.277777777777778e-07, "logits/chosen": -73.053466796875, "logits/rejected": 10.117366790771484, "logps/chosen": -437.90301513671875, "logps/rejected": -685.7943115234375, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -10.622243881225586, "rewards/margins": 19.535533905029297, "rewards/rejected": -30.157779693603516, "step": 887 }, { "epoch": 0.5524105754276827, "grad_norm": 6.990767928982677e-07, "learning_rate": 6.222222222222223e-07, "logits/chosen": -113.49324035644531, "logits/rejected": -85.80583190917969, "logps/chosen": -1229.2333984375, "logps/rejected": -1450.372314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.116369247436523, "rewards/margins": 28.681102752685547, "rewards/rejected": -38.79747009277344, "step": 888 }, { "epoch": 0.5530326594090202, "grad_norm": 6.020338535308838, "learning_rate": 6.166666666666668e-07, "logits/chosen": -177.3443603515625, "logits/rejected": -27.7156982421875, "logps/chosen": -523.21240234375, "logps/rejected": -1750.424072265625, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -12.017009735107422, "rewards/margins": 25.977027893066406, "rewards/rejected": -37.99403762817383, "step": 889 }, { "epoch": 0.5536547433903577, "grad_norm": 2.125719902323908e-06, "learning_rate": 6.111111111111112e-07, "logits/chosen": -140.36248779296875, "logits/rejected": 1.7500667572021484, "logps/chosen": -579.1171264648438, "logps/rejected": -1402.8060302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.774377822875977, "rewards/margins": 26.47208023071289, "rewards/rejected": -34.246456146240234, "step": 890 }, { "epoch": 0.5542768273716951, "grad_norm": 0.0029740314930677414, "learning_rate": 6.055555555555556e-07, "logits/chosen": -102.68355560302734, "logits/rejected": -37.37559127807617, "logps/chosen": -1531.4537353515625, "logps/rejected": -1953.917724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.647120475769043, "rewards/margins": 29.044919967651367, "rewards/rejected": -35.692039489746094, "step": 891 }, { "epoch": 0.5548989113530327, "grad_norm": 0.0003689638979267329, "learning_rate": 6.000000000000001e-07, "logits/chosen": -84.5270004272461, "logits/rejected": -55.99785614013672, "logps/chosen": -503.6187744140625, "logps/rejected": -769.6951904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.760152816772461, "rewards/margins": 27.22166633605957, "rewards/rejected": -35.98181915283203, "step": 892 }, { "epoch": 0.5555209953343702, "grad_norm": 70.11630249023438, "learning_rate": 5.944444444444445e-07, "logits/chosen": -61.59108352661133, "logits/rejected": -29.13300895690918, "logps/chosen": -645.667724609375, "logps/rejected": -822.9845581054688, "loss": 1.108, "rewards/accuracies": 0.875, "rewards/chosen": -8.119356155395508, "rewards/margins": 26.82553482055664, "rewards/rejected": -34.94489288330078, "step": 893 }, { "epoch": 0.5561430793157076, "grad_norm": 140.50698852539062, "learning_rate": 5.888888888888889e-07, "logits/chosen": -72.34748077392578, "logits/rejected": -13.706929206848145, "logps/chosen": -477.07843017578125, "logps/rejected": -683.7586669921875, "loss": 1.6053, "rewards/accuracies": 0.625, "rewards/chosen": -13.4237699508667, "rewards/margins": 14.0787353515625, "rewards/rejected": -27.502506256103516, "step": 894 }, { "epoch": 0.5567651632970451, "grad_norm": 17.314931869506836, "learning_rate": 5.833333333333334e-07, "logits/chosen": -60.047935485839844, "logits/rejected": 73.61708068847656, "logps/chosen": -475.3175964355469, "logps/rejected": -960.1270751953125, "loss": 0.2124, "rewards/accuracies": 0.875, "rewards/chosen": -11.760129928588867, "rewards/margins": 18.824878692626953, "rewards/rejected": -30.58500862121582, "step": 895 }, { "epoch": 0.5573872472783826, "grad_norm": 6.9655890464782715, "learning_rate": 5.777777777777778e-07, "logits/chosen": -116.27103424072266, "logits/rejected": 26.892370223999023, "logps/chosen": -497.8246154785156, "logps/rejected": -1013.1151733398438, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -10.241984367370605, "rewards/margins": 34.979736328125, "rewards/rejected": -45.22172164916992, "step": 896 }, { "epoch": 0.5580093312597201, "grad_norm": 0.0005471979966387153, "learning_rate": 5.722222222222223e-07, "logits/chosen": -120.10930633544922, "logits/rejected": -65.96672058105469, "logps/chosen": -742.7201538085938, "logps/rejected": -1626.986572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.301980972290039, "rewards/margins": 37.79054641723633, "rewards/rejected": -50.092529296875, "step": 897 }, { "epoch": 0.5586314152410575, "grad_norm": 0.15467680990695953, "learning_rate": 5.666666666666667e-07, "logits/chosen": -31.04018211364746, "logits/rejected": -1.5827350616455078, "logps/chosen": -459.9176940917969, "logps/rejected": -635.8056030273438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.017556190490723, "rewards/margins": 23.54890251159668, "rewards/rejected": -30.56645965576172, "step": 898 }, { "epoch": 0.559253499222395, "grad_norm": 3.2687224038596696e-09, "learning_rate": 5.611111111111111e-07, "logits/chosen": -83.61625671386719, "logits/rejected": 14.583654403686523, "logps/chosen": -353.7633056640625, "logps/rejected": -854.531494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.267993927001953, "rewards/margins": 34.79167556762695, "rewards/rejected": -43.059669494628906, "step": 899 }, { "epoch": 0.5598755832037325, "grad_norm": 0.025710569694638252, "learning_rate": 5.555555555555555e-07, "logits/chosen": -120.35882568359375, "logits/rejected": -45.35907745361328, "logps/chosen": -436.2097473144531, "logps/rejected": -772.7053833007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.865041732788086, "rewards/margins": 25.405567169189453, "rewards/rejected": -34.27061080932617, "step": 900 }, { "epoch": 0.56049766718507, "grad_norm": 5.37911319732666, "learning_rate": 5.5e-07, "logits/chosen": -121.33502960205078, "logits/rejected": 1.539163589477539, "logps/chosen": -1055.9678955078125, "logps/rejected": -1757.5068359375, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -11.440217018127441, "rewards/margins": 32.76325607299805, "rewards/rejected": -44.203468322753906, "step": 901 }, { "epoch": 0.5611197511664074, "grad_norm": 3.0019876956939697, "learning_rate": 5.444444444444444e-07, "logits/chosen": -94.8713607788086, "logits/rejected": -12.205665588378906, "logps/chosen": -433.6889343261719, "logps/rejected": -768.8201904296875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -7.44275426864624, "rewards/margins": 24.147319793701172, "rewards/rejected": -31.590072631835938, "step": 902 }, { "epoch": 0.5617418351477449, "grad_norm": 541.1249389648438, "learning_rate": 5.388888888888889e-07, "logits/chosen": -237.37808227539062, "logits/rejected": 45.501136779785156, "logps/chosen": -936.36328125, "logps/rejected": -1902.43408203125, "loss": 3.0866, "rewards/accuracies": 0.875, "rewards/chosen": -7.267169952392578, "rewards/margins": 18.379119873046875, "rewards/rejected": -25.646289825439453, "step": 903 }, { "epoch": 0.5623639191290825, "grad_norm": 0.08545476943254471, "learning_rate": 5.333333333333335e-07, "logits/chosen": -177.2371826171875, "logits/rejected": 36.40119552612305, "logps/chosen": -593.7579956054688, "logps/rejected": -1758.9149169921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.374723434448242, "rewards/margins": 24.538997650146484, "rewards/rejected": -32.913719177246094, "step": 904 }, { "epoch": 0.5629860031104199, "grad_norm": 4.686671257019043, "learning_rate": 5.277777777777779e-07, "logits/chosen": -22.941822052001953, "logits/rejected": 43.089656829833984, "logps/chosen": -616.3505859375, "logps/rejected": -768.3704833984375, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -7.885283470153809, "rewards/margins": 17.801992416381836, "rewards/rejected": -25.687274932861328, "step": 905 }, { "epoch": 0.5636080870917574, "grad_norm": 135.04791259765625, "learning_rate": 5.222222222222223e-07, "logits/chosen": -114.8431396484375, "logits/rejected": -89.34736633300781, "logps/chosen": -760.0064086914062, "logps/rejected": -1168.9241943359375, "loss": 0.5633, "rewards/accuracies": 0.875, "rewards/chosen": -14.212739944458008, "rewards/margins": 20.148963928222656, "rewards/rejected": -34.3617057800293, "step": 906 }, { "epoch": 0.5642301710730949, "grad_norm": 0.018912388011813164, "learning_rate": 5.166666666666667e-07, "logits/chosen": -114.16242980957031, "logits/rejected": -12.398643493652344, "logps/chosen": -461.0420227050781, "logps/rejected": -956.0383911132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.740528106689453, "rewards/margins": 25.995716094970703, "rewards/rejected": -31.736244201660156, "step": 907 }, { "epoch": 0.5648522550544324, "grad_norm": 100.40081787109375, "learning_rate": 5.111111111111112e-07, "logits/chosen": -113.76556396484375, "logits/rejected": 58.61322021484375, "logps/chosen": -606.61572265625, "logps/rejected": -954.71875, "loss": 1.3117, "rewards/accuracies": 0.875, "rewards/chosen": -10.405505180358887, "rewards/margins": 22.123138427734375, "rewards/rejected": -32.52864456176758, "step": 908 }, { "epoch": 0.5654743390357698, "grad_norm": 0.007085845805704594, "learning_rate": 5.055555555555556e-07, "logits/chosen": -112.7616958618164, "logits/rejected": 70.79476165771484, "logps/chosen": -316.0310363769531, "logps/rejected": -751.3112182617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.268526077270508, "rewards/margins": 26.015960693359375, "rewards/rejected": -34.28448486328125, "step": 909 }, { "epoch": 0.5660964230171073, "grad_norm": 0.591586172580719, "learning_rate": 5.000000000000001e-07, "logits/chosen": -84.45720672607422, "logits/rejected": 72.56228637695312, "logps/chosen": -1008.2890014648438, "logps/rejected": -1173.46435546875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.480664253234863, "rewards/margins": 25.762676239013672, "rewards/rejected": -33.24333953857422, "step": 910 }, { "epoch": 0.5667185069984448, "grad_norm": 2.9883475303649902, "learning_rate": 4.944444444444445e-07, "logits/chosen": -68.27751922607422, "logits/rejected": 42.13774871826172, "logps/chosen": -571.269775390625, "logps/rejected": -916.4152221679688, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -8.539706230163574, "rewards/margins": 18.39183807373047, "rewards/rejected": -26.931541442871094, "step": 911 }, { "epoch": 0.5673405909797823, "grad_norm": 0.003857386764138937, "learning_rate": 4.88888888888889e-07, "logits/chosen": -22.587879180908203, "logits/rejected": 98.34217071533203, "logps/chosen": -576.606201171875, "logps/rejected": -883.6702270507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.284748077392578, "rewards/margins": 29.316617965698242, "rewards/rejected": -37.60136413574219, "step": 912 }, { "epoch": 0.5679626749611197, "grad_norm": 734.1175537109375, "learning_rate": 4.833333333333334e-07, "logits/chosen": -72.37592315673828, "logits/rejected": 54.369590759277344, "logps/chosen": -849.1474609375, "logps/rejected": -1245.89404296875, "loss": 0.5441, "rewards/accuracies": 0.875, "rewards/chosen": -10.799903869628906, "rewards/margins": 24.33060073852539, "rewards/rejected": -35.1305046081543, "step": 913 }, { "epoch": 0.5685847589424572, "grad_norm": 0.06870723515748978, "learning_rate": 4.777777777777778e-07, "logits/chosen": -70.1803207397461, "logits/rejected": 1.1819171905517578, "logps/chosen": -411.0703125, "logps/rejected": -735.897705078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.553101539611816, "rewards/margins": 18.302066802978516, "rewards/rejected": -27.855167388916016, "step": 914 }, { "epoch": 0.5692068429237948, "grad_norm": 0.36051803827285767, "learning_rate": 4.7222222222222226e-07, "logits/chosen": -73.1631088256836, "logits/rejected": 80.39761352539062, "logps/chosen": -582.9667358398438, "logps/rejected": -912.0345458984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -11.575501441955566, "rewards/margins": 20.38041877746582, "rewards/rejected": -31.95591926574707, "step": 915 }, { "epoch": 0.5698289269051322, "grad_norm": 0.0034848100040107965, "learning_rate": 4.666666666666667e-07, "logits/chosen": -177.8181915283203, "logits/rejected": 37.95863342285156, "logps/chosen": -425.399658203125, "logps/rejected": -869.671630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.320931434631348, "rewards/margins": 25.436542510986328, "rewards/rejected": -35.757476806640625, "step": 916 }, { "epoch": 0.5704510108864697, "grad_norm": 1.2685463428497314, "learning_rate": 4.611111111111111e-07, "logits/chosen": -138.88084411621094, "logits/rejected": 19.98822784423828, "logps/chosen": -338.0628662109375, "logps/rejected": -848.312744140625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.149038314819336, "rewards/margins": 25.696699142456055, "rewards/rejected": -33.84573745727539, "step": 917 }, { "epoch": 0.5710730948678071, "grad_norm": 3.442761182785034, "learning_rate": 4.5555555555555563e-07, "logits/chosen": -155.47402954101562, "logits/rejected": -33.16758728027344, "logps/chosen": -336.50189208984375, "logps/rejected": -804.9049072265625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -8.782480239868164, "rewards/margins": 29.706771850585938, "rewards/rejected": -38.48925018310547, "step": 918 }, { "epoch": 0.5716951788491447, "grad_norm": 0.0004174673813395202, "learning_rate": 4.5000000000000003e-07, "logits/chosen": -59.324180603027344, "logits/rejected": 85.84835815429688, "logps/chosen": -519.4781494140625, "logps/rejected": -909.3726196289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.762813568115234, "rewards/margins": 25.67284393310547, "rewards/rejected": -34.43566131591797, "step": 919 }, { "epoch": 0.5723172628304821, "grad_norm": 1137.14892578125, "learning_rate": 4.444444444444445e-07, "logits/chosen": -189.93911743164062, "logits/rejected": -63.0181884765625, "logps/chosen": -591.1880493164062, "logps/rejected": -1783.66552734375, "loss": 0.3116, "rewards/accuracies": 0.75, "rewards/chosen": -11.161086082458496, "rewards/margins": 17.17401123046875, "rewards/rejected": -28.335098266601562, "step": 920 }, { "epoch": 0.5729393468118196, "grad_norm": 0.1336127668619156, "learning_rate": 4.388888888888889e-07, "logits/chosen": -101.9530258178711, "logits/rejected": 56.09759521484375, "logps/chosen": -563.1547241210938, "logps/rejected": -1241.709228515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.56002426147461, "rewards/margins": 28.970205307006836, "rewards/rejected": -37.53022766113281, "step": 921 }, { "epoch": 0.573561430793157, "grad_norm": 0.0025217137299478054, "learning_rate": 4.333333333333334e-07, "logits/chosen": -136.8080596923828, "logits/rejected": 10.55208683013916, "logps/chosen": -554.4729614257812, "logps/rejected": -1807.3134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.396238327026367, "rewards/margins": 24.28984832763672, "rewards/rejected": -35.68608856201172, "step": 922 }, { "epoch": 0.5741835147744946, "grad_norm": 2.818798780441284, "learning_rate": 4.277777777777778e-07, "logits/chosen": -78.01873779296875, "logits/rejected": 42.75094985961914, "logps/chosen": -672.8967895507812, "logps/rejected": -900.1581420898438, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -11.389653205871582, "rewards/margins": 19.148941040039062, "rewards/rejected": -30.538593292236328, "step": 923 }, { "epoch": 0.574805598755832, "grad_norm": 0.1297747790813446, "learning_rate": 4.2222222222222226e-07, "logits/chosen": -133.55982971191406, "logits/rejected": -40.91748046875, "logps/chosen": -525.1929321289062, "logps/rejected": -763.32275390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.87146282196045, "rewards/margins": 22.406570434570312, "rewards/rejected": -31.278032302856445, "step": 924 }, { "epoch": 0.5754276827371695, "grad_norm": 0.004122474230825901, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -104.8353271484375, "logits/rejected": 32.96745300292969, "logps/chosen": -243.96768188476562, "logps/rejected": -711.6646118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.787356376647949, "rewards/margins": 31.571420669555664, "rewards/rejected": -36.3587760925293, "step": 925 }, { "epoch": 0.576049766718507, "grad_norm": 1.2637054920196533, "learning_rate": 4.111111111111112e-07, "logits/chosen": -64.72044372558594, "logits/rejected": 20.97930908203125, "logps/chosen": -452.12335205078125, "logps/rejected": -749.1896362304688, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -8.239216804504395, "rewards/margins": 24.404882431030273, "rewards/rejected": -32.64410400390625, "step": 926 }, { "epoch": 0.5766718506998445, "grad_norm": 0.0010183807462453842, "learning_rate": 4.055555555555556e-07, "logits/chosen": -133.03054809570312, "logits/rejected": -68.99278259277344, "logps/chosen": -558.1740112304688, "logps/rejected": -1027.790283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.520581245422363, "rewards/margins": 35.3626708984375, "rewards/rejected": -46.88324737548828, "step": 927 }, { "epoch": 0.577293934681182, "grad_norm": 1.4988895600254182e-06, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -69.49510192871094, "logits/rejected": 14.783453941345215, "logps/chosen": -403.23858642578125, "logps/rejected": -805.2083740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.001250267028809, "rewards/margins": 31.45196533203125, "rewards/rejected": -41.45321273803711, "step": 928 }, { "epoch": 0.5779160186625194, "grad_norm": 2.4139995558059013e-11, "learning_rate": 3.9444444444444444e-07, "logits/chosen": -209.7872314453125, "logits/rejected": -53.07157897949219, "logps/chosen": -629.387939453125, "logps/rejected": -1105.12646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.496204376220703, "rewards/margins": 37.678802490234375, "rewards/rejected": -47.17500686645508, "step": 929 }, { "epoch": 0.578538102643857, "grad_norm": 1.609663963317871, "learning_rate": 3.8888888888888895e-07, "logits/chosen": -17.327072143554688, "logits/rejected": 0.6025829315185547, "logps/chosen": -483.5240478515625, "logps/rejected": -685.63330078125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -8.159501075744629, "rewards/margins": 24.617897033691406, "rewards/rejected": -32.77739715576172, "step": 930 }, { "epoch": 0.5791601866251944, "grad_norm": 0.0020553036592900753, "learning_rate": 3.8333333333333335e-07, "logits/chosen": -157.98153686523438, "logits/rejected": -27.474838256835938, "logps/chosen": -383.92236328125, "logps/rejected": -701.0250854492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0099263191223145, "rewards/margins": 20.951326370239258, "rewards/rejected": -25.96125030517578, "step": 931 }, { "epoch": 0.5797822706065319, "grad_norm": 0.3729798495769501, "learning_rate": 3.777777777777778e-07, "logits/chosen": -109.71688842773438, "logits/rejected": -66.7043685913086, "logps/chosen": -574.3134765625, "logps/rejected": -778.2509155273438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.336092472076416, "rewards/margins": 17.312389373779297, "rewards/rejected": -23.648481369018555, "step": 932 }, { "epoch": 0.5804043545878693, "grad_norm": 0.9305241703987122, "learning_rate": 3.722222222222222e-07, "logits/chosen": -38.88398361206055, "logits/rejected": 35.08024597167969, "logps/chosen": -543.8375854492188, "logps/rejected": -855.8046875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -8.190146446228027, "rewards/margins": 23.962013244628906, "rewards/rejected": -32.15216064453125, "step": 933 }, { "epoch": 0.5810264385692069, "grad_norm": 0.001967187039554119, "learning_rate": 3.666666666666667e-07, "logits/chosen": -173.9066162109375, "logits/rejected": 26.52267837524414, "logps/chosen": -1196.4072265625, "logps/rejected": -1926.178955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.530378341674805, "rewards/margins": 28.957054138183594, "rewards/rejected": -37.48743438720703, "step": 934 }, { "epoch": 0.5816485225505443, "grad_norm": 65.84040069580078, "learning_rate": 3.611111111111111e-07, "logits/chosen": -41.13294219970703, "logits/rejected": -30.353885650634766, "logps/chosen": -1312.9461669921875, "logps/rejected": -1444.022705078125, "loss": 0.6853, "rewards/accuracies": 0.875, "rewards/chosen": -14.569475173950195, "rewards/margins": 22.24240493774414, "rewards/rejected": -36.81188201904297, "step": 935 }, { "epoch": 0.5822706065318818, "grad_norm": 0.0003252999158576131, "learning_rate": 3.555555555555556e-07, "logits/chosen": -81.82218933105469, "logits/rejected": 22.613759994506836, "logps/chosen": -1397.3013916015625, "logps/rejected": -1828.4632568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.249957084655762, "rewards/margins": 35.74221420288086, "rewards/rejected": -43.99216842651367, "step": 936 }, { "epoch": 0.5828926905132192, "grad_norm": 622.4991455078125, "learning_rate": 3.5000000000000004e-07, "logits/chosen": -55.592315673828125, "logits/rejected": -3.659071922302246, "logps/chosen": -989.3563232421875, "logps/rejected": -2037.777587890625, "loss": 0.1248, "rewards/accuracies": 0.875, "rewards/chosen": -11.83365249633789, "rewards/margins": 25.7764949798584, "rewards/rejected": -37.610145568847656, "step": 937 }, { "epoch": 0.5835147744945568, "grad_norm": 0.00038271051016636193, "learning_rate": 3.444444444444445e-07, "logits/chosen": -152.54013061523438, "logits/rejected": 75.00166320800781, "logps/chosen": -490.466552734375, "logps/rejected": -1782.5416259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.387326240539551, "rewards/margins": 27.359071731567383, "rewards/rejected": -34.74639892578125, "step": 938 }, { "epoch": 0.5841368584758942, "grad_norm": 0.011153277941048145, "learning_rate": 3.3888888888888895e-07, "logits/chosen": -57.64238357543945, "logits/rejected": -96.18829345703125, "logps/chosen": -639.5533447265625, "logps/rejected": -692.566162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9037628173828125, "rewards/margins": 21.182796478271484, "rewards/rejected": -27.086559295654297, "step": 939 }, { "epoch": 0.5847589424572317, "grad_norm": 37.527896881103516, "learning_rate": 3.3333333333333335e-07, "logits/chosen": -101.47769165039062, "logits/rejected": -13.391592025756836, "logps/chosen": -1805.4276123046875, "logps/rejected": -2066.37744140625, "loss": 0.2965, "rewards/accuracies": 0.875, "rewards/chosen": -8.3379487991333, "rewards/margins": 23.826505661010742, "rewards/rejected": -32.16445541381836, "step": 940 }, { "epoch": 0.5853810264385692, "grad_norm": 16.228673934936523, "learning_rate": 3.277777777777778e-07, "logits/chosen": -56.657108306884766, "logits/rejected": -62.17206954956055, "logps/chosen": -673.6504516601562, "logps/rejected": -871.0477294921875, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": -14.611234664916992, "rewards/margins": 19.47986602783203, "rewards/rejected": -34.091102600097656, "step": 941 }, { "epoch": 0.5860031104199067, "grad_norm": 0.017951317131519318, "learning_rate": 3.2222222222222227e-07, "logits/chosen": -198.502197265625, "logits/rejected": -29.818737030029297, "logps/chosen": -499.754150390625, "logps/rejected": -1742.877685546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.827980041503906, "rewards/margins": 26.01885223388672, "rewards/rejected": -37.846832275390625, "step": 942 }, { "epoch": 0.5866251944012442, "grad_norm": 0.029347987845540047, "learning_rate": 3.166666666666667e-07, "logits/chosen": -175.78787231445312, "logits/rejected": 33.46290588378906, "logps/chosen": -477.66259765625, "logps/rejected": -807.3645629882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.877490997314453, "rewards/margins": 22.12738609313965, "rewards/rejected": -33.00487518310547, "step": 943 }, { "epoch": 0.5872472783825816, "grad_norm": 0.37025606632232666, "learning_rate": 3.111111111111111e-07, "logits/chosen": -70.31114196777344, "logits/rejected": -9.403692245483398, "logps/chosen": -529.916015625, "logps/rejected": -776.240234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.279094696044922, "rewards/margins": 22.05630111694336, "rewards/rejected": -29.33539581298828, "step": 944 }, { "epoch": 0.5878693623639192, "grad_norm": 0.03343338519334793, "learning_rate": 3.055555555555556e-07, "logits/chosen": -186.84494018554688, "logits/rejected": 55.197357177734375, "logps/chosen": -387.2557678222656, "logps/rejected": -872.4711303710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.468055725097656, "rewards/margins": 26.24567413330078, "rewards/rejected": -38.7137336730957, "step": 945 }, { "epoch": 0.5884914463452566, "grad_norm": 40.604087829589844, "learning_rate": 3.0000000000000004e-07, "logits/chosen": -115.62500762939453, "logits/rejected": -19.21177864074707, "logps/chosen": -368.7662353515625, "logps/rejected": -632.0204467773438, "loss": 0.1361, "rewards/accuracies": 0.875, "rewards/chosen": -10.808351516723633, "rewards/margins": 20.791366577148438, "rewards/rejected": -31.599716186523438, "step": 946 }, { "epoch": 0.5891135303265941, "grad_norm": 0.055613383650779724, "learning_rate": 2.9444444444444444e-07, "logits/chosen": -162.48248291015625, "logits/rejected": 32.41950988769531, "logps/chosen": -369.4836120605469, "logps/rejected": -817.988037109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.20730209350586, "rewards/margins": 25.73358726501465, "rewards/rejected": -35.940887451171875, "step": 947 }, { "epoch": 0.5897356143079315, "grad_norm": 1.0122228559339419e-05, "learning_rate": 2.888888888888889e-07, "logits/chosen": -224.20298767089844, "logits/rejected": 8.721441268920898, "logps/chosen": -669.8546142578125, "logps/rejected": -3149.087158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.746034622192383, "rewards/margins": 46.712249755859375, "rewards/rejected": -55.458290100097656, "step": 948 }, { "epoch": 0.5903576982892691, "grad_norm": 2.123368978500366, "learning_rate": 2.8333333333333336e-07, "logits/chosen": -30.310951232910156, "logits/rejected": -32.845943450927734, "logps/chosen": -523.6943969726562, "logps/rejected": -655.5177612304688, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -8.459638595581055, "rewards/margins": 20.373781204223633, "rewards/rejected": -28.833419799804688, "step": 949 }, { "epoch": 0.5909797822706065, "grad_norm": 7.011478828644613e-06, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -40.25692367553711, "logits/rejected": 16.02432632446289, "logps/chosen": -446.2783203125, "logps/rejected": -820.4910888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.020289421081543, "rewards/margins": 24.919300079345703, "rewards/rejected": -36.93959045410156, "step": 950 }, { "epoch": 0.591601866251944, "grad_norm": 7.290003776550293, "learning_rate": 2.722222222222222e-07, "logits/chosen": -121.19810485839844, "logits/rejected": -2.7111759185791016, "logps/chosen": -701.1524658203125, "logps/rejected": -1166.7220458984375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -12.899090766906738, "rewards/margins": 21.605409622192383, "rewards/rejected": -34.50450134277344, "step": 951 }, { "epoch": 0.5922239502332814, "grad_norm": 0.002331225899979472, "learning_rate": 2.666666666666667e-07, "logits/chosen": 12.05879020690918, "logits/rejected": 106.11532592773438, "logps/chosen": -687.399658203125, "logps/rejected": -884.0751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.649140357971191, "rewards/margins": 20.063343048095703, "rewards/rejected": -29.712486267089844, "step": 952 }, { "epoch": 0.592846034214619, "grad_norm": 1.2028781384287868e-05, "learning_rate": 2.6111111111111113e-07, "logits/chosen": -39.533634185791016, "logits/rejected": 83.72672271728516, "logps/chosen": -721.0515747070312, "logps/rejected": -1104.297119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.199729919433594, "rewards/margins": 31.956998825073242, "rewards/rejected": -40.1567268371582, "step": 953 }, { "epoch": 0.5934681181959565, "grad_norm": 66.6301040649414, "learning_rate": 2.555555555555556e-07, "logits/chosen": -81.58306884765625, "logits/rejected": -81.36949920654297, "logps/chosen": -1471.591796875, "logps/rejected": -1152.2298583984375, "loss": 0.8921, "rewards/accuracies": 0.875, "rewards/chosen": -9.575486183166504, "rewards/margins": 21.877790451049805, "rewards/rejected": -31.45327377319336, "step": 954 }, { "epoch": 0.5940902021772939, "grad_norm": 2.464357852935791, "learning_rate": 2.5000000000000004e-07, "logits/chosen": -171.03541564941406, "logits/rejected": -22.971912384033203, "logps/chosen": -380.2628173828125, "logps/rejected": -804.6834716796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -7.887761116027832, "rewards/margins": 23.52338981628418, "rewards/rejected": -31.411149978637695, "step": 955 }, { "epoch": 0.5947122861586314, "grad_norm": 1.2924025058746338, "learning_rate": 2.444444444444445e-07, "logits/chosen": -133.4987335205078, "logits/rejected": -19.54473876953125, "logps/chosen": -802.723388671875, "logps/rejected": -1898.7115478515625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -11.965556144714355, "rewards/margins": 27.240755081176758, "rewards/rejected": -39.2063102722168, "step": 956 }, { "epoch": 0.5953343701399689, "grad_norm": 3.96956378878599e-09, "learning_rate": 2.388888888888889e-07, "logits/chosen": -89.45348358154297, "logits/rejected": 68.3885269165039, "logps/chosen": -299.8547058105469, "logps/rejected": -920.951904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.047226905822754, "rewards/margins": 34.88502502441406, "rewards/rejected": -42.9322509765625, "step": 957 }, { "epoch": 0.5959564541213064, "grad_norm": 4.415156364440918, "learning_rate": 2.3333333333333336e-07, "logits/chosen": -46.16106414794922, "logits/rejected": -90.82933044433594, "logps/chosen": -483.7831726074219, "logps/rejected": -606.9338989257812, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": -13.30030345916748, "rewards/margins": 16.27111053466797, "rewards/rejected": -29.571414947509766, "step": 958 }, { "epoch": 0.5965785381026438, "grad_norm": 0.0005050650797784328, "learning_rate": 2.2777777777777781e-07, "logits/chosen": -115.10529327392578, "logits/rejected": 26.02462387084961, "logps/chosen": -469.08013916015625, "logps/rejected": -1592.952880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.357044219970703, "rewards/margins": 35.2841911315918, "rewards/rejected": -45.6412353515625, "step": 959 }, { "epoch": 0.5972006220839814, "grad_norm": 3.4490609169006348, "learning_rate": 2.2222222222222224e-07, "logits/chosen": -25.252416610717773, "logits/rejected": 3.047414779663086, "logps/chosen": -628.9044799804688, "logps/rejected": -890.2113037109375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -11.731489181518555, "rewards/margins": 23.07278823852539, "rewards/rejected": -34.80427551269531, "step": 960 }, { "epoch": 0.5978227060653188, "grad_norm": 0.20698045194149017, "learning_rate": 2.166666666666667e-07, "logits/chosen": -198.67822265625, "logits/rejected": 13.414846420288086, "logps/chosen": -997.76611328125, "logps/rejected": -1627.38330078125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -15.145881652832031, "rewards/margins": 28.393251419067383, "rewards/rejected": -43.53913497924805, "step": 961 }, { "epoch": 0.5984447900466563, "grad_norm": 3.905603080056608e-05, "learning_rate": 2.1111111111111113e-07, "logits/chosen": -128.648193359375, "logits/rejected": 80.0594711303711, "logps/chosen": -833.3491821289062, "logps/rejected": -1576.983154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.18873119354248, "rewards/margins": 34.50420379638672, "rewards/rejected": -46.69293212890625, "step": 962 }, { "epoch": 0.5990668740279937, "grad_norm": 0.0003395716776140034, "learning_rate": 2.055555555555556e-07, "logits/chosen": -74.6122055053711, "logits/rejected": -5.264270782470703, "logps/chosen": -807.8818359375, "logps/rejected": -1692.639404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.868765830993652, "rewards/margins": 32.12522506713867, "rewards/rejected": -43.99399185180664, "step": 963 }, { "epoch": 0.5996889580093313, "grad_norm": 0.0001125499329646118, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -137.10545349121094, "logits/rejected": 12.756465911865234, "logps/chosen": -316.5407409667969, "logps/rejected": -670.0057983398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.7039079666137695, "rewards/margins": 26.4289608001709, "rewards/rejected": -32.132869720458984, "step": 964 }, { "epoch": 0.6003110419906688, "grad_norm": 14.274313926696777, "learning_rate": 1.9444444444444447e-07, "logits/chosen": -58.96481704711914, "logits/rejected": 34.46013641357422, "logps/chosen": -1744.1295166015625, "logps/rejected": -2685.028076171875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -8.391512870788574, "rewards/margins": 37.230140686035156, "rewards/rejected": -45.62165451049805, "step": 965 }, { "epoch": 0.6009331259720062, "grad_norm": 0.17186103761196136, "learning_rate": 1.888888888888889e-07, "logits/chosen": -133.80136108398438, "logits/rejected": -35.996978759765625, "logps/chosen": -405.1982421875, "logps/rejected": -711.1209106445312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.527985572814941, "rewards/margins": 23.880393981933594, "rewards/rejected": -32.40837860107422, "step": 966 }, { "epoch": 0.6015552099533437, "grad_norm": 0.388028085231781, "learning_rate": 1.8333333333333336e-07, "logits/chosen": -108.74185180664062, "logits/rejected": 47.198974609375, "logps/chosen": -333.2059326171875, "logps/rejected": -694.32763671875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.572696685791016, "rewards/margins": 24.838024139404297, "rewards/rejected": -31.410722732543945, "step": 967 }, { "epoch": 0.6021772939346812, "grad_norm": 33.68065643310547, "learning_rate": 1.777777777777778e-07, "logits/chosen": -155.54180908203125, "logits/rejected": 3.1585946083068848, "logps/chosen": -385.47625732421875, "logps/rejected": -873.92041015625, "loss": 0.0866, "rewards/accuracies": 1.0, "rewards/chosen": -6.090001106262207, "rewards/margins": 29.163257598876953, "rewards/rejected": -35.253257751464844, "step": 968 }, { "epoch": 0.6027993779160187, "grad_norm": 5.691498881787993e-05, "learning_rate": 1.7222222222222225e-07, "logits/chosen": -106.64253234863281, "logits/rejected": -14.64812183380127, "logps/chosen": -1026.1185302734375, "logps/rejected": -1382.2001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.9078369140625, "rewards/margins": 22.06609535217285, "rewards/rejected": -33.973934173583984, "step": 969 }, { "epoch": 0.6034214618973561, "grad_norm": 81.30170440673828, "learning_rate": 1.6666666666666668e-07, "logits/chosen": -59.76327133178711, "logits/rejected": 65.6419448852539, "logps/chosen": -519.0238647460938, "logps/rejected": -879.20703125, "loss": 0.4413, "rewards/accuracies": 0.875, "rewards/chosen": -11.8172607421875, "rewards/margins": 25.964611053466797, "rewards/rejected": -37.7818717956543, "step": 970 }, { "epoch": 0.6040435458786936, "grad_norm": 0.014516373164951801, "learning_rate": 1.6111111111111113e-07, "logits/chosen": -89.43550109863281, "logits/rejected": -16.35576820373535, "logps/chosen": -319.6653137207031, "logps/rejected": -658.8167114257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.875820159912109, "rewards/margins": 25.536354064941406, "rewards/rejected": -30.412174224853516, "step": 971 }, { "epoch": 0.6046656298600311, "grad_norm": 0.014515973627567291, "learning_rate": 1.5555555555555556e-07, "logits/chosen": -155.14987182617188, "logits/rejected": -11.390532493591309, "logps/chosen": -694.3985595703125, "logps/rejected": -2436.38916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.466022491455078, "rewards/margins": 35.63492965698242, "rewards/rejected": -46.1009521484375, "step": 972 }, { "epoch": 0.6052877138413686, "grad_norm": 38.49283218383789, "learning_rate": 1.5000000000000002e-07, "logits/chosen": -121.97566223144531, "logits/rejected": 67.32776641845703, "logps/chosen": -360.0615234375, "logps/rejected": -816.8880615234375, "loss": 0.2143, "rewards/accuracies": 0.875, "rewards/chosen": -7.9398322105407715, "rewards/margins": 29.756240844726562, "rewards/rejected": -37.696075439453125, "step": 973 }, { "epoch": 0.605909797822706, "grad_norm": 0.022081471979618073, "learning_rate": 1.4444444444444445e-07, "logits/chosen": -200.90402221679688, "logits/rejected": -12.88595199584961, "logps/chosen": -425.1784362792969, "logps/rejected": -916.63623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.462552070617676, "rewards/margins": 24.148040771484375, "rewards/rejected": -30.610593795776367, "step": 974 }, { "epoch": 0.6065318818040435, "grad_norm": 1.9204877389711328e-05, "learning_rate": 1.3888888888888888e-07, "logits/chosen": -212.28834533691406, "logits/rejected": -100.57696533203125, "logps/chosen": -523.309326171875, "logps/rejected": -962.1312866210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.693882942199707, "rewards/margins": 25.071983337402344, "rewards/rejected": -34.76586151123047, "step": 975 }, { "epoch": 0.6071539657853811, "grad_norm": 9.086454520002007e-05, "learning_rate": 1.3333333333333336e-07, "logits/chosen": -125.12515258789062, "logits/rejected": 29.449159622192383, "logps/chosen": -1106.573974609375, "logps/rejected": -1955.2584228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.933027267456055, "rewards/margins": 35.88373565673828, "rewards/rejected": -50.81676483154297, "step": 976 }, { "epoch": 0.6077760497667185, "grad_norm": 1.8570024967193604, "learning_rate": 1.277777777777778e-07, "logits/chosen": -34.346431732177734, "logits/rejected": 88.42322540283203, "logps/chosen": -831.2744140625, "logps/rejected": -1239.19921875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -14.735718727111816, "rewards/margins": 20.041797637939453, "rewards/rejected": -34.77751541137695, "step": 977 }, { "epoch": 0.608398133748056, "grad_norm": 7.391184329986572, "learning_rate": 1.2222222222222225e-07, "logits/chosen": -39.92427444458008, "logits/rejected": 13.554544448852539, "logps/chosen": -504.74993896484375, "logps/rejected": -826.9801635742188, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -9.717469215393066, "rewards/margins": 24.27166175842285, "rewards/rejected": -33.98912811279297, "step": 978 }, { "epoch": 0.6090202177293935, "grad_norm": 0.00012090606469428167, "learning_rate": 1.1666666666666668e-07, "logits/chosen": -140.5585174560547, "logits/rejected": 56.356990814208984, "logps/chosen": -311.0361022949219, "logps/rejected": -715.8753662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.400900840759277, "rewards/margins": 25.2390079498291, "rewards/rejected": -35.63990783691406, "step": 979 }, { "epoch": 0.609642301710731, "grad_norm": 0.0018111987737938762, "learning_rate": 1.1111111111111112e-07, "logits/chosen": -148.76744079589844, "logits/rejected": -11.034968376159668, "logps/chosen": -440.1136779785156, "logps/rejected": -836.3117065429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.045895099639893, "rewards/margins": 31.700912475585938, "rewards/rejected": -37.74680709838867, "step": 980 }, { "epoch": 0.6102643856920684, "grad_norm": 42.70917510986328, "learning_rate": 1.0555555555555557e-07, "logits/chosen": -46.39480209350586, "logits/rejected": 61.24584197998047, "logps/chosen": -682.0428466796875, "logps/rejected": -960.655029296875, "loss": 0.1393, "rewards/accuracies": 0.875, "rewards/chosen": -14.198314666748047, "rewards/margins": 19.99317169189453, "rewards/rejected": -34.19148635864258, "step": 981 }, { "epoch": 0.6108864696734059, "grad_norm": 4.778564766105831e-10, "learning_rate": 1.0000000000000001e-07, "logits/chosen": -186.54135131835938, "logits/rejected": 79.44303131103516, "logps/chosen": -407.61358642578125, "logps/rejected": -1643.77734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.226995468139648, "rewards/margins": 41.202816009521484, "rewards/rejected": -49.4298095703125, "step": 982 }, { "epoch": 0.6115085536547434, "grad_norm": 7.047739028930664, "learning_rate": 9.444444444444445e-08, "logits/chosen": -80.48294067382812, "logits/rejected": -40.78007888793945, "logps/chosen": -327.14971923828125, "logps/rejected": -1237.299560546875, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -5.502917289733887, "rewards/margins": 27.259870529174805, "rewards/rejected": -32.762786865234375, "step": 983 }, { "epoch": 0.6121306376360809, "grad_norm": 0.053896401077508926, "learning_rate": 8.88888888888889e-08, "logits/chosen": -89.01689910888672, "logits/rejected": 8.922504425048828, "logps/chosen": -683.2506103515625, "logps/rejected": -989.0587158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.31430721282959, "rewards/margins": 25.660036087036133, "rewards/rejected": -36.974342346191406, "step": 984 }, { "epoch": 0.6127527216174183, "grad_norm": 0.0017677996074780822, "learning_rate": 8.333333333333334e-08, "logits/chosen": -162.21823120117188, "logits/rejected": 1.7906570434570312, "logps/chosen": -643.2587890625, "logps/rejected": -1052.787841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.302186965942383, "rewards/margins": 30.356130599975586, "rewards/rejected": -43.65831756591797, "step": 985 }, { "epoch": 0.6133748055987558, "grad_norm": 0.000158111666678451, "learning_rate": 7.777777777777778e-08, "logits/chosen": -153.5557861328125, "logits/rejected": 5.618416786193848, "logps/chosen": -480.3512878417969, "logps/rejected": -978.9366455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.68058967590332, "rewards/margins": 36.152488708496094, "rewards/rejected": -41.83307647705078, "step": 986 }, { "epoch": 0.6139968895800934, "grad_norm": 0.0001340080489171669, "learning_rate": 7.222222222222222e-08, "logits/chosen": -166.49266052246094, "logits/rejected": -29.10374641418457, "logps/chosen": -529.593994140625, "logps/rejected": -1305.080322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.319616794586182, "rewards/margins": 30.728534698486328, "rewards/rejected": -38.04814910888672, "step": 987 }, { "epoch": 0.6146189735614308, "grad_norm": 3.360474920555134e-08, "learning_rate": 6.666666666666668e-08, "logits/chosen": -166.9295196533203, "logits/rejected": -35.60263442993164, "logps/chosen": -664.4322509765625, "logps/rejected": -902.1387329101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.652097702026367, "rewards/margins": 30.31884765625, "rewards/rejected": -38.970947265625, "step": 988 }, { "epoch": 0.6152410575427683, "grad_norm": 5.90954065322876, "learning_rate": 6.111111111111112e-08, "logits/chosen": 9.401673316955566, "logits/rejected": 74.82857513427734, "logps/chosen": -295.3538818359375, "logps/rejected": -618.9197387695312, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -8.275187492370605, "rewards/margins": 23.987794876098633, "rewards/rejected": -32.26298141479492, "step": 989 }, { "epoch": 0.6158631415241057, "grad_norm": 0.06738218665122986, "learning_rate": 5.555555555555556e-08, "logits/chosen": -40.73262405395508, "logits/rejected": 12.165586471557617, "logps/chosen": -949.438232421875, "logps/rejected": -1482.4117431640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.427240371704102, "rewards/margins": 24.615694046020508, "rewards/rejected": -32.04293441772461, "step": 990 }, { "epoch": 0.6164852255054433, "grad_norm": 356.81329345703125, "learning_rate": 5.0000000000000004e-08, "logits/chosen": -86.11222839355469, "logits/rejected": -20.29574203491211, "logps/chosen": -1350.1463623046875, "logps/rejected": -1412.5169677734375, "loss": 0.183, "rewards/accuracies": 0.875, "rewards/chosen": -10.315704345703125, "rewards/margins": 15.264507293701172, "rewards/rejected": -25.58021354675293, "step": 991 }, { "epoch": 0.6171073094867807, "grad_norm": 0.0012973308330401778, "learning_rate": 4.444444444444445e-08, "logits/chosen": -174.97067260742188, "logits/rejected": 15.363943099975586, "logps/chosen": -353.24176025390625, "logps/rejected": -1534.56689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.122416496276855, "rewards/margins": 30.718175888061523, "rewards/rejected": -39.84059143066406, "step": 992 }, { "epoch": 0.6177293934681182, "grad_norm": 22.53267478942871, "learning_rate": 3.888888888888889e-08, "logits/chosen": -41.92042541503906, "logits/rejected": -11.281787872314453, "logps/chosen": -698.7086791992188, "logps/rejected": -1449.5130615234375, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -12.50344181060791, "rewards/margins": 22.170122146606445, "rewards/rejected": -34.67356491088867, "step": 993 }, { "epoch": 0.6183514774494556, "grad_norm": 0.26141592860221863, "learning_rate": 3.333333333333334e-08, "logits/chosen": -145.34535217285156, "logits/rejected": 83.27471923828125, "logps/chosen": -469.51434326171875, "logps/rejected": -964.3944091796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.160540580749512, "rewards/margins": 32.561771392822266, "rewards/rejected": -41.72230911254883, "step": 994 }, { "epoch": 0.6189735614307932, "grad_norm": 0.013846839778125286, "learning_rate": 2.777777777777778e-08, "logits/chosen": -168.17926025390625, "logits/rejected": -8.323474884033203, "logps/chosen": -886.221923828125, "logps/rejected": -2335.274169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.601556777954102, "rewards/margins": 30.146575927734375, "rewards/rejected": -41.748130798339844, "step": 995 }, { "epoch": 0.6195956454121306, "grad_norm": 0.36621561646461487, "learning_rate": 2.2222222222222224e-08, "logits/chosen": -111.82756805419922, "logits/rejected": 27.581993103027344, "logps/chosen": -332.3082275390625, "logps/rejected": -673.089111328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.251396179199219, "rewards/margins": 23.449657440185547, "rewards/rejected": -29.701051712036133, "step": 996 }, { "epoch": 0.6202177293934681, "grad_norm": 1.4058661460876465, "learning_rate": 1.666666666666667e-08, "logits/chosen": -133.06005859375, "logits/rejected": 6.373744964599609, "logps/chosen": -1311.1544189453125, "logps/rejected": -2402.772705078125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -15.865715980529785, "rewards/margins": 32.36893844604492, "rewards/rejected": -48.234657287597656, "step": 997 }, { "epoch": 0.6208398133748056, "grad_norm": 0.0005739193293265998, "learning_rate": 1.1111111111111112e-08, "logits/chosen": -159.47232055664062, "logits/rejected": 29.31635284423828, "logps/chosen": -638.7407836914062, "logps/rejected": -1015.0540771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.453882217407227, "rewards/margins": 26.695383071899414, "rewards/rejected": -38.149269104003906, "step": 998 }, { "epoch": 0.6214618973561431, "grad_norm": 7.188466548919678, "learning_rate": 5.555555555555556e-09, "logits/chosen": -139.99632263183594, "logits/rejected": -14.997485160827637, "logps/chosen": -393.3059997558594, "logps/rejected": -730.1820678710938, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -7.653923034667969, "rewards/margins": 23.944673538208008, "rewards/rejected": -31.598594665527344, "step": 999 }, { "epoch": 0.6220839813374806, "grad_norm": 1.7136025428771973, "learning_rate": 0.0, "logits/chosen": 1.6329803466796875, "logits/rejected": -8.595043182373047, "logps/chosen": -436.31201171875, "logps/rejected": -645.6025390625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.614713668823242, "rewards/margins": 24.103416442871094, "rewards/rejected": -31.718130111694336, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }