{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.523809523809524e-09, "logits/chosen": -1.5964443683624268, "logits/rejected": -1.3291687965393066, "logps/chosen": -474.3575134277344, "logps/rejected": -663.2249755859375, "loss": 0.3087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 9.523809523809525e-08, "logits/chosen": -1.6813511848449707, "logits/rejected": -1.2188762426376343, "logps/chosen": -449.58489990234375, "logps/rejected": -889.5899047851562, "loss": 0.2155, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 0.00010568237485131249, "rewards/margins": 6.739808304701e-05, "rewards/rejected": 3.8284317270154133e-05, "step": 10 }, { "epoch": 0.0, "learning_rate": 1.904761904761905e-07, "logits/chosen": -1.7874151468276978, "logits/rejected": -1.1624069213867188, "logps/chosen": -428.51300048828125, "logps/rejected": -803.0301513671875, "loss": 0.2287, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00012383742432575673, "rewards/margins": 0.0009384436416439712, "rewards/rejected": -0.0010622810805216432, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.8571428571428575e-07, "logits/chosen": -1.5539613962173462, "logits/rejected": -1.1322476863861084, "logps/chosen": -443.5193786621094, "logps/rejected": -830.2403564453125, "loss": 0.2281, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0007974267937242985, "rewards/margins": 0.0017057094955816865, "rewards/rejected": -0.0009082824690267444, "step": 30 }, { "epoch": 0.01, "learning_rate": 3.80952380952381e-07, "logits/chosen": -1.640062689781189, "logits/rejected": -1.2586108446121216, "logps/chosen": -430.81805419921875, "logps/rejected": -854.3474731445312, "loss": 0.2003, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0021123916376382113, "rewards/margins": 0.005200219340622425, "rewards/rejected": -0.003087828401476145, "step": 40 }, { "epoch": 0.01, "learning_rate": 4.7619047619047623e-07, "logits/chosen": -1.5159130096435547, "logits/rejected": -1.0649659633636475, "logps/chosen": -482.67852783203125, "logps/rejected": -778.0474853515625, "loss": 0.1952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0075163752771914005, "rewards/margins": 0.011120806448161602, "rewards/rejected": -0.003604432102292776, "step": 50 }, { "epoch": 0.01, "learning_rate": 5.714285714285715e-07, "logits/chosen": -1.5851542949676514, "logits/rejected": -1.1346595287322998, "logps/chosen": -425.6817932128906, "logps/rejected": -779.3834228515625, "loss": 0.183, "rewards/accuracies": 0.75, "rewards/chosen": 0.01108284667134285, "rewards/margins": 0.01969107612967491, "rewards/rejected": -0.008608227595686913, "step": 60 }, { "epoch": 0.01, "learning_rate": 6.666666666666667e-07, "logits/chosen": -1.533231496810913, "logits/rejected": -1.0766208171844482, "logps/chosen": -488.9933166503906, "logps/rejected": -895.2310791015625, "loss": 0.1867, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.017419463023543358, "rewards/margins": 0.03203754127025604, "rewards/rejected": -0.014618076384067535, "step": 70 }, { "epoch": 0.02, "learning_rate": 7.61904761904762e-07, "logits/chosen": -1.6760826110839844, "logits/rejected": -0.9335016012191772, "logps/chosen": -451.7982482910156, "logps/rejected": -854.0350341796875, "loss": 0.1739, "rewards/accuracies": 0.875, "rewards/chosen": 0.026161080226302147, "rewards/margins": 0.05273517966270447, "rewards/rejected": -0.02657409943640232, "step": 80 }, { "epoch": 0.02, "learning_rate": 8.571428571428572e-07, "logits/chosen": -1.7192933559417725, "logits/rejected": -1.0688517093658447, "logps/chosen": -404.2705078125, "logps/rejected": -811.7678833007812, "loss": 0.1649, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.027517270296812057, "rewards/margins": 0.05363103747367859, "rewards/rejected": -0.02611376717686653, "step": 90 }, { "epoch": 0.02, "learning_rate": 9.523809523809525e-07, "logits/chosen": -1.8428962230682373, "logits/rejected": -1.1792418956756592, "logps/chosen": -430.27227783203125, "logps/rejected": -801.3963623046875, "loss": 0.171, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.007343721576035023, "rewards/margins": 0.0735684260725975, "rewards/rejected": -0.06622470915317535, "step": 100 }, { "epoch": 0.02, "learning_rate": 1.0476190476190478e-06, "logits/chosen": -1.746883749961853, "logits/rejected": -1.1373497247695923, "logps/chosen": -525.6906127929688, "logps/rejected": -1003.6932373046875, "loss": 0.1356, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.020451098680496216, "rewards/margins": 0.09704919159412384, "rewards/rejected": -0.11750028282403946, "step": 110 }, { "epoch": 0.02, "learning_rate": 1.142857142857143e-06, "logits/chosen": -1.8047354221343994, "logits/rejected": -1.0603820085525513, "logps/chosen": -560.2908325195312, "logps/rejected": -1008.1901245117188, "loss": 0.1192, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.043726347386837006, "rewards/margins": 0.1116614118218422, "rewards/rejected": -0.1553877592086792, "step": 120 }, { "epoch": 0.02, "learning_rate": 1.2380952380952382e-06, "logits/chosen": -1.6039708852767944, "logits/rejected": -1.0721991062164307, "logps/chosen": -491.07830810546875, "logps/rejected": -932.6652221679688, "loss": 0.1444, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03485647588968277, "rewards/margins": 0.11112723499536514, "rewards/rejected": -0.1459837257862091, "step": 130 }, { "epoch": 0.03, "learning_rate": 1.3333333333333334e-06, "logits/chosen": -1.6463550329208374, "logits/rejected": -0.9608979225158691, "logps/chosen": -519.837646484375, "logps/rejected": -954.7674560546875, "loss": 0.1098, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.04095975309610367, "rewards/margins": 0.1356854885816574, "rewards/rejected": -0.17664523422718048, "step": 140 }, { "epoch": 0.03, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -1.7351688146591187, "logits/rejected": -1.1353156566619873, "logps/chosen": -493.08966064453125, "logps/rejected": -1045.0201416015625, "loss": 0.0944, "rewards/accuracies": 0.875, "rewards/chosen": -0.06612655520439148, "rewards/margins": 0.1745416820049286, "rewards/rejected": -0.24066825211048126, "step": 150 }, { "epoch": 0.03, "learning_rate": 1.523809523809524e-06, "logits/chosen": -1.659099817276001, "logits/rejected": -0.8211167454719543, "logps/chosen": -581.5676879882812, "logps/rejected": -1053.3389892578125, "loss": 0.1352, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10605227947235107, "rewards/margins": 0.15199507772922516, "rewards/rejected": -0.25804734230041504, "step": 160 }, { "epoch": 0.03, "learning_rate": 1.6190476190476193e-06, "logits/chosen": -1.67592453956604, "logits/rejected": -1.0631893873214722, "logps/chosen": -580.7254028320312, "logps/rejected": -1190.563720703125, "loss": 0.0905, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10428521782159805, "rewards/margins": 0.2247179001569748, "rewards/rejected": -0.32900312542915344, "step": 170 }, { "epoch": 0.03, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -1.650418996810913, "logits/rejected": -0.8839709162712097, "logps/chosen": -733.577392578125, "logps/rejected": -1300.351318359375, "loss": 0.095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1777438372373581, "rewards/margins": 0.22024521231651306, "rewards/rejected": -0.3979890048503876, "step": 180 }, { "epoch": 0.04, "learning_rate": 1.8095238095238097e-06, "logits/chosen": -1.567194938659668, "logits/rejected": -0.9303449392318726, "logps/chosen": -665.9202880859375, "logps/rejected": -1264.822021484375, "loss": 0.1258, "rewards/accuracies": 0.75, "rewards/chosen": -0.19641616940498352, "rewards/margins": 0.23649337887763977, "rewards/rejected": -0.4329095482826233, "step": 190 }, { "epoch": 0.04, "learning_rate": 1.904761904761905e-06, "logits/chosen": -1.6757524013519287, "logits/rejected": -1.3166964054107666, "logps/chosen": -670.0177001953125, "logps/rejected": -1232.9072265625, "loss": 0.1117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1455942690372467, "rewards/margins": 0.193759948015213, "rewards/rejected": -0.3393542170524597, "step": 200 }, { "epoch": 0.04, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.8672746419906616, "logits/rejected": -1.0123827457427979, "logps/chosen": -606.1552734375, "logps/rejected": -1232.156005859375, "loss": 0.0717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08139447122812271, "rewards/margins": 0.24210381507873535, "rewards/rejected": -0.3234982490539551, "step": 210 }, { "epoch": 0.04, "learning_rate": 2.0952380952380955e-06, "logits/chosen": -1.8828620910644531, "logits/rejected": -1.178390383720398, "logps/chosen": -536.4411010742188, "logps/rejected": -1053.248779296875, "loss": 0.1066, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08360230922698975, "rewards/margins": 0.19688987731933594, "rewards/rejected": -0.2804921865463257, "step": 220 }, { "epoch": 0.04, "learning_rate": 2.1904761904761908e-06, "logits/chosen": -1.6069672107696533, "logits/rejected": -1.1681115627288818, "logps/chosen": -618.42529296875, "logps/rejected": -1184.3099365234375, "loss": 0.1236, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17492257058620453, "rewards/margins": 0.20526257157325745, "rewards/rejected": -0.3801851272583008, "step": 230 }, { "epoch": 0.05, "learning_rate": 2.285714285714286e-06, "logits/chosen": -1.6747249364852905, "logits/rejected": -1.1488839387893677, "logps/chosen": -539.3297729492188, "logps/rejected": -1078.1259765625, "loss": 0.1195, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13585665822029114, "rewards/margins": 0.1913582980632782, "rewards/rejected": -0.32721495628356934, "step": 240 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": -1.644960641860962, "logits/rejected": -1.2164050340652466, "logps/chosen": -542.1751708984375, "logps/rejected": -1071.045166015625, "loss": 0.1381, "rewards/accuracies": 0.75, "rewards/chosen": -0.13008072972297668, "rewards/margins": 0.19909824430942535, "rewards/rejected": -0.32917895913124084, "step": 250 }, { "epoch": 0.05, "learning_rate": 2.4761904761904764e-06, "logits/chosen": -1.7668380737304688, "logits/rejected": -1.1439602375030518, "logps/chosen": -615.5252075195312, "logps/rejected": -1166.810302734375, "loss": 0.1013, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1580442488193512, "rewards/margins": 0.22525843977928162, "rewards/rejected": -0.3833027184009552, "step": 260 }, { "epoch": 0.05, "learning_rate": 2.571428571428571e-06, "logits/chosen": -1.813973069190979, "logits/rejected": -1.1399123668670654, "logps/chosen": -712.232421875, "logps/rejected": -1244.3934326171875, "loss": 0.1077, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18287518620491028, "rewards/margins": 0.2252916842699051, "rewards/rejected": -0.40816688537597656, "step": 270 }, { "epoch": 0.05, "learning_rate": 2.666666666666667e-06, "logits/chosen": -1.6180734634399414, "logits/rejected": -1.3701114654541016, "logps/chosen": -494.2696228027344, "logps/rejected": -1138.9771728515625, "loss": 0.1045, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12325029075145721, "rewards/margins": 0.23065133392810822, "rewards/rejected": -0.35390162467956543, "step": 280 }, { "epoch": 0.06, "learning_rate": 2.7619047619047625e-06, "logits/chosen": -1.694424033164978, "logits/rejected": -1.0708407163619995, "logps/chosen": -545.6209716796875, "logps/rejected": -1122.974609375, "loss": 0.1028, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10549769550561905, "rewards/margins": 0.24908974766731262, "rewards/rejected": -0.35458746552467346, "step": 290 }, { "epoch": 0.06, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -1.7387892007827759, "logits/rejected": -1.1617491245269775, "logps/chosen": -610.787841796875, "logps/rejected": -1241.400146484375, "loss": 0.0931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.170683354139328, "rewards/margins": 0.2684099078178406, "rewards/rejected": -0.4390932619571686, "step": 300 }, { "epoch": 0.06, "learning_rate": 2.9523809523809525e-06, "logits/chosen": -1.7345823049545288, "logits/rejected": -1.1376091241836548, "logps/chosen": -719.1180419921875, "logps/rejected": -1263.5145263671875, "loss": 0.1242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1590810865163803, "rewards/margins": 0.25278979539871216, "rewards/rejected": -0.41187089681625366, "step": 310 }, { "epoch": 0.06, "learning_rate": 3.047619047619048e-06, "logits/chosen": -1.6919857263565063, "logits/rejected": -1.050135612487793, "logps/chosen": -729.0108642578125, "logps/rejected": -1165.0267333984375, "loss": 0.0923, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2034168243408203, "rewards/margins": 0.22210320830345154, "rewards/rejected": -0.42552003264427185, "step": 320 }, { "epoch": 0.06, "learning_rate": 3.142857142857143e-06, "logits/chosen": -1.697484016418457, "logits/rejected": -1.0732462406158447, "logps/chosen": -671.1806640625, "logps/rejected": -1204.2303466796875, "loss": 0.0962, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.21383149921894073, "rewards/margins": 0.2506061792373657, "rewards/rejected": -0.4644376337528229, "step": 330 }, { "epoch": 0.06, "learning_rate": 3.2380952380952385e-06, "logits/chosen": -1.5445235967636108, "logits/rejected": -1.014106035232544, "logps/chosen": -623.9879760742188, "logps/rejected": -1408.772705078125, "loss": 0.042, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15062114596366882, "rewards/margins": 0.3381795883178711, "rewards/rejected": -0.48880070447921753, "step": 340 }, { "epoch": 0.07, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.8544213771820068, "logits/rejected": -1.0226833820343018, "logps/chosen": -607.8839111328125, "logps/rejected": -1246.36083984375, "loss": 0.0648, "rewards/accuracies": 0.875, "rewards/chosen": -0.11450695991516113, "rewards/margins": 0.267674058675766, "rewards/rejected": -0.3821810185909271, "step": 350 }, { "epoch": 0.07, "learning_rate": 3.428571428571429e-06, "logits/chosen": -1.7216541767120361, "logits/rejected": -0.9856483340263367, "logps/chosen": -749.8733520507812, "logps/rejected": -1292.38037109375, "loss": 0.068, "rewards/accuracies": 0.875, "rewards/chosen": -0.231466606259346, "rewards/margins": 0.22745048999786377, "rewards/rejected": -0.4589170813560486, "step": 360 }, { "epoch": 0.07, "learning_rate": 3.523809523809524e-06, "logits/chosen": -1.7551990747451782, "logits/rejected": -1.1457812786102295, "logps/chosen": -781.4153442382812, "logps/rejected": -1313.230224609375, "loss": 0.0727, "rewards/accuracies": 0.875, "rewards/chosen": -0.24709895253181458, "rewards/margins": 0.2493913173675537, "rewards/rejected": -0.4964902400970459, "step": 370 }, { "epoch": 0.07, "learning_rate": 3.6190476190476194e-06, "logits/chosen": -1.7148735523223877, "logits/rejected": -1.0211610794067383, "logps/chosen": -636.7725830078125, "logps/rejected": -1297.733154296875, "loss": 0.0641, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2014390230178833, "rewards/margins": 0.2895970940589905, "rewards/rejected": -0.4910360872745514, "step": 380 }, { "epoch": 0.07, "learning_rate": 3.7142857142857146e-06, "logits/chosen": -1.7809518575668335, "logits/rejected": -0.9582511782646179, "logps/chosen": -719.4249267578125, "logps/rejected": -1336.9053955078125, "loss": 0.0912, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.26866966485977173, "rewards/margins": 0.27918726205825806, "rewards/rejected": -0.5478569269180298, "step": 390 }, { "epoch": 0.08, "learning_rate": 3.80952380952381e-06, "logits/chosen": -1.4761449098587036, "logits/rejected": -1.0101737976074219, "logps/chosen": -658.9013671875, "logps/rejected": -1370.678466796875, "loss": 0.0867, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.256266325712204, "rewards/margins": 0.3092409670352936, "rewards/rejected": -0.5655072927474976, "step": 400 }, { "epoch": 0.08, "learning_rate": 3.9047619047619055e-06, "logits/chosen": -1.4691818952560425, "logits/rejected": -1.0662410259246826, "logps/chosen": -654.7586669921875, "logps/rejected": -1156.6165771484375, "loss": 0.1032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23384161293506622, "rewards/margins": 0.2041410505771637, "rewards/rejected": -0.4379826486110687, "step": 410 }, { "epoch": 0.08, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.5862969160079956, "logits/rejected": -1.1011335849761963, "logps/chosen": -621.4929809570312, "logps/rejected": -1237.9970703125, "loss": 0.0978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2105666697025299, "rewards/margins": 0.2511122226715088, "rewards/rejected": -0.4616789221763611, "step": 420 }, { "epoch": 0.08, "learning_rate": 4.095238095238096e-06, "logits/chosen": -1.661075234413147, "logits/rejected": -1.0312578678131104, "logps/chosen": -711.7174682617188, "logps/rejected": -1414.0506591796875, "loss": 0.0657, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22503916919231415, "rewards/margins": 0.27457931637763977, "rewards/rejected": -0.4996185302734375, "step": 430 }, { "epoch": 0.08, "learning_rate": 4.190476190476191e-06, "logits/chosen": -1.6692326068878174, "logits/rejected": -0.7681063413619995, "logps/chosen": -756.2827758789062, "logps/rejected": -1376.8885498046875, "loss": 0.0962, "rewards/accuracies": 0.875, "rewards/chosen": -0.22421880066394806, "rewards/margins": 0.2793051600456238, "rewards/rejected": -0.5035240054130554, "step": 440 }, { "epoch": 0.09, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -1.5787713527679443, "logits/rejected": -1.0791809558868408, "logps/chosen": -604.9113159179688, "logps/rejected": -1252.309814453125, "loss": 0.0779, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1460736095905304, "rewards/margins": 0.26809030771255493, "rewards/rejected": -0.4141639769077301, "step": 450 }, { "epoch": 0.09, "learning_rate": 4.3809523809523815e-06, "logits/chosen": -1.7312390804290771, "logits/rejected": -0.9138051867485046, "logps/chosen": -715.1556396484375, "logps/rejected": -1347.625, "loss": 0.0806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20960795879364014, "rewards/margins": 0.2895205020904541, "rewards/rejected": -0.49912840127944946, "step": 460 }, { "epoch": 0.09, "learning_rate": 4.476190476190477e-06, "logits/chosen": -1.5289274454116821, "logits/rejected": -1.2004892826080322, "logps/chosen": -628.0669555664062, "logps/rejected": -1309.748779296875, "loss": 0.0994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1908506602048874, "rewards/margins": 0.2644248604774475, "rewards/rejected": -0.4552755355834961, "step": 470 }, { "epoch": 0.09, "learning_rate": 4.571428571428572e-06, "logits/chosen": -1.7772254943847656, "logits/rejected": -1.2134960889816284, "logps/chosen": -578.4381103515625, "logps/rejected": -1015.9026489257812, "loss": 0.1204, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12097190320491791, "rewards/margins": 0.13818739354610443, "rewards/rejected": -0.25915926694869995, "step": 480 }, { "epoch": 0.09, "learning_rate": 4.666666666666667e-06, "logits/chosen": -1.963395357131958, "logits/rejected": -1.1699910163879395, "logps/chosen": -568.7506103515625, "logps/rejected": -1148.7525634765625, "loss": 0.0884, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12507882714271545, "rewards/margins": 0.23613107204437256, "rewards/rejected": -0.3612098693847656, "step": 490 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": -1.768434762954712, "logits/rejected": -1.1358940601348877, "logps/chosen": -684.4830322265625, "logps/rejected": -1343.7208251953125, "loss": 0.1135, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18173658847808838, "rewards/margins": 0.26491695642471313, "rewards/rejected": -0.4466535449028015, "step": 500 }, { "epoch": 0.1, "learning_rate": 4.857142857142858e-06, "logits/chosen": -1.6421762704849243, "logits/rejected": -0.9460729360580444, "logps/chosen": -699.533935546875, "logps/rejected": -1446.2193603515625, "loss": 0.0739, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.24191920459270477, "rewards/margins": 0.28883248567581177, "rewards/rejected": -0.530751645565033, "step": 510 }, { "epoch": 0.1, "learning_rate": 4.952380952380953e-06, "logits/chosen": -1.6789777278900146, "logits/rejected": -1.0460736751556396, "logps/chosen": -814.0193481445312, "logps/rejected": -1421.7989501953125, "loss": 0.0817, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2921529710292816, "rewards/margins": 0.2904835343360901, "rewards/rejected": -0.5826364755630493, "step": 520 }, { "epoch": 0.1, "learning_rate": 4.999986185163754e-06, "logits/chosen": -1.9284679889678955, "logits/rejected": -1.4057942628860474, "logps/chosen": -558.6351318359375, "logps/rejected": -1179.9552001953125, "loss": 0.1042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15910789370536804, "rewards/margins": 0.24427099525928497, "rewards/rejected": -0.4033789038658142, "step": 530 }, { "epoch": 0.1, "learning_rate": 4.999875667389858e-06, "logits/chosen": -1.66916823387146, "logits/rejected": -1.0905592441558838, "logps/chosen": -606.5189208984375, "logps/rejected": -1062.45947265625, "loss": 0.1271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12606561183929443, "rewards/margins": 0.18360337615013123, "rewards/rejected": -0.30966901779174805, "step": 540 }, { "epoch": 0.1, "learning_rate": 4.999654636727765e-06, "logits/chosen": -1.606479287147522, "logits/rejected": -1.17953622341156, "logps/chosen": -577.0275268554688, "logps/rejected": -1118.5433349609375, "loss": 0.0908, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1509220004081726, "rewards/margins": 0.21017400920391083, "rewards/rejected": -0.36109599471092224, "step": 550 }, { "epoch": 0.11, "learning_rate": 4.999323102948655e-06, "logits/chosen": -1.8047797679901123, "logits/rejected": -1.2963651418685913, "logps/chosen": -632.2808837890625, "logps/rejected": -1058.11083984375, "loss": 0.0952, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16345271468162537, "rewards/margins": 0.18198499083518982, "rewards/rejected": -0.3454377055168152, "step": 560 }, { "epoch": 0.11, "learning_rate": 4.998881080708759e-06, "logits/chosen": -1.8658708333969116, "logits/rejected": -1.3321269750595093, "logps/chosen": -554.2672119140625, "logps/rejected": -1096.3828125, "loss": 0.0875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14818139374256134, "rewards/margins": 0.24486231803894043, "rewards/rejected": -0.3930436968803406, "step": 570 }, { "epoch": 0.11, "learning_rate": 4.998328589548711e-06, "logits/chosen": -1.7779667377471924, "logits/rejected": -1.069665551185608, "logps/chosen": -675.4833984375, "logps/rejected": -1202.579833984375, "loss": 0.1088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17311152815818787, "rewards/margins": 0.23328053951263428, "rewards/rejected": -0.40639209747314453, "step": 580 }, { "epoch": 0.11, "learning_rate": 4.997665653892682e-06, "logits/chosen": -1.888587236404419, "logits/rejected": -1.1475058794021606, "logps/chosen": -627.3590698242188, "logps/rejected": -1246.740478515625, "loss": 0.0582, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17301705479621887, "rewards/margins": 0.28004103899002075, "rewards/rejected": -0.4530580937862396, "step": 590 }, { "epoch": 0.11, "learning_rate": 4.996892303047306e-06, "logits/chosen": -1.7172037363052368, "logits/rejected": -1.1663614511489868, "logps/chosen": -652.0533447265625, "logps/rejected": -1193.7652587890625, "loss": 0.0975, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2103462666273117, "rewards/margins": 0.2457568347454071, "rewards/rejected": -0.4561030864715576, "step": 600 }, { "epoch": 0.12, "learning_rate": 4.996008571200375e-06, "logits/chosen": -1.6690038442611694, "logits/rejected": -1.0809996128082275, "logps/chosen": -702.9678955078125, "logps/rejected": -1331.991455078125, "loss": 0.0731, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1795978993177414, "rewards/margins": 0.2755267918109894, "rewards/rejected": -0.4551246762275696, "step": 610 }, { "epoch": 0.12, "learning_rate": 4.995014497419336e-06, "logits/chosen": -1.9420398473739624, "logits/rejected": -1.0578997135162354, "logps/chosen": -653.03759765625, "logps/rejected": -1236.6177978515625, "loss": 0.0922, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15195727348327637, "rewards/margins": 0.25116264820098877, "rewards/rejected": -0.4031199514865875, "step": 620 }, { "epoch": 0.12, "learning_rate": 4.993910125649561e-06, "logits/chosen": -1.5005296468734741, "logits/rejected": -1.0164679288864136, "logps/chosen": -615.0857543945312, "logps/rejected": -1095.970703125, "loss": 0.1492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19480828940868378, "rewards/margins": 0.16079875826835632, "rewards/rejected": -0.3556070625782013, "step": 630 }, { "epoch": 0.12, "learning_rate": 4.992695504712402e-06, "logits/chosen": -1.7464653253555298, "logits/rejected": -1.2301980257034302, "logps/chosen": -601.227783203125, "logps/rejected": -1081.1474609375, "loss": 0.1075, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12417513132095337, "rewards/margins": 0.20312102138996124, "rewards/rejected": -0.3272961378097534, "step": 640 }, { "epoch": 0.12, "learning_rate": 4.9913706883030385e-06, "logits/chosen": -1.81307053565979, "logits/rejected": -1.2394088506698608, "logps/chosen": -603.0763549804688, "logps/rejected": -1259.51318359375, "loss": 0.0631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1690419614315033, "rewards/margins": 0.2731035649776459, "rewards/rejected": -0.44214552640914917, "step": 650 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -1.7271366119384766, "logits/rejected": -1.0474259853363037, "logps/chosen": -673.4783325195312, "logps/rejected": -1239.798095703125, "loss": 0.0771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21036569774150848, "rewards/margins": 0.2717295289039612, "rewards/rejected": -0.48209524154663086, "step": 660 }, { "epoch": 0.13, "learning_rate": 4.988390708203068e-06, "logits/chosen": -1.5817875862121582, "logits/rejected": -1.0434249639511108, "logps/chosen": -648.2857666015625, "logps/rejected": -1361.281982421875, "loss": 0.0946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22290560603141785, "rewards/margins": 0.29205870628356934, "rewards/rejected": -0.5149643421173096, "step": 670 }, { "epoch": 0.13, "learning_rate": 4.9867356762494955e-06, "logits/chosen": -1.653638243675232, "logits/rejected": -0.9126855731010437, "logps/chosen": -761.7947998046875, "logps/rejected": -1445.9437255859375, "loss": 0.0596, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2962535321712494, "rewards/margins": 0.299728125333786, "rewards/rejected": -0.5959817171096802, "step": 680 }, { "epoch": 0.13, "learning_rate": 4.984970712291963e-06, "logits/chosen": -1.6935688257217407, "logits/rejected": -0.995235800743103, "logps/chosen": -836.0851440429688, "logps/rejected": -1352.14892578125, "loss": 0.1075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.33499711751937866, "rewards/margins": 0.24044211208820343, "rewards/rejected": -0.5754392743110657, "step": 690 }, { "epoch": 0.13, "learning_rate": 4.983095894354858e-06, "logits/chosen": -1.7480075359344482, "logits/rejected": -1.0178004503250122, "logps/chosen": -839.4945068359375, "logps/rejected": -1367.689697265625, "loss": 0.0708, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.285785973072052, "rewards/margins": 0.2461218386888504, "rewards/rejected": -0.5319077372550964, "step": 700 }, { "epoch": 0.14, "learning_rate": 4.981111305318918e-06, "logits/chosen": -1.6903988122940063, "logits/rejected": -1.0864533185958862, "logps/chosen": -639.9157104492188, "logps/rejected": -1278.444580078125, "loss": 0.1003, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18382251262664795, "rewards/margins": 0.2932285666465759, "rewards/rejected": -0.4770510792732239, "step": 710 }, { "epoch": 0.14, "learning_rate": 4.979017032917576e-06, "logits/chosen": -1.6954967975616455, "logits/rejected": -1.1473253965377808, "logps/chosen": -663.8118896484375, "logps/rejected": -1232.072509765625, "loss": 0.0869, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16720175743103027, "rewards/margins": 0.2587115168571472, "rewards/rejected": -0.4259132444858551, "step": 720 }, { "epoch": 0.14, "learning_rate": 4.97681316973307e-06, "logits/chosen": -1.6922680139541626, "logits/rejected": -1.0211702585220337, "logps/chosen": -684.892333984375, "logps/rejected": -1208.362060546875, "loss": 0.0882, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18825049698352814, "rewards/margins": 0.23126792907714844, "rewards/rejected": -0.4195184111595154, "step": 730 }, { "epoch": 0.14, "learning_rate": 4.9744998131923625e-06, "logits/chosen": -1.9704052209854126, "logits/rejected": -1.2731428146362305, "logps/chosen": -636.3994140625, "logps/rejected": -1263.0999755859375, "loss": 0.0978, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14151528477668762, "rewards/margins": 0.2585150897502899, "rewards/rejected": -0.4000304341316223, "step": 740 }, { "epoch": 0.14, "learning_rate": 4.9720770655628216e-06, "logits/chosen": -1.7848608493804932, "logits/rejected": -1.0659257173538208, "logps/chosen": -657.3236083984375, "logps/rejected": -1317.057373046875, "loss": 0.0631, "rewards/accuracies": 0.875, "rewards/chosen": -0.15722636878490448, "rewards/margins": 0.2890471816062927, "rewards/rejected": -0.4462736248970032, "step": 750 }, { "epoch": 0.14, "learning_rate": 4.969545033947711e-06, "logits/chosen": -1.6175647974014282, "logits/rejected": -1.0041881799697876, "logps/chosen": -588.1198120117188, "logps/rejected": -1212.6221923828125, "loss": 0.0978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.145894855260849, "rewards/margins": 0.26582056283950806, "rewards/rejected": -0.41171541810035706, "step": 760 }, { "epoch": 0.15, "learning_rate": 4.966903830281449e-06, "logits/chosen": -1.6709251403808594, "logits/rejected": -1.0464791059494019, "logps/chosen": -646.2835693359375, "logps/rejected": -1256.691650390625, "loss": 0.0764, "rewards/accuracies": 0.875, "rewards/chosen": -0.12737174332141876, "rewards/margins": 0.2582496106624603, "rewards/rejected": -0.3856213688850403, "step": 770 }, { "epoch": 0.15, "learning_rate": 4.964153571324658e-06, "logits/chosen": -1.998659372329712, "logits/rejected": -1.257143259048462, "logps/chosen": -553.2044677734375, "logps/rejected": -1021.4732666015625, "loss": 0.0801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10653400421142578, "rewards/margins": 0.22628924250602722, "rewards/rejected": -0.3328232169151306, "step": 780 }, { "epoch": 0.15, "learning_rate": 4.96129437865901e-06, "logits/chosen": -1.7126219272613525, "logits/rejected": -1.3472967147827148, "logps/chosen": -638.3450927734375, "logps/rejected": -1300.0621337890625, "loss": 0.0974, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18389222025871277, "rewards/margins": 0.24962754547595978, "rewards/rejected": -0.43351978063583374, "step": 790 }, { "epoch": 0.15, "learning_rate": 4.958326378681849e-06, "logits/chosen": -1.5197988748550415, "logits/rejected": -1.0763747692108154, "logps/chosen": -741.8438720703125, "logps/rejected": -1362.937744140625, "loss": 0.0525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2865040600299835, "rewards/margins": 0.27982592582702637, "rewards/rejected": -0.5663300156593323, "step": 800 }, { "epoch": 0.15, "learning_rate": 4.955249702600598e-06, "logits/chosen": -1.8825486898422241, "logits/rejected": -0.9662033319473267, "logps/chosen": -796.578857421875, "logps/rejected": -1616.077880859375, "loss": 0.0481, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3106931746006012, "rewards/margins": 0.35244977474212646, "rewards/rejected": -0.66314297914505, "step": 810 }, { "epoch": 0.16, "learning_rate": 4.952064486426965e-06, "logits/chosen": -1.5078961849212646, "logits/rejected": -0.939461886882782, "logps/chosen": -729.2574462890625, "logps/rejected": -1372.853759765625, "loss": 0.0811, "rewards/accuracies": 0.875, "rewards/chosen": -0.24250641465187073, "rewards/margins": 0.2867491841316223, "rewards/rejected": -0.5292556285858154, "step": 820 }, { "epoch": 0.16, "learning_rate": 4.948770870970929e-06, "logits/chosen": -1.7515017986297607, "logits/rejected": -1.2939387559890747, "logps/chosen": -522.4594116210938, "logps/rejected": -1244.282958984375, "loss": 0.0794, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1327625960111618, "rewards/margins": 0.30878084897994995, "rewards/rejected": -0.44154348969459534, "step": 830 }, { "epoch": 0.16, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -1.435917854309082, "logits/rejected": -0.7938503623008728, "logps/chosen": -638.5426635742188, "logps/rejected": -1297.34521484375, "loss": 0.0819, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.22242049872875214, "rewards/margins": 0.2777343988418579, "rewards/rejected": -0.5001549124717712, "step": 840 }, { "epoch": 0.16, "learning_rate": 4.941859029405354e-06, "logits/chosen": -1.5241379737854004, "logits/rejected": -0.9835844039916992, "logps/chosen": -676.1991577148438, "logps/rejected": -1275.102294921875, "loss": 0.09, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2034449577331543, "rewards/margins": 0.26251763105392456, "rewards/rejected": -0.46596255898475647, "step": 850 }, { "epoch": 0.16, "learning_rate": 4.938241108850039e-06, "logits/chosen": -1.6833593845367432, "logits/rejected": -1.1207275390625, "logps/chosen": -641.3517456054688, "logps/rejected": -1251.5594482421875, "loss": 0.0755, "rewards/accuracies": 0.875, "rewards/chosen": -0.18910585343837738, "rewards/margins": 0.26478061079978943, "rewards/rejected": -0.4538864493370056, "step": 860 }, { "epoch": 0.17, "learning_rate": 4.934515400107266e-06, "logits/chosen": -1.549037218093872, "logits/rejected": -1.1462652683258057, "logps/chosen": -786.075927734375, "logps/rejected": -1397.09619140625, "loss": 0.0853, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2606995105743408, "rewards/margins": 0.2590792179107666, "rewards/rejected": -0.5197787284851074, "step": 870 }, { "epoch": 0.17, "learning_rate": 4.930682067880759e-06, "logits/chosen": -1.7059816122055054, "logits/rejected": -1.009918212890625, "logps/chosen": -720.1533813476562, "logps/rejected": -1230.998779296875, "loss": 0.0719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2683635354042053, "rewards/margins": 0.2336823046207428, "rewards/rejected": -0.5020458698272705, "step": 880 }, { "epoch": 0.17, "learning_rate": 4.926741281631991e-06, "logits/chosen": -1.4708434343338013, "logits/rejected": -1.0346577167510986, "logps/chosen": -668.7374267578125, "logps/rejected": -1247.803955078125, "loss": 0.106, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.25595933198928833, "rewards/margins": 0.2600180506706238, "rewards/rejected": -0.5159772634506226, "step": 890 }, { "epoch": 0.17, "learning_rate": 4.922693215572695e-06, "logits/chosen": -1.2816441059112549, "logits/rejected": -0.8067164421081543, "logps/chosen": -746.5218505859375, "logps/rejected": -1298.2572021484375, "loss": 0.1129, "rewards/accuracies": 0.75, "rewards/chosen": -0.2702776789665222, "rewards/margins": 0.24418941140174866, "rewards/rejected": -0.5144670605659485, "step": 900 }, { "epoch": 0.17, "learning_rate": 4.91853804865716e-06, "logits/chosen": -1.8930604457855225, "logits/rejected": -0.9798396825790405, "logps/chosen": -685.3873291015625, "logps/rejected": -1278.208984375, "loss": 0.0875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18094095587730408, "rewards/margins": 0.2873077392578125, "rewards/rejected": -0.4682486951351166, "step": 910 }, { "epoch": 0.18, "learning_rate": 4.91427596457432e-06, "logits/chosen": -1.7996612787246704, "logits/rejected": -1.2704670429229736, "logps/chosen": -548.4567260742188, "logps/rejected": -1128.166259765625, "loss": 0.1133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14571422338485718, "rewards/margins": 0.24841785430908203, "rewards/rejected": -0.39413201808929443, "step": 920 }, { "epoch": 0.18, "learning_rate": 4.909907151739634e-06, "logits/chosen": -1.5436075925827026, "logits/rejected": -1.099595308303833, "logps/chosen": -691.2551879882812, "logps/rejected": -1293.091552734375, "loss": 0.0535, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2075759917497635, "rewards/margins": 0.2663406729698181, "rewards/rejected": -0.4739166796207428, "step": 930 }, { "epoch": 0.18, "learning_rate": 4.905431803286756e-06, "logits/chosen": -1.9107078313827515, "logits/rejected": -0.9111030697822571, "logps/chosen": -640.4327392578125, "logps/rejected": -1321.55859375, "loss": 0.0529, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1775406301021576, "rewards/margins": 0.2937074601650238, "rewards/rejected": -0.4712480902671814, "step": 940 }, { "epoch": 0.18, "learning_rate": 4.900850117059e-06, "logits/chosen": -1.5877889394760132, "logits/rejected": -0.9842106103897095, "logps/chosen": -651.833984375, "logps/rejected": -1406.99755859375, "loss": 0.0446, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20326288044452667, "rewards/margins": 0.3253769278526306, "rewards/rejected": -0.5286397933959961, "step": 950 }, { "epoch": 0.18, "learning_rate": 4.8961622956005895e-06, "logits/chosen": -1.8441241979599, "logits/rejected": -1.0815012454986572, "logps/chosen": -683.8507690429688, "logps/rejected": -1448.83251953125, "loss": 0.0413, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2018641233444214, "rewards/margins": 0.36305880546569824, "rewards/rejected": -0.5649229288101196, "step": 960 }, { "epoch": 0.18, "learning_rate": 4.891368546147707e-06, "logits/chosen": -1.7532212734222412, "logits/rejected": -1.226049780845642, "logps/chosen": -605.4937744140625, "logps/rejected": -1278.6380615234375, "loss": 0.0747, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18908478319644928, "rewards/margins": 0.3011362552642822, "rewards/rejected": -0.4902211129665375, "step": 970 }, { "epoch": 0.19, "learning_rate": 4.88646908061933e-06, "logits/chosen": -1.7473132610321045, "logits/rejected": -0.987618088722229, "logps/chosen": -714.2844848632812, "logps/rejected": -1421.032470703125, "loss": 0.0738, "rewards/accuracies": 0.875, "rewards/chosen": -0.24226772785186768, "rewards/margins": 0.3205004930496216, "rewards/rejected": -0.5627682209014893, "step": 980 }, { "epoch": 0.19, "learning_rate": 4.881464115607866e-06, "logits/chosen": -1.8514864444732666, "logits/rejected": -1.102475643157959, "logps/chosen": -690.0552368164062, "logps/rejected": -1299.9759521484375, "loss": 0.0803, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.22629126906394958, "rewards/margins": 0.30736809968948364, "rewards/rejected": -0.5336593985557556, "step": 990 }, { "epoch": 0.19, "learning_rate": 4.876353872369573e-06, "logits/chosen": -1.8886613845825195, "logits/rejected": -1.1019657850265503, "logps/chosen": -635.8572998046875, "logps/rejected": -1302.39453125, "loss": 0.063, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17442987859249115, "rewards/margins": 0.3282366394996643, "rewards/rejected": -0.5026665329933167, "step": 1000 }, { "epoch": 0.19, "learning_rate": 4.871138576814782e-06, "logits/chosen": -1.7834192514419556, "logits/rejected": -1.2944515943527222, "logps/chosen": -820.8410034179688, "logps/rejected": -1417.619873046875, "loss": 0.0897, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2722592055797577, "rewards/margins": 0.28066331148147583, "rewards/rejected": -0.5529226064682007, "step": 1010 }, { "epoch": 0.19, "learning_rate": 4.865818459497911e-06, "logits/chosen": -1.4951821565628052, "logits/rejected": -0.9517828822135925, "logps/chosen": -659.3931884765625, "logps/rejected": -1239.884033203125, "loss": 0.0906, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22253477573394775, "rewards/margins": 0.25982776284217834, "rewards/rejected": -0.4823625683784485, "step": 1020 }, { "epoch": 0.2, "learning_rate": 4.860393755607266e-06, "logits/chosen": -1.7842209339141846, "logits/rejected": -1.2502198219299316, "logps/chosen": -612.7080078125, "logps/rejected": -1303.169189453125, "loss": 0.0567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.16437311470508575, "rewards/margins": 0.3025432229042053, "rewards/rejected": -0.46691638231277466, "step": 1030 }, { "epoch": 0.2, "learning_rate": 4.854864704954654e-06, "logits/chosen": -1.4791333675384521, "logits/rejected": -0.8677660822868347, "logps/chosen": -597.54296875, "logps/rejected": -1209.46435546875, "loss": 0.0787, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18261802196502686, "rewards/margins": 0.2735896706581116, "rewards/rejected": -0.4562076926231384, "step": 1040 }, { "epoch": 0.2, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.5059902667999268, "logits/rejected": -1.0160598754882812, "logps/chosen": -761.7801513671875, "logps/rejected": -1293.54248046875, "loss": 0.0912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.255204439163208, "rewards/margins": 0.23546037077903748, "rewards/rejected": -0.4906648099422455, "step": 1050 }, { "epoch": 0.2, "learning_rate": 4.843494545664407e-06, "logits/chosen": -1.6067050695419312, "logits/rejected": -1.233272910118103, "logps/chosen": -522.5372314453125, "logps/rejected": -1160.08642578125, "loss": 0.0894, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14243915677070618, "rewards/margins": 0.2808658480644226, "rewards/rejected": -0.4233049750328064, "step": 1060 }, { "epoch": 0.2, "learning_rate": 4.837653939671427e-06, "logits/chosen": -1.5032033920288086, "logits/rejected": -0.8692194819450378, "logps/chosen": -603.5737915039062, "logps/rejected": -1262.277099609375, "loss": 0.0768, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19397929310798645, "rewards/margins": 0.2933715283870697, "rewards/rejected": -0.48735085129737854, "step": 1070 }, { "epoch": 0.21, "learning_rate": 4.8317099921835695e-06, "logits/chosen": -1.7000210285186768, "logits/rejected": -0.9161543846130371, "logps/chosen": -673.3897705078125, "logps/rejected": -1238.2972412109375, "loss": 0.0756, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18038219213485718, "rewards/margins": 0.2853950262069702, "rewards/rejected": -0.4657772183418274, "step": 1080 }, { "epoch": 0.21, "learning_rate": 4.825662965967023e-06, "logits/chosen": -1.2642765045166016, "logits/rejected": -1.0610508918762207, "logps/chosen": -568.6837768554688, "logps/rejected": -1316.013427734375, "loss": 0.086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1493438184261322, "rewards/margins": 0.304331511259079, "rewards/rejected": -0.4536752700805664, "step": 1090 }, { "epoch": 0.21, "learning_rate": 4.819513128344814e-06, "logits/chosen": -1.4977022409439087, "logits/rejected": -0.9430916905403137, "logps/chosen": -771.9939575195312, "logps/rejected": -1390.864013671875, "loss": 0.0744, "rewards/accuracies": 0.875, "rewards/chosen": -0.2511390447616577, "rewards/margins": 0.27243170142173767, "rewards/rejected": -0.5235707759857178, "step": 1100 }, { "epoch": 0.21, "learning_rate": 4.813260751184992e-06, "logits/chosen": -1.6140292882919312, "logits/rejected": -1.1604433059692383, "logps/chosen": -611.8510131835938, "logps/rejected": -1271.919677734375, "loss": 0.0816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.23755709826946259, "rewards/margins": 0.29791098833084106, "rewards/rejected": -0.5354681015014648, "step": 1110 }, { "epoch": 0.21, "learning_rate": 4.806906110888606e-06, "logits/chosen": -1.57673180103302, "logits/rejected": -0.8760968446731567, "logps/chosen": -773.2528076171875, "logps/rejected": -1487.0355224609375, "loss": 0.0695, "rewards/accuracies": 0.875, "rewards/chosen": -0.26283028721809387, "rewards/margins": 0.34201639890670776, "rewards/rejected": -0.604846715927124, "step": 1120 }, { "epoch": 0.22, "learning_rate": 4.8004494883774885e-06, "logits/chosen": -1.4768116474151611, "logits/rejected": -1.0381088256835938, "logps/chosen": -592.5392456054688, "logps/rejected": -1219.542724609375, "loss": 0.0907, "rewards/accuracies": 0.75, "rewards/chosen": -0.1403500735759735, "rewards/margins": 0.27375248074531555, "rewards/rejected": -0.41410255432128906, "step": 1130 }, { "epoch": 0.22, "learning_rate": 4.793891169081835e-06, "logits/chosen": -1.7134135961532593, "logits/rejected": -1.066749095916748, "logps/chosen": -520.9646606445312, "logps/rejected": -1094.593994140625, "loss": 0.1118, "rewards/accuracies": 0.875, "rewards/chosen": -0.056602220982313156, "rewards/margins": 0.25242939591407776, "rewards/rejected": -0.30903160572052, "step": 1140 }, { "epoch": 0.22, "learning_rate": 4.787231442927587e-06, "logits/chosen": -1.4377086162567139, "logits/rejected": -0.8465448617935181, "logps/chosen": -630.1041259765625, "logps/rejected": -1104.6734619140625, "loss": 0.094, "rewards/accuracies": 0.75, "rewards/chosen": -0.15882766246795654, "rewards/margins": 0.21694330871105194, "rewards/rejected": -0.3757709264755249, "step": 1150 }, { "epoch": 0.22, "learning_rate": 4.780470604323616e-06, "logits/chosen": -1.6286237239837646, "logits/rejected": -0.8310405611991882, "logps/chosen": -762.2435302734375, "logps/rejected": -1267.188232421875, "loss": 0.0882, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2872321307659149, "rewards/margins": 0.22836999595165253, "rewards/rejected": -0.5156021118164062, "step": 1160 }, { "epoch": 0.22, "learning_rate": 4.773608952148706e-06, "logits/chosen": -1.4241678714752197, "logits/rejected": -1.0249742269515991, "logps/chosen": -659.6388549804688, "logps/rejected": -1119.01904296875, "loss": 0.1083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1795882135629654, "rewards/margins": 0.19241158664226532, "rewards/rejected": -0.3719998002052307, "step": 1170 }, { "epoch": 0.22, "learning_rate": 4.766646789738342e-06, "logits/chosen": -1.4974104166030884, "logits/rejected": -1.0342199802398682, "logps/chosen": -511.42803955078125, "logps/rejected": -1142.3897705078125, "loss": 0.0633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.128463476896286, "rewards/margins": 0.25377964973449707, "rewards/rejected": -0.3822430968284607, "step": 1180 }, { "epoch": 0.23, "learning_rate": 4.759584424871302e-06, "logits/chosen": -1.4972057342529297, "logits/rejected": -1.0252101421356201, "logps/chosen": -576.6381225585938, "logps/rejected": -1152.332763671875, "loss": 0.1047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16848218441009521, "rewards/margins": 0.2421194612979889, "rewards/rejected": -0.4106016755104065, "step": 1190 }, { "epoch": 0.23, "learning_rate": 4.752422169756048e-06, "logits/chosen": -1.483394980430603, "logits/rejected": -1.089908242225647, "logps/chosen": -659.2501831054688, "logps/rejected": -1330.499267578125, "loss": 0.0568, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18682973086833954, "rewards/margins": 0.303545743227005, "rewards/rejected": -0.49037545919418335, "step": 1200 }, { "epoch": 0.23, "learning_rate": 4.745160341016927e-06, "logits/chosen": -1.725095510482788, "logits/rejected": -1.0486876964569092, "logps/chosen": -745.1365356445312, "logps/rejected": -1397.880859375, "loss": 0.0587, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.23421664535999298, "rewards/margins": 0.3120633661746979, "rewards/rejected": -0.546280026435852, "step": 1210 }, { "epoch": 0.23, "learning_rate": 4.737799259680172e-06, "logits/chosen": -1.8329044580459595, "logits/rejected": -1.0362406969070435, "logps/chosen": -665.9381103515625, "logps/rejected": -1365.664306640625, "loss": 0.0595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1887119710445404, "rewards/margins": 0.34067028760910034, "rewards/rejected": -0.5293822288513184, "step": 1220 }, { "epoch": 0.23, "learning_rate": 4.730339251159709e-06, "logits/chosen": -1.4984453916549683, "logits/rejected": -0.9244080781936646, "logps/chosen": -627.0536499023438, "logps/rejected": -1200.630615234375, "loss": 0.076, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16129754483699799, "rewards/margins": 0.27342361211776733, "rewards/rejected": -0.4347211718559265, "step": 1230 }, { "epoch": 0.24, "learning_rate": 4.722780645242775e-06, "logits/chosen": -1.8025153875350952, "logits/rejected": -1.0183075666427612, "logps/chosen": -678.0471801757812, "logps/rejected": -1255.756591796875, "loss": 0.073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18587812781333923, "rewards/margins": 0.2857830822467804, "rewards/rejected": -0.47166118025779724, "step": 1240 }, { "epoch": 0.24, "learning_rate": 4.715123776075337e-06, "logits/chosen": -1.7555253505706787, "logits/rejected": -1.0257951021194458, "logps/chosen": -708.4110107421875, "logps/rejected": -1359.506103515625, "loss": 0.0791, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2188255488872528, "rewards/margins": 0.30667853355407715, "rewards/rejected": -0.5255040526390076, "step": 1250 }, { "epoch": 0.24, "learning_rate": 4.707368982147318e-06, "logits/chosen": -1.6728156805038452, "logits/rejected": -0.9375909566879272, "logps/chosen": -703.1410522460938, "logps/rejected": -1263.3475341796875, "loss": 0.0895, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21338506042957306, "rewards/margins": 0.2970745265483856, "rewards/rejected": -0.5104595422744751, "step": 1260 }, { "epoch": 0.24, "learning_rate": 4.699516606277638e-06, "logits/chosen": -1.8740787506103516, "logits/rejected": -1.362985610961914, "logps/chosen": -728.4583129882812, "logps/rejected": -1369.810302734375, "loss": 0.0746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.23265771567821503, "rewards/margins": 0.2759070098400116, "rewards/rejected": -0.5085647702217102, "step": 1270 }, { "epoch": 0.24, "learning_rate": 4.691566995599056e-06, "logits/chosen": -1.7465486526489258, "logits/rejected": -0.9074887037277222, "logps/chosen": -530.6092529296875, "logps/rejected": -1085.4388427734375, "loss": 0.074, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1201125830411911, "rewards/margins": 0.26156720519065857, "rewards/rejected": -0.38167980313301086, "step": 1280 }, { "epoch": 0.25, "learning_rate": 4.683520501542825e-06, "logits/chosen": -1.7438312768936157, "logits/rejected": -1.0178749561309814, "logps/chosen": -585.673583984375, "logps/rejected": -1169.25732421875, "loss": 0.0714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14680668711662292, "rewards/margins": 0.25362518429756165, "rewards/rejected": -0.40043187141418457, "step": 1290 }, { "epoch": 0.25, "learning_rate": 4.675377479823153e-06, "logits/chosen": -1.4712369441986084, "logits/rejected": -1.0751581192016602, "logps/chosen": -608.4730834960938, "logps/rejected": -1213.798095703125, "loss": 0.0928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16733074188232422, "rewards/margins": 0.2824379503726959, "rewards/rejected": -0.44976872205734253, "step": 1300 }, { "epoch": 0.25, "learning_rate": 4.667138290421483e-06, "logits/chosen": -1.4218109846115112, "logits/rejected": -1.0891635417938232, "logps/chosen": -546.42236328125, "logps/rejected": -1120.02734375, "loss": 0.0776, "rewards/accuracies": 0.75, "rewards/chosen": -0.17138846218585968, "rewards/margins": 0.25035151839256287, "rewards/rejected": -0.42173999547958374, "step": 1310 }, { "epoch": 0.25, "learning_rate": 4.658803297570578e-06, "logits/chosen": -1.5527435541152954, "logits/rejected": -0.772042989730835, "logps/chosen": -707.126708984375, "logps/rejected": -1424.598876953125, "loss": 0.0438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.19388630986213684, "rewards/margins": 0.3253108859062195, "rewards/rejected": -0.5191971063613892, "step": 1320 }, { "epoch": 0.25, "learning_rate": 4.650372869738415e-06, "logits/chosen": -1.4559705257415771, "logits/rejected": -0.9750394821166992, "logps/chosen": -564.2633056640625, "logps/rejected": -1231.0230712890625, "loss": 0.0747, "rewards/accuracies": 0.875, "rewards/chosen": -0.16324150562286377, "rewards/margins": 0.28780093789100647, "rewards/rejected": -0.45104241371154785, "step": 1330 }, { "epoch": 0.26, "learning_rate": 4.641847379611898e-06, "logits/chosen": -1.6298139095306396, "logits/rejected": -0.9655359387397766, "logps/chosen": -630.3314208984375, "logps/rejected": -1392.5352783203125, "loss": 0.0764, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19043368101119995, "rewards/margins": 0.3249918818473816, "rewards/rejected": -0.5154255628585815, "step": 1340 }, { "epoch": 0.26, "learning_rate": 4.633227204080389e-06, "logits/chosen": -1.6190248727798462, "logits/rejected": -1.1195992231369019, "logps/chosen": -551.1062622070312, "logps/rejected": -1251.203857421875, "loss": 0.0634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17107483744621277, "rewards/margins": 0.2999062240123749, "rewards/rejected": -0.47098103165626526, "step": 1350 }, { "epoch": 0.26, "learning_rate": 4.624512724219038e-06, "logits/chosen": -1.4386488199234009, "logits/rejected": -0.9899671673774719, "logps/chosen": -709.4961547851562, "logps/rejected": -1226.273681640625, "loss": 0.1151, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2317754030227661, "rewards/margins": 0.25371408462524414, "rewards/rejected": -0.48548945784568787, "step": 1360 }, { "epoch": 0.26, "learning_rate": 4.6157043252719374e-06, "logits/chosen": -1.7422698736190796, "logits/rejected": -1.1107302904129028, "logps/chosen": -825.3232421875, "logps/rejected": -1382.18603515625, "loss": 0.0723, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.27356773614883423, "rewards/margins": 0.28873828053474426, "rewards/rejected": -0.5623060464859009, "step": 1370 }, { "epoch": 0.26, "learning_rate": 4.606802396635098e-06, "logits/chosen": -1.6875635385513306, "logits/rejected": -1.162154197692871, "logps/chosen": -702.98095703125, "logps/rejected": -1256.966796875, "loss": 0.079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22531962394714355, "rewards/margins": 0.25439611077308655, "rewards/rejected": -0.4797157347202301, "step": 1380 }, { "epoch": 0.26, "learning_rate": 4.597807331839229e-06, "logits/chosen": -1.7705532312393188, "logits/rejected": -0.9259660840034485, "logps/chosen": -735.7838745117188, "logps/rejected": -1272.888427734375, "loss": 0.0651, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19407150149345398, "rewards/margins": 0.2555926740169525, "rewards/rejected": -0.4496641755104065, "step": 1390 }, { "epoch": 0.27, "learning_rate": 4.588719528532342e-06, "logits/chosen": -1.7049188613891602, "logits/rejected": -1.1274850368499756, "logps/chosen": -646.8198852539062, "logps/rejected": -1386.414306640625, "loss": 0.053, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.15076547861099243, "rewards/margins": 0.32411348819732666, "rewards/rejected": -0.4748789668083191, "step": 1400 }, { "epoch": 0.27, "learning_rate": 4.5795393884621735e-06, "logits/chosen": -1.9536349773406982, "logits/rejected": -1.1568233966827393, "logps/chosen": -591.2590942382812, "logps/rejected": -1258.59423828125, "loss": 0.0747, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10888688266277313, "rewards/margins": 0.2812551259994507, "rewards/rejected": -0.390142023563385, "step": 1410 }, { "epoch": 0.27, "learning_rate": 4.5702673174584236e-06, "logits/chosen": -1.6346734762191772, "logits/rejected": -1.1741018295288086, "logps/chosen": -746.7779541015625, "logps/rejected": -1454.915771484375, "loss": 0.0742, "rewards/accuracies": 0.875, "rewards/chosen": -0.2458966076374054, "rewards/margins": 0.2765346169471741, "rewards/rejected": -0.5224311947822571, "step": 1420 }, { "epoch": 0.27, "learning_rate": 4.560903725414816e-06, "logits/chosen": -1.743407964706421, "logits/rejected": -0.8613992929458618, "logps/chosen": -671.2027587890625, "logps/rejected": -1287.640380859375, "loss": 0.0842, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20365352928638458, "rewards/margins": 0.26316341757774353, "rewards/rejected": -0.4668169617652893, "step": 1430 }, { "epoch": 0.27, "learning_rate": 4.551449026270979e-06, "logits/chosen": -1.7058141231536865, "logits/rejected": -1.1486746072769165, "logps/chosen": -598.8638916015625, "logps/rejected": -1300.490234375, "loss": 0.0781, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19492121040821075, "rewards/margins": 0.283249169588089, "rewards/rejected": -0.47817039489746094, "step": 1440 }, { "epoch": 0.28, "learning_rate": 4.541903637994142e-06, "logits/chosen": -1.6145961284637451, "logits/rejected": -1.0189629793167114, "logps/chosen": -635.6578369140625, "logps/rejected": -1167.5048828125, "loss": 0.0972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18294695019721985, "rewards/margins": 0.23105594515800476, "rewards/rejected": -0.4140028953552246, "step": 1450 }, { "epoch": 0.28, "learning_rate": 4.532267982560662e-06, "logits/chosen": -1.6065614223480225, "logits/rejected": -1.3095319271087646, "logps/chosen": -658.8798828125, "logps/rejected": -1332.044921875, "loss": 0.0866, "rewards/accuracies": 0.875, "rewards/chosen": -0.20158949494361877, "rewards/margins": 0.29118824005126953, "rewards/rejected": -0.49277767539024353, "step": 1460 }, { "epoch": 0.28, "learning_rate": 4.522542485937369e-06, "logits/chosen": -1.469521403312683, "logits/rejected": -0.9519163966178894, "logps/chosen": -594.472412109375, "logps/rejected": -1056.48193359375, "loss": 0.1178, "rewards/accuracies": 0.75, "rewards/chosen": -0.16802926361560822, "rewards/margins": 0.1843966841697693, "rewards/rejected": -0.3524259328842163, "step": 1470 }, { "epoch": 0.28, "learning_rate": 4.512727578062733e-06, "logits/chosen": -1.5861170291900635, "logits/rejected": -0.9400532841682434, "logps/chosen": -671.851318359375, "logps/rejected": -1219.797119140625, "loss": 0.0934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1471502184867859, "rewards/margins": 0.26563459634780884, "rewards/rejected": -0.41278475522994995, "step": 1480 }, { "epoch": 0.28, "learning_rate": 4.502823692827859e-06, "logits/chosen": -1.5500032901763916, "logits/rejected": -1.1829593181610107, "logps/chosen": -658.6322021484375, "logps/rejected": -1304.2266845703125, "loss": 0.077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18946467339992523, "rewards/margins": 0.2668909430503845, "rewards/rejected": -0.45635563135147095, "step": 1490 }, { "epoch": 0.29, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.625832200050354, "logits/rejected": -1.0373882055282593, "logps/chosen": -696.6791381835938, "logps/rejected": -1352.232177734375, "loss": 0.0882, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1951400637626648, "rewards/margins": 0.277163565158844, "rewards/rejected": -0.472303569316864, "step": 1500 }, { "epoch": 0.29, "learning_rate": 4.482750745489733e-06, "logits/chosen": -1.787223219871521, "logits/rejected": -1.222612977027893, "logps/chosen": -550.8800659179688, "logps/rejected": -1146.3341064453125, "loss": 0.0782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10685940086841583, "rewards/margins": 0.2857755422592163, "rewards/rejected": -0.39263495802879333, "step": 1510 }, { "epoch": 0.29, "learning_rate": 4.472582570758367e-06, "logits/chosen": -1.728491187095642, "logits/rejected": -1.01820969581604, "logps/chosen": -593.0372314453125, "logps/rejected": -1177.662353515625, "loss": 0.0638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16049079596996307, "rewards/margins": 0.2860396206378937, "rewards/rejected": -0.44653043150901794, "step": 1520 }, { "epoch": 0.29, "learning_rate": 4.4623271933713065e-06, "logits/chosen": -1.6648460626602173, "logits/rejected": -1.033857822418213, "logps/chosen": -671.03955078125, "logps/rejected": -1315.7705078125, "loss": 0.0945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22596442699432373, "rewards/margins": 0.28475135564804077, "rewards/rejected": -0.5107157826423645, "step": 1530 }, { "epoch": 0.29, "learning_rate": 4.451985066691649e-06, "logits/chosen": -1.670251488685608, "logits/rejected": -1.3727920055389404, "logps/chosen": -600.6444091796875, "logps/rejected": -1437.553466796875, "loss": 0.0697, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20430073142051697, "rewards/margins": 0.3402923047542572, "rewards/rejected": -0.5445930361747742, "step": 1540 }, { "epoch": 0.3, "learning_rate": 4.441556647917447e-06, "logits/chosen": -1.5559931993484497, "logits/rejected": -0.9859651327133179, "logps/chosen": -561.6361694335938, "logps/rejected": -1220.6103515625, "loss": 0.0743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13392092287540436, "rewards/margins": 0.2867361903190613, "rewards/rejected": -0.42065709829330444, "step": 1550 }, { "epoch": 0.3, "learning_rate": 4.431042398061499e-06, "logits/chosen": -1.6274524927139282, "logits/rejected": -1.1113866567611694, "logps/chosen": -502.992919921875, "logps/rejected": -1168.184814453125, "loss": 0.0575, "rewards/accuracies": 0.875, "rewards/chosen": -0.12179882824420929, "rewards/margins": 0.3009914457798004, "rewards/rejected": -0.4227902889251709, "step": 1560 }, { "epoch": 0.3, "learning_rate": 4.420442781930971e-06, "logits/chosen": -1.6948182582855225, "logits/rejected": -1.1259328126907349, "logps/chosen": -610.978515625, "logps/rejected": -1261.6180419921875, "loss": 0.0832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14310654997825623, "rewards/margins": 0.2861831784248352, "rewards/rejected": -0.42928972840309143, "step": 1570 }, { "epoch": 0.3, "learning_rate": 4.409758268106842e-06, "logits/chosen": -1.746582269668579, "logits/rejected": -0.878632664680481, "logps/chosen": -616.5509033203125, "logps/rejected": -1264.705810546875, "loss": 0.0566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17718395590782166, "rewards/margins": 0.28132373094558716, "rewards/rejected": -0.4585076868534088, "step": 1580 }, { "epoch": 0.3, "learning_rate": 4.398989328923196e-06, "logits/chosen": -1.4909377098083496, "logits/rejected": -1.01114022731781, "logps/chosen": -630.6038818359375, "logps/rejected": -1251.208740234375, "loss": 0.0856, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.20834538340568542, "rewards/margins": 0.24381554126739502, "rewards/rejected": -0.45216089487075806, "step": 1590 }, { "epoch": 0.3, "learning_rate": 4.388136440446338e-06, "logits/chosen": -1.6888008117675781, "logits/rejected": -0.99406498670578, "logps/chosen": -651.6956787109375, "logps/rejected": -1196.780029296875, "loss": 0.0897, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18217213451862335, "rewards/margins": 0.24324896931648254, "rewards/rejected": -0.4254210889339447, "step": 1600 }, { "epoch": 0.31, "learning_rate": 4.377200082453748e-06, "logits/chosen": -1.49949312210083, "logits/rejected": -0.9790937304496765, "logps/chosen": -680.9384765625, "logps/rejected": -1257.7886962890625, "loss": 0.0735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21910205483436584, "rewards/margins": 0.2613365054130554, "rewards/rejected": -0.48043856024742126, "step": 1610 }, { "epoch": 0.31, "learning_rate": 4.366180738412876e-06, "logits/chosen": -1.6257928609848022, "logits/rejected": -0.9221351742744446, "logps/chosen": -675.370849609375, "logps/rejected": -1347.3052978515625, "loss": 0.062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.21879585087299347, "rewards/margins": 0.3183351159095764, "rewards/rejected": -0.5371309518814087, "step": 1620 }, { "epoch": 0.31, "learning_rate": 4.355078895459761e-06, "logits/chosen": -1.6502177715301514, "logits/rejected": -1.099336862564087, "logps/chosen": -707.7600708007812, "logps/rejected": -1389.7249755859375, "loss": 0.0547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2330164611339569, "rewards/margins": 0.3357909321784973, "rewards/rejected": -0.5688074231147766, "step": 1630 }, { "epoch": 0.31, "learning_rate": 4.343895044377504e-06, "logits/chosen": -1.718955397605896, "logits/rejected": -0.9656432867050171, "logps/chosen": -754.4881591796875, "logps/rejected": -1390.9036865234375, "loss": 0.0546, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.255190908908844, "rewards/margins": 0.3234071433544159, "rewards/rejected": -0.5785980224609375, "step": 1640 }, { "epoch": 0.31, "learning_rate": 4.332629679574566e-06, "logits/chosen": -1.55838143825531, "logits/rejected": -0.9745651483535767, "logps/chosen": -663.7210083007812, "logps/rejected": -1346.267578125, "loss": 0.0885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.23024721443653107, "rewards/margins": 0.3076723515987396, "rewards/rejected": -0.5379196405410767, "step": 1650 }, { "epoch": 0.32, "learning_rate": 4.321283299062916e-06, "logits/chosen": -1.5372596979141235, "logits/rejected": -0.9359537959098816, "logps/chosen": -734.9241943359375, "logps/rejected": -1340.1929931640625, "loss": 0.1026, "rewards/accuracies": 0.75, "rewards/chosen": -0.28472191095352173, "rewards/margins": 0.2690187096595764, "rewards/rejected": -0.5537406802177429, "step": 1660 }, { "epoch": 0.32, "learning_rate": 4.309856404436013e-06, "logits/chosen": -1.6407476663589478, "logits/rejected": -0.9992292523384094, "logps/chosen": -695.8706665039062, "logps/rejected": -1365.452392578125, "loss": 0.0707, "rewards/accuracies": 0.875, "rewards/chosen": -0.2776133120059967, "rewards/margins": 0.31161263585090637, "rewards/rejected": -0.5892259478569031, "step": 1670 }, { "epoch": 0.32, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -1.4128013849258423, "logits/rejected": -1.0901668071746826, "logps/chosen": -641.1650390625, "logps/rejected": -1331.385986328125, "loss": 0.0782, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.25103798508644104, "rewards/margins": 0.30623993277549744, "rewards/rejected": -0.5572779774665833, "step": 1680 }, { "epoch": 0.32, "learning_rate": 4.2867630969845235e-06, "logits/chosen": -1.4749224185943604, "logits/rejected": -0.9041854739189148, "logps/chosen": -793.1533203125, "logps/rejected": -1348.232177734375, "loss": 0.0763, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.30572837591171265, "rewards/margins": 0.26624101400375366, "rewards/rejected": -0.5719693303108215, "step": 1690 }, { "epoch": 0.32, "learning_rate": 4.275097705053951e-06, "logits/chosen": -1.478694200515747, "logits/rejected": -0.8133836984634399, "logps/chosen": -868.2639770507812, "logps/rejected": -1334.706787109375, "loss": 0.0688, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3474667966365814, "rewards/margins": 0.25684088468551636, "rewards/rejected": -0.6043076515197754, "step": 1700 }, { "epoch": 0.33, "learning_rate": 4.263353840751023e-06, "logits/chosen": -1.2793939113616943, "logits/rejected": -0.7485678195953369, "logps/chosen": -711.3675537109375, "logps/rejected": -1447.356201171875, "loss": 0.0921, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.257205069065094, "rewards/margins": 0.3200908303260803, "rewards/rejected": -0.5772958397865295, "step": 1710 }, { "epoch": 0.33, "learning_rate": 4.251532023240901e-06, "logits/chosen": -1.4069713354110718, "logits/rejected": -0.8339581489562988, "logps/chosen": -700.1744384765625, "logps/rejected": -1356.793701171875, "loss": 0.0729, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.22528398036956787, "rewards/margins": 0.28295961022377014, "rewards/rejected": -0.5082435607910156, "step": 1720 }, { "epoch": 0.33, "learning_rate": 4.239632775134857e-06, "logits/chosen": -1.3647840023040771, "logits/rejected": -0.9706377983093262, "logps/chosen": -675.4508056640625, "logps/rejected": -1254.943603515625, "loss": 0.0813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.20136883854866028, "rewards/margins": 0.24962525069713593, "rewards/rejected": -0.4509941041469574, "step": 1730 }, { "epoch": 0.33, "learning_rate": 4.227656622467162e-06, "logits/chosen": -1.5252206325531006, "logits/rejected": -1.0396662950515747, "logps/chosen": -519.870849609375, "logps/rejected": -1278.7230224609375, "loss": 0.054, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11870086193084717, "rewards/margins": 0.32463350892066956, "rewards/rejected": -0.4433344006538391, "step": 1740 }, { "epoch": 0.33, "learning_rate": 4.215604094671835e-06, "logits/chosen": -1.6197888851165771, "logits/rejected": -1.015608787536621, "logps/chosen": -514.943359375, "logps/rejected": -1060.022216796875, "loss": 0.0818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12082231044769287, "rewards/margins": 0.24318580329418182, "rewards/rejected": -0.3640081286430359, "step": 1750 }, { "epoch": 0.34, "learning_rate": 4.203475724559235e-06, "logits/chosen": -1.5419548749923706, "logits/rejected": -1.1807676553726196, "logps/chosen": -548.9449462890625, "logps/rejected": -1381.0179443359375, "loss": 0.0432, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17818590998649597, "rewards/margins": 0.34219759702682495, "rewards/rejected": -0.5203834772109985, "step": 1760 }, { "epoch": 0.34, "learning_rate": 4.191272048292514e-06, "logits/chosen": -1.5359946489334106, "logits/rejected": -1.1761115789413452, "logps/chosen": -714.53466796875, "logps/rejected": -1336.4539794921875, "loss": 0.0709, "rewards/accuracies": 0.875, "rewards/chosen": -0.22599129378795624, "rewards/margins": 0.2689371705055237, "rewards/rejected": -0.4949284493923187, "step": 1770 }, { "epoch": 0.34, "learning_rate": 4.178993605363904e-06, "logits/chosen": -1.7051982879638672, "logits/rejected": -1.004176378250122, "logps/chosen": -622.5487670898438, "logps/rejected": -1316.314208984375, "loss": 0.0553, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15982475876808167, "rewards/margins": 0.31387537717819214, "rewards/rejected": -0.4737001061439514, "step": 1780 }, { "epoch": 0.34, "learning_rate": 4.166640938570879e-06, "logits/chosen": -1.5779173374176025, "logits/rejected": -1.2296186685562134, "logps/chosen": -609.0247802734375, "logps/rejected": -1203.373046875, "loss": 0.0864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.19219158589839935, "rewards/margins": 0.26055121421813965, "rewards/rejected": -0.4527428150177002, "step": 1790 }, { "epoch": 0.34, "learning_rate": 4.154214593992149e-06, "logits/chosen": -1.8979780673980713, "logits/rejected": -0.9662348628044128, "logps/chosen": -790.3106079101562, "logps/rejected": -1465.6546630859375, "loss": 0.0573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2489619255065918, "rewards/margins": 0.33502355217933655, "rewards/rejected": -0.583985447883606, "step": 1800 }, { "epoch": 0.34, "learning_rate": 4.1417151209635265e-06, "logits/chosen": -1.5197267532348633, "logits/rejected": -1.0553096532821655, "logps/chosen": -655.1430053710938, "logps/rejected": -1308.0142822265625, "loss": 0.0688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2508172392845154, "rewards/margins": 0.29503312706947327, "rewards/rejected": -0.5458503365516663, "step": 1810 }, { "epoch": 0.35, "learning_rate": 4.129143072053639e-06, "logits/chosen": -1.7092673778533936, "logits/rejected": -1.0430828332901, "logps/chosen": -774.0709228515625, "logps/rejected": -1407.5379638671875, "loss": 0.0717, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.259227991104126, "rewards/margins": 0.3105766773223877, "rewards/rejected": -0.5698047280311584, "step": 1820 }, { "epoch": 0.35, "learning_rate": 4.116499003039499e-06, "logits/chosen": -1.3776369094848633, "logits/rejected": -0.7794798612594604, "logps/chosen": -695.4063720703125, "logps/rejected": -1304.434814453125, "loss": 0.064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22463175654411316, "rewards/margins": 0.31490933895111084, "rewards/rejected": -0.5395411252975464, "step": 1830 }, { "epoch": 0.35, "learning_rate": 4.103783472881942e-06, "logits/chosen": -1.6217248439788818, "logits/rejected": -0.9625973701477051, "logps/chosen": -617.6864624023438, "logps/rejected": -1318.3341064453125, "loss": 0.0532, "rewards/accuracies": 0.875, "rewards/chosen": -0.16038154065608978, "rewards/margins": 0.3341715931892395, "rewards/rejected": -0.49455317854881287, "step": 1840 }, { "epoch": 0.35, "learning_rate": 4.0909970437009094e-06, "logits/chosen": -1.7111873626708984, "logits/rejected": -1.0022821426391602, "logps/chosen": -752.5694580078125, "logps/rejected": -1340.786376953125, "loss": 0.0797, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19375209510326385, "rewards/margins": 0.29098087549209595, "rewards/rejected": -0.4847329556941986, "step": 1850 }, { "epoch": 0.35, "learning_rate": 4.078140280750598e-06, "logits/chosen": -1.473215103149414, "logits/rejected": -1.1144354343414307, "logps/chosen": -651.291015625, "logps/rejected": -1229.4078369140625, "loss": 0.0944, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17073918879032135, "rewards/margins": 0.27028197050094604, "rewards/rejected": -0.4410211145877838, "step": 1860 }, { "epoch": 0.36, "learning_rate": 4.065213752394478e-06, "logits/chosen": -1.4655654430389404, "logits/rejected": -0.9170185327529907, "logps/chosen": -611.453857421875, "logps/rejected": -1353.949462890625, "loss": 0.0492, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.16493529081344604, "rewards/margins": 0.3255433142185211, "rewards/rejected": -0.49047860503196716, "step": 1870 }, { "epoch": 0.36, "learning_rate": 4.052218030080162e-06, "logits/chosen": -1.6494277715682983, "logits/rejected": -0.9448171854019165, "logps/chosen": -615.288330078125, "logps/rejected": -1280.2767333984375, "loss": 0.0619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.170337975025177, "rewards/margins": 0.3003271818161011, "rewards/rejected": -0.4706651568412781, "step": 1880 }, { "epoch": 0.36, "learning_rate": 4.039153688314146e-06, "logits/chosen": -1.4522556066513062, "logits/rejected": -0.8698140978813171, "logps/chosen": -699.1949462890625, "logps/rejected": -1415.995849609375, "loss": 0.0806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.21898791193962097, "rewards/margins": 0.3242533802986145, "rewards/rejected": -0.5432413220405579, "step": 1890 }, { "epoch": 0.36, "learning_rate": 4.026021304636408e-06, "logits/chosen": -1.6501353979110718, "logits/rejected": -0.9669955372810364, "logps/chosen": -650.6583251953125, "logps/rejected": -1219.7510986328125, "loss": 0.0777, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1721065789461136, "rewards/margins": 0.31094521284103394, "rewards/rejected": -0.4830518364906311, "step": 1900 }, { "epoch": 0.36, "learning_rate": 4.012821459594881e-06, "logits/chosen": -1.8619811534881592, "logits/rejected": -1.1319575309753418, "logps/chosen": -692.3233642578125, "logps/rejected": -1297.4775390625, "loss": 0.1147, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1922949254512787, "rewards/margins": 0.2827715277671814, "rewards/rejected": -0.47506648302078247, "step": 1910 }, { "epoch": 0.37, "learning_rate": 3.999554736719785e-06, "logits/chosen": -1.6963351964950562, "logits/rejected": -1.0642411708831787, "logps/chosen": -529.7073364257812, "logps/rejected": -1265.6956787109375, "loss": 0.0572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10542023181915283, "rewards/margins": 0.31158146262168884, "rewards/rejected": -0.4170016646385193, "step": 1920 }, { "epoch": 0.37, "learning_rate": 3.986221722497832e-06, "logits/chosen": -1.8552402257919312, "logits/rejected": -1.0955214500427246, "logps/chosen": -552.8602294921875, "logps/rejected": -1160.6795654296875, "loss": 0.0736, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0734359547495842, "rewards/margins": 0.3090042471885681, "rewards/rejected": -0.3824402391910553, "step": 1930 }, { "epoch": 0.37, "learning_rate": 3.9728230063463e-06, "logits/chosen": -1.6904840469360352, "logits/rejected": -0.9247332811355591, "logps/chosen": -652.7967529296875, "logps/rejected": -1129.207763671875, "loss": 0.0962, "rewards/accuracies": 0.75, "rewards/chosen": -0.14699819684028625, "rewards/margins": 0.23404502868652344, "rewards/rejected": -0.3810432553291321, "step": 1940 }, { "epoch": 0.37, "learning_rate": 3.9593591805869755e-06, "logits/chosen": -1.4540979862213135, "logits/rejected": -1.1804428100585938, "logps/chosen": -464.2632751464844, "logps/rejected": -1101.6966552734375, "loss": 0.1024, "rewards/accuracies": 0.75, "rewards/chosen": -0.11493609845638275, "rewards/margins": 0.2585905194282532, "rewards/rejected": -0.3735266327857971, "step": 1950 }, { "epoch": 0.37, "learning_rate": 3.945830840419966e-06, "logits/chosen": -1.5080819129943848, "logits/rejected": -1.2832539081573486, "logps/chosen": -410.7259216308594, "logps/rejected": -1009.2428588867188, "loss": 0.0981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12702463567256927, "rewards/margins": 0.23238396644592285, "rewards/rejected": -0.35940855741500854, "step": 1960 }, { "epoch": 0.38, "learning_rate": 3.932238583897395e-06, "logits/chosen": -1.5733039379119873, "logits/rejected": -1.2267423868179321, "logps/chosen": -605.8873901367188, "logps/rejected": -1320.41650390625, "loss": 0.0694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17582616209983826, "rewards/margins": 0.3171050250530243, "rewards/rejected": -0.49293118715286255, "step": 1970 }, { "epoch": 0.38, "learning_rate": 3.918583011896955e-06, "logits/chosen": -1.406954050064087, "logits/rejected": -1.0951248407363892, "logps/chosen": -575.7022705078125, "logps/rejected": -1197.1514892578125, "loss": 0.0894, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20731861889362335, "rewards/margins": 0.25613903999328613, "rewards/rejected": -0.46345773339271545, "step": 1980 }, { "epoch": 0.38, "learning_rate": 3.904864728095349e-06, "logits/chosen": -1.706703543663025, "logits/rejected": -1.1287411451339722, "logps/chosen": -643.6716918945312, "logps/rejected": -1208.3297119140625, "loss": 0.0859, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16700610518455505, "rewards/margins": 0.2838577926158905, "rewards/rejected": -0.4508638381958008, "step": 1990 }, { "epoch": 0.38, "learning_rate": 3.891084338941603e-06, "logits/chosen": -1.658524751663208, "logits/rejected": -0.8833999633789062, "logps/chosen": -667.8445434570312, "logps/rejected": -1186.5399169921875, "loss": 0.0769, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17360621690750122, "rewards/margins": 0.27768850326538086, "rewards/rejected": -0.4512947201728821, "step": 2000 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -1.5163154602050781, "logits/rejected": -0.9248741865158081, "logps/chosen": -573.7681884765625, "logps/rejected": -1262.5050048828125, "loss": 0.0581, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1913374364376068, "rewards/margins": 0.3188498616218567, "rewards/rejected": -0.5101873278617859, "step": 2010 }, { "epoch": 0.38, "learning_rate": 3.863339684074432e-06, "logits/chosen": -1.5402344465255737, "logits/rejected": -0.9123631715774536, "logps/chosen": -709.202392578125, "logps/rejected": -1308.1771240234375, "loss": 0.0794, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.24668677151203156, "rewards/margins": 0.2680239975452423, "rewards/rejected": -0.5147107839584351, "step": 2020 }, { "epoch": 0.39, "learning_rate": 3.849376644878783e-06, "logits/chosen": -1.5158917903900146, "logits/rejected": -1.0071513652801514, "logps/chosen": -599.4667358398438, "logps/rejected": -1142.518798828125, "loss": 0.1031, "rewards/accuracies": 0.75, "rewards/chosen": -0.18736699223518372, "rewards/margins": 0.24289944767951965, "rewards/rejected": -0.43026643991470337, "step": 2030 }, { "epoch": 0.39, "learning_rate": 3.835353953312322e-06, "logits/chosen": -1.3921092748641968, "logits/rejected": -1.161055326461792, "logps/chosen": -493.7007751464844, "logps/rejected": -1077.16455078125, "loss": 0.0943, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14471833407878876, "rewards/margins": 0.25077611207962036, "rewards/rejected": -0.3954944312572479, "step": 2040 }, { "epoch": 0.39, "learning_rate": 3.821272229281139e-06, "logits/chosen": -1.4992625713348389, "logits/rejected": -1.0259852409362793, "logps/chosen": -612.4508056640625, "logps/rejected": -1262.510498046875, "loss": 0.0754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15795397758483887, "rewards/margins": 0.28065261244773865, "rewards/rejected": -0.4386065602302551, "step": 2050 }, { "epoch": 0.39, "learning_rate": 3.8071320953009906e-06, "logits/chosen": -1.6317722797393799, "logits/rejected": -1.0791391134262085, "logps/chosen": -639.4310913085938, "logps/rejected": -1332.7215576171875, "loss": 0.0858, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12871623039245605, "rewards/margins": 0.3394959568977356, "rewards/rejected": -0.46821218729019165, "step": 2060 }, { "epoch": 0.39, "learning_rate": 3.792934176469782e-06, "logits/chosen": -1.7077200412750244, "logits/rejected": -1.005631685256958, "logps/chosen": -544.0747680664062, "logps/rejected": -1025.9820556640625, "loss": 0.0778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11793072521686554, "rewards/margins": 0.2298382818698883, "rewards/rejected": -0.34776902198791504, "step": 2070 }, { "epoch": 0.4, "learning_rate": 3.7786791004399353e-06, "logits/chosen": -1.5485365390777588, "logits/rejected": -1.1217293739318848, "logps/chosen": -728.8970947265625, "logps/rejected": -1328.073486328125, "loss": 0.0871, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1932309865951538, "rewards/margins": 0.2715621888637543, "rewards/rejected": -0.46479320526123047, "step": 2080 }, { "epoch": 0.4, "learning_rate": 3.764367497390642e-06, "logits/chosen": -1.6724570989608765, "logits/rejected": -0.9786966443061829, "logps/chosen": -723.045166015625, "logps/rejected": -1434.869873046875, "loss": 0.0727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2174513041973114, "rewards/margins": 0.30513912439346313, "rewards/rejected": -0.5225903987884521, "step": 2090 }, { "epoch": 0.4, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.5985078811645508, "logits/rejected": -1.1455590724945068, "logps/chosen": -587.4654541015625, "logps/rejected": -1450.05419921875, "loss": 0.0538, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.151766836643219, "rewards/margins": 0.36734539270401, "rewards/rejected": -0.5191121697425842, "step": 2100 }, { "epoch": 0.4, "learning_rate": 3.7355772434170523e-06, "logits/chosen": -1.6532714366912842, "logits/rejected": -0.9889874458312988, "logps/chosen": -683.9132690429688, "logps/rejected": -1231.6641845703125, "loss": 0.0865, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1688445806503296, "rewards/margins": 0.2573260962963104, "rewards/rejected": -0.42617067694664, "step": 2110 }, { "epoch": 0.4, "learning_rate": 3.7210998652337016e-06, "logits/chosen": -1.560417890548706, "logits/rejected": -0.908432126045227, "logps/chosen": -532.7721557617188, "logps/rejected": -1283.0491943359375, "loss": 0.0665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13333137333393097, "rewards/margins": 0.32335206866264343, "rewards/rejected": -0.4566834568977356, "step": 2120 }, { "epoch": 0.41, "learning_rate": 3.7065685054565277e-06, "logits/chosen": -1.462009310722351, "logits/rejected": -1.0526163578033447, "logps/chosen": -528.8088989257812, "logps/rejected": -1030.3155517578125, "loss": 0.0945, "rewards/accuracies": 0.75, "rewards/chosen": -0.10231202840805054, "rewards/margins": 0.24501392245292664, "rewards/rejected": -0.3473259210586548, "step": 2130 }, { "epoch": 0.41, "learning_rate": 3.691983806478494e-06, "logits/chosen": -1.4341996908187866, "logits/rejected": -1.0435141324996948, "logps/chosen": -599.2943115234375, "logps/rejected": -1383.003173828125, "loss": 0.0725, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15288788080215454, "rewards/margins": 0.33814504742622375, "rewards/rejected": -0.4910329282283783, "step": 2140 }, { "epoch": 0.41, "learning_rate": 3.677346413050551e-06, "logits/chosen": -1.4380934238433838, "logits/rejected": -0.8723732829093933, "logps/chosen": -682.8527221679688, "logps/rejected": -1212.371826171875, "loss": 0.1084, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17531666159629822, "rewards/margins": 0.25657838582992554, "rewards/rejected": -0.43189501762390137, "step": 2150 }, { "epoch": 0.41, "learning_rate": 3.6626569722531268e-06, "logits/chosen": -1.407714605331421, "logits/rejected": -0.9261984825134277, "logps/chosen": -701.3203125, "logps/rejected": -1251.0504150390625, "loss": 0.0886, "rewards/accuracies": 0.875, "rewards/chosen": -0.17605462670326233, "rewards/margins": 0.24869613349437714, "rewards/rejected": -0.4247507154941559, "step": 2160 }, { "epoch": 0.41, "learning_rate": 3.6479161334675294e-06, "logits/chosen": -1.385465145111084, "logits/rejected": -1.017727017402649, "logps/chosen": -544.0186767578125, "logps/rejected": -1228.675537109375, "loss": 0.0593, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1482720822095871, "rewards/margins": 0.2906687259674072, "rewards/rejected": -0.4389408528804779, "step": 2170 }, { "epoch": 0.42, "learning_rate": 3.6331245483472353e-06, "logits/chosen": -1.6134536266326904, "logits/rejected": -0.9634687304496765, "logps/chosen": -553.2294311523438, "logps/rejected": -1121.5455322265625, "loss": 0.0862, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14353647828102112, "rewards/margins": 0.2658248543739319, "rewards/rejected": -0.4093613028526306, "step": 2180 }, { "epoch": 0.42, "learning_rate": 3.6182828707890816e-06, "logits/chosen": -1.6181968450546265, "logits/rejected": -1.0317254066467285, "logps/chosen": -711.2445068359375, "logps/rejected": -1320.489501953125, "loss": 0.0524, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20726224780082703, "rewards/margins": 0.29739540815353394, "rewards/rejected": -0.5046576857566833, "step": 2190 }, { "epoch": 0.42, "learning_rate": 3.6033917569043604e-06, "logits/chosen": -1.510839819908142, "logits/rejected": -0.9395051002502441, "logps/chosen": -561.166015625, "logps/rejected": -1221.585205078125, "loss": 0.0543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14164069294929504, "rewards/margins": 0.31262117624282837, "rewards/rejected": -0.454261839389801, "step": 2200 }, { "epoch": 0.42, "learning_rate": 3.588451864989811e-06, "logits/chosen": -1.5775179862976074, "logits/rejected": -1.0709121227264404, "logps/chosen": -516.7582397460938, "logps/rejected": -1127.0125732421875, "loss": 0.0696, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1386266052722931, "rewards/margins": 0.3021091818809509, "rewards/rejected": -0.44073575735092163, "step": 2210 }, { "epoch": 0.42, "learning_rate": 3.5734638554985234e-06, "logits/chosen": -1.7261450290679932, "logits/rejected": -0.9754320383071899, "logps/chosen": -618.065673828125, "logps/rejected": -1307.2408447265625, "loss": 0.0597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13468700647354126, "rewards/margins": 0.3289644122123718, "rewards/rejected": -0.4636514186859131, "step": 2220 }, { "epoch": 0.42, "learning_rate": 3.5584283910107343e-06, "logits/chosen": -1.7003357410430908, "logits/rejected": -1.0963810682296753, "logps/chosen": -618.1256103515625, "logps/rejected": -1277.525634765625, "loss": 0.0583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1276676058769226, "rewards/margins": 0.3247441351413727, "rewards/rejected": -0.4524117410182953, "step": 2230 }, { "epoch": 0.43, "learning_rate": 3.543346136204545e-06, "logits/chosen": -1.4668004512786865, "logits/rejected": -0.9574554562568665, "logps/chosen": -630.6279296875, "logps/rejected": -1333.8809814453125, "loss": 0.0762, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18021656572818756, "rewards/margins": 0.3060380518436432, "rewards/rejected": -0.48625463247299194, "step": 2240 }, { "epoch": 0.43, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.3924225568771362, "logits/rejected": -0.8624798655509949, "logps/chosen": -583.6441650390625, "logps/rejected": -1162.06640625, "loss": 0.0732, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13777074217796326, "rewards/margins": 0.300098717212677, "rewards/rejected": -0.43786945939064026, "step": 2250 }, { "epoch": 0.43, "learning_rate": 3.5130439246622635e-06, "logits/chosen": -1.4143173694610596, "logits/rejected": -0.8775388598442078, "logps/chosen": -585.23583984375, "logps/rejected": -1257.826904296875, "loss": 0.0709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17448213696479797, "rewards/margins": 0.32015711069107056, "rewards/rejected": -0.49463921785354614, "step": 2260 }, { "epoch": 0.43, "learning_rate": 3.497825307506758e-06, "logits/chosen": -1.631155252456665, "logits/rejected": -0.947667121887207, "logps/chosen": -575.4785766601562, "logps/rejected": -1216.88525390625, "loss": 0.0858, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19915080070495605, "rewards/margins": 0.2901036739349365, "rewards/rejected": -0.48925453424453735, "step": 2270 }, { "epoch": 0.43, "learning_rate": 3.4825625791348093e-06, "logits/chosen": -1.5258907079696655, "logits/rejected": -0.9407553672790527, "logps/chosen": -690.2452392578125, "logps/rejected": -1400.36572265625, "loss": 0.0567, "rewards/accuracies": 0.875, "rewards/chosen": -0.2197599709033966, "rewards/margins": 0.328549325466156, "rewards/rejected": -0.5483092665672302, "step": 2280 }, { "epoch": 0.44, "learning_rate": 3.467256414271249e-06, "logits/chosen": -1.3535867929458618, "logits/rejected": -0.7650678753852844, "logps/chosen": -732.3662719726562, "logps/rejected": -1243.236328125, "loss": 0.0992, "rewards/accuracies": 0.875, "rewards/chosen": -0.21381525695323944, "rewards/margins": 0.27646294236183167, "rewards/rejected": -0.4902781844139099, "step": 2290 }, { "epoch": 0.44, "learning_rate": 3.4519074895611245e-06, "logits/chosen": -1.2721103429794312, "logits/rejected": -0.9371077418327332, "logps/chosen": -701.7020263671875, "logps/rejected": -1331.9573974609375, "loss": 0.0805, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2324238270521164, "rewards/margins": 0.27934929728507996, "rewards/rejected": -0.5117732286453247, "step": 2300 }, { "epoch": 0.44, "learning_rate": 3.436516483539781e-06, "logits/chosen": -1.5690281391143799, "logits/rejected": -1.041538953781128, "logps/chosen": -624.0325927734375, "logps/rejected": -1159.1929931640625, "loss": 0.0922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20282158255577087, "rewards/margins": 0.2440890371799469, "rewards/rejected": -0.44691067934036255, "step": 2310 }, { "epoch": 0.44, "learning_rate": 3.421084076602867e-06, "logits/chosen": -1.6616109609603882, "logits/rejected": -1.080195426940918, "logps/chosen": -724.8908081054688, "logps/rejected": -1440.558837890625, "loss": 0.0344, "rewards/accuracies": 0.875, "rewards/chosen": -0.2283150851726532, "rewards/margins": 0.3158523738384247, "rewards/rejected": -0.5441675186157227, "step": 2320 }, { "epoch": 0.44, "learning_rate": 3.405610950976257e-06, "logits/chosen": -1.6437885761260986, "logits/rejected": -0.6748986840248108, "logps/chosen": -597.8292846679688, "logps/rejected": -1239.0748291015625, "loss": 0.0444, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14971794188022614, "rewards/margins": 0.32578349113464355, "rewards/rejected": -0.4755013883113861, "step": 2330 }, { "epoch": 0.45, "learning_rate": 3.3900977906858923e-06, "logits/chosen": -1.4760338068008423, "logits/rejected": -1.0113236904144287, "logps/chosen": -593.802734375, "logps/rejected": -1221.1602783203125, "loss": 0.0987, "rewards/accuracies": 0.75, "rewards/chosen": -0.1533535122871399, "rewards/margins": 0.2769462466239929, "rewards/rejected": -0.4302998185157776, "step": 2340 }, { "epoch": 0.45, "learning_rate": 3.3745452815275375e-06, "logits/chosen": -1.625119924545288, "logits/rejected": -0.9021151661872864, "logps/chosen": -557.1563720703125, "logps/rejected": -1177.1480712890625, "loss": 0.0825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1604781150817871, "rewards/margins": 0.3083071708679199, "rewards/rejected": -0.46878522634506226, "step": 2350 }, { "epoch": 0.45, "learning_rate": 3.3589541110364678e-06, "logits/chosen": -1.6174547672271729, "logits/rejected": -1.021515130996704, "logps/chosen": -600.9921875, "logps/rejected": -1149.3212890625, "loss": 0.0879, "rewards/accuracies": 0.75, "rewards/chosen": -0.1592303216457367, "rewards/margins": 0.2715124487876892, "rewards/rejected": -0.4307428002357483, "step": 2360 }, { "epoch": 0.45, "learning_rate": 3.3433249684570757e-06, "logits/chosen": -1.6038291454315186, "logits/rejected": -0.984597384929657, "logps/chosen": -691.2508544921875, "logps/rejected": -1356.6763916015625, "loss": 0.0788, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1866895705461502, "rewards/margins": 0.31433457136154175, "rewards/rejected": -0.5010241270065308, "step": 2370 }, { "epoch": 0.45, "learning_rate": 3.3276585447123957e-06, "logits/chosen": -1.625759482383728, "logits/rejected": -0.9024505615234375, "logps/chosen": -655.2415771484375, "logps/rejected": -1165.357666015625, "loss": 0.079, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14526958763599396, "rewards/margins": 0.23506641387939453, "rewards/rejected": -0.3803360164165497, "step": 2380 }, { "epoch": 0.46, "learning_rate": 3.3119555323735664e-06, "logits/chosen": -1.5876622200012207, "logits/rejected": -0.9881450533866882, "logps/chosen": -586.2982177734375, "logps/rejected": -1114.760498046875, "loss": 0.089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14424452185630798, "rewards/margins": 0.24635834991931915, "rewards/rejected": -0.3906029164791107, "step": 2390 }, { "epoch": 0.46, "learning_rate": 3.2962166256292116e-06, "logits/chosen": -1.7776038646697998, "logits/rejected": -0.9887340664863586, "logps/chosen": -622.7711791992188, "logps/rejected": -1265.460693359375, "loss": 0.0615, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.16243186593055725, "rewards/margins": 0.31626278162002563, "rewards/rejected": -0.4786946177482605, "step": 2400 }, { "epoch": 0.46, "learning_rate": 3.2804425202547494e-06, "logits/chosen": -1.6128498315811157, "logits/rejected": -1.0431678295135498, "logps/chosen": -668.9735107421875, "logps/rejected": -1388.3677978515625, "loss": 0.0744, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2042228728532791, "rewards/margins": 0.31508609652519226, "rewards/rejected": -0.5193089246749878, "step": 2410 }, { "epoch": 0.46, "learning_rate": 3.2646339135816386e-06, "logits/chosen": -1.6792058944702148, "logits/rejected": -1.1176880598068237, "logps/chosen": -575.1004638671875, "logps/rejected": -1309.0078125, "loss": 0.0722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1628337800502777, "rewards/margins": 0.32463642954826355, "rewards/rejected": -0.48747020959854126, "step": 2420 }, { "epoch": 0.46, "learning_rate": 3.2487915044665485e-06, "logits/chosen": -1.3517749309539795, "logits/rejected": -0.8303594589233398, "logps/chosen": -623.1898193359375, "logps/rejected": -1244.054931640625, "loss": 0.1096, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20480790734291077, "rewards/margins": 0.27906081080436707, "rewards/rejected": -0.48386868834495544, "step": 2430 }, { "epoch": 0.46, "learning_rate": 3.2329159932604638e-06, "logits/chosen": -1.2916462421417236, "logits/rejected": -0.7104039192199707, "logps/chosen": -619.9585571289062, "logps/rejected": -1238.257568359375, "loss": 0.0759, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17360931634902954, "rewards/margins": 0.2984062731266022, "rewards/rejected": -0.4720155596733093, "step": 2440 }, { "epoch": 0.47, "learning_rate": 3.217008081777726e-06, "logits/chosen": -1.5925421714782715, "logits/rejected": -1.388627290725708, "logps/chosen": -634.1315307617188, "logps/rejected": -1347.8963623046875, "loss": 0.0678, "rewards/accuracies": 0.875, "rewards/chosen": -0.21671870350837708, "rewards/margins": 0.3036806285381317, "rewards/rejected": -0.5203993916511536, "step": 2450 }, { "epoch": 0.47, "learning_rate": 3.201068473265007e-06, "logits/chosen": -1.611342430114746, "logits/rejected": -1.019357442855835, "logps/chosen": -568.1485595703125, "logps/rejected": -1200.0748291015625, "loss": 0.0905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14752653241157532, "rewards/margins": 0.2930060923099518, "rewards/rejected": -0.4405326843261719, "step": 2460 }, { "epoch": 0.47, "learning_rate": 3.1850978723702213e-06, "logits/chosen": -1.7960550785064697, "logits/rejected": -0.867364764213562, "logps/chosen": -668.3005981445312, "logps/rejected": -1216.97705078125, "loss": 0.0916, "rewards/accuracies": 0.875, "rewards/chosen": -0.174250066280365, "rewards/margins": 0.27992844581604004, "rewards/rejected": -0.45417851209640503, "step": 2470 }, { "epoch": 0.47, "learning_rate": 3.1690969851113724e-06, "logits/chosen": -1.581278681755066, "logits/rejected": -1.1784000396728516, "logps/chosen": -590.9406127929688, "logps/rejected": -1250.147216796875, "loss": 0.0727, "rewards/accuracies": 0.875, "rewards/chosen": -0.1811566799879074, "rewards/margins": 0.29458481073379517, "rewards/rejected": -0.47574153542518616, "step": 2480 }, { "epoch": 0.47, "learning_rate": 3.1530665188453463e-06, "logits/chosen": -1.3773750066757202, "logits/rejected": -0.6731222867965698, "logps/chosen": -648.3175659179688, "logps/rejected": -1236.1195068359375, "loss": 0.0595, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.20099523663520813, "rewards/margins": 0.27121472358703613, "rewards/rejected": -0.4722098708152771, "step": 2490 }, { "epoch": 0.48, "learning_rate": 3.137007182236637e-06, "logits/chosen": -1.514936089515686, "logits/rejected": -0.6673987507820129, "logps/chosen": -754.236083984375, "logps/rejected": -1341.1190185546875, "loss": 0.0783, "rewards/accuracies": 0.875, "rewards/chosen": -0.196518212556839, "rewards/margins": 0.2775440514087677, "rewards/rejected": -0.4740622937679291, "step": 2500 }, { "epoch": 0.48, "learning_rate": 3.1209196852260204e-06, "logits/chosen": -1.3017624616622925, "logits/rejected": -1.00785231590271, "logps/chosen": -599.2673950195312, "logps/rejected": -1185.9400634765625, "loss": 0.0936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17537134885787964, "rewards/margins": 0.24747446179389954, "rewards/rejected": -0.4228457808494568, "step": 2510 }, { "epoch": 0.48, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -1.690582036972046, "logits/rejected": -1.104832410812378, "logps/chosen": -619.1864624023438, "logps/rejected": -1118.77197265625, "loss": 0.0764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13324618339538574, "rewards/margins": 0.25045424699783325, "rewards/rejected": -0.383700430393219, "step": 2520 }, { "epoch": 0.48, "learning_rate": 3.0886630559552144e-06, "logits/chosen": -1.3345118761062622, "logits/rejected": -0.9842830896377563, "logps/chosen": -694.3142700195312, "logps/rejected": -1348.347412109375, "loss": 0.0762, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14755675196647644, "rewards/margins": 0.30060485005378723, "rewards/rejected": -0.44816160202026367, "step": 2530 }, { "epoch": 0.48, "learning_rate": 3.072495349675249e-06, "logits/chosen": -1.6009747982025146, "logits/rejected": -0.8453266024589539, "logps/chosen": -539.679931640625, "logps/rejected": -1170.426025390625, "loss": 0.071, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1122439056634903, "rewards/margins": 0.2621922492980957, "rewards/rejected": -0.3744361996650696, "step": 2540 }, { "epoch": 0.49, "learning_rate": 3.056302334890786e-06, "logits/chosen": -1.6523168087005615, "logits/rejected": -1.110414743423462, "logps/chosen": -545.8953247070312, "logps/rejected": -1274.470458984375, "loss": 0.0555, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1152963787317276, "rewards/margins": 0.3395312428474426, "rewards/rejected": -0.4548276364803314, "step": 2550 }, { "epoch": 0.49, "learning_rate": 3.04008472745216e-06, "logits/chosen": -1.7794824838638306, "logits/rejected": -0.8890671730041504, "logps/chosen": -637.3228759765625, "logps/rejected": -1193.5184326171875, "loss": 0.0773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16369478404521942, "rewards/margins": 0.2698102593421936, "rewards/rejected": -0.43350496888160706, "step": 2560 }, { "epoch": 0.49, "learning_rate": 3.0238432442968803e-06, "logits/chosen": -1.5971051454544067, "logits/rejected": -0.9152080416679382, "logps/chosen": -570.8718872070312, "logps/rejected": -1310.4375, "loss": 0.053, "rewards/accuracies": 0.875, "rewards/chosen": -0.13165855407714844, "rewards/margins": 0.33412402868270874, "rewards/rejected": -0.46578264236450195, "step": 2570 }, { "epoch": 0.49, "learning_rate": 3.0075786034179407e-06, "logits/chosen": -1.2595056295394897, "logits/rejected": -0.8606653213500977, "logps/chosen": -552.0274658203125, "logps/rejected": -1287.727783203125, "loss": 0.0654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1181732639670372, "rewards/margins": 0.3271896541118622, "rewards/rejected": -0.4453628957271576, "step": 2580 }, { "epoch": 0.49, "learning_rate": 2.9912915238320755e-06, "logits/chosen": -1.5348694324493408, "logits/rejected": -0.9599231481552124, "logps/chosen": -560.0164184570312, "logps/rejected": -1273.4046630859375, "loss": 0.0714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1421751081943512, "rewards/margins": 0.29482603073120117, "rewards/rejected": -0.43700116872787476, "step": 2590 }, { "epoch": 0.5, "learning_rate": 2.974982725547976e-06, "logits/chosen": -1.4486770629882812, "logits/rejected": -1.0592507123947144, "logps/chosen": -680.0877685546875, "logps/rejected": -1446.515869140625, "loss": 0.0431, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17707717418670654, "rewards/margins": 0.3384220600128174, "rewards/rejected": -0.5154992341995239, "step": 2600 }, { "epoch": 0.5, "learning_rate": 2.958652929534456e-06, "logits/chosen": -1.2989368438720703, "logits/rejected": -1.0571434497833252, "logps/chosen": -603.8392333984375, "logps/rejected": -1268.223876953125, "loss": 0.1061, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18222357332706451, "rewards/margins": 0.2554742693901062, "rewards/rejected": -0.4376978278160095, "step": 2610 }, { "epoch": 0.5, "learning_rate": 2.9423028576885894e-06, "logits/chosen": -1.5412323474884033, "logits/rejected": -1.0013290643692017, "logps/chosen": -640.44921875, "logps/rejected": -1294.619873046875, "loss": 0.0688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14701518416404724, "rewards/margins": 0.29219189286231995, "rewards/rejected": -0.4392070770263672, "step": 2620 }, { "epoch": 0.5, "learning_rate": 2.9259332328037852e-06, "logits/chosen": -1.396354079246521, "logits/rejected": -0.7972344756126404, "logps/chosen": -504.8910217285156, "logps/rejected": -1190.3065185546875, "loss": 0.0614, "rewards/accuracies": 0.875, "rewards/chosen": -0.1175163984298706, "rewards/margins": 0.2946506142616272, "rewards/rejected": -0.4121670722961426, "step": 2630 }, { "epoch": 0.5, "learning_rate": 2.9095447785378446e-06, "logits/chosen": -1.371985673904419, "logits/rejected": -0.9223454594612122, "logps/chosen": -605.7084350585938, "logps/rejected": -1249.2154541015625, "loss": 0.0785, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13958588242530823, "rewards/margins": 0.3104739189147949, "rewards/rejected": -0.45005980134010315, "step": 2640 }, { "epoch": 0.5, "learning_rate": 2.893138219380964e-06, "logits/chosen": -1.2879749536514282, "logits/rejected": -0.7854377627372742, "logps/chosen": -605.78271484375, "logps/rejected": -1397.0079345703125, "loss": 0.0434, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.125517338514328, "rewards/margins": 0.345243901014328, "rewards/rejected": -0.4707612097263336, "step": 2650 }, { "epoch": 0.51, "learning_rate": 2.876714280623708e-06, "logits/chosen": -1.526444673538208, "logits/rejected": -0.7589839100837708, "logps/chosen": -647.5239868164062, "logps/rejected": -1338.138427734375, "loss": 0.0568, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13042430579662323, "rewards/margins": 0.32712113857269287, "rewards/rejected": -0.45754551887512207, "step": 2660 }, { "epoch": 0.51, "learning_rate": 2.8602736883249504e-06, "logits/chosen": -1.6594903469085693, "logits/rejected": -0.9879050254821777, "logps/chosen": -514.2203979492188, "logps/rejected": -1181.108154296875, "loss": 0.0527, "rewards/accuracies": 0.875, "rewards/chosen": -0.0958632081747055, "rewards/margins": 0.3106859624385834, "rewards/rejected": -0.4065491557121277, "step": 2670 }, { "epoch": 0.51, "learning_rate": 2.843817169279772e-06, "logits/chosen": -1.4798604249954224, "logits/rejected": -0.861343264579773, "logps/chosen": -565.4484252929688, "logps/rejected": -1215.5482177734375, "loss": 0.0675, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1356905698776245, "rewards/margins": 0.29588332772254944, "rewards/rejected": -0.43157386779785156, "step": 2680 }, { "epoch": 0.51, "learning_rate": 2.8273454509873333e-06, "logits/chosen": -1.714463472366333, "logits/rejected": -0.9027876853942871, "logps/chosen": -557.419189453125, "logps/rejected": -1265.908935546875, "loss": 0.0516, "rewards/accuracies": 0.875, "rewards/chosen": -0.10656454414129257, "rewards/margins": 0.33830881118774414, "rewards/rejected": -0.4448733925819397, "step": 2690 }, { "epoch": 0.51, "learning_rate": 2.8108592616187135e-06, "logits/chosen": -1.7163057327270508, "logits/rejected": -1.0684901475906372, "logps/chosen": -590.5923461914062, "logps/rejected": -1185.128173828125, "loss": 0.0723, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13913396000862122, "rewards/margins": 0.26712125539779663, "rewards/rejected": -0.40625524520874023, "step": 2700 }, { "epoch": 0.52, "learning_rate": 2.7943593299847186e-06, "logits/chosen": -1.6501725912094116, "logits/rejected": -0.7960497736930847, "logps/chosen": -609.9940185546875, "logps/rejected": -1179.6832275390625, "loss": 0.0629, "rewards/accuracies": 0.875, "rewards/chosen": -0.15014216303825378, "rewards/margins": 0.286409467458725, "rewards/rejected": -0.43655166029930115, "step": 2710 }, { "epoch": 0.52, "learning_rate": 2.7778463855036656e-06, "logits/chosen": -1.40164315700531, "logits/rejected": -0.8055755496025085, "logps/chosen": -687.6304931640625, "logps/rejected": -1379.4273681640625, "loss": 0.0578, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16296520829200745, "rewards/margins": 0.3322266936302185, "rewards/rejected": -0.4951918125152588, "step": 2720 }, { "epoch": 0.52, "learning_rate": 2.761321158169134e-06, "logits/chosen": -1.5054407119750977, "logits/rejected": -0.9149681329727173, "logps/chosen": -664.0867309570312, "logps/rejected": -1220.485107421875, "loss": 0.086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18236112594604492, "rewards/margins": 0.2533378303050995, "rewards/rejected": -0.435698926448822, "step": 2730 }, { "epoch": 0.52, "learning_rate": 2.7447843785176958e-06, "logits/chosen": -1.680153489112854, "logits/rejected": -1.1227161884307861, "logps/chosen": -620.3820190429688, "logps/rejected": -1187.011474609375, "loss": 0.086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13061869144439697, "rewards/margins": 0.29502156376838684, "rewards/rejected": -0.4256402850151062, "step": 2740 }, { "epoch": 0.52, "learning_rate": 2.728236777596621e-06, "logits/chosen": -1.6802221536636353, "logits/rejected": -0.9513761401176453, "logps/chosen": -532.1759643554688, "logps/rejected": -1148.6627197265625, "loss": 0.0723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10019969940185547, "rewards/margins": 0.30757206678390503, "rewards/rejected": -0.4077717661857605, "step": 2750 }, { "epoch": 0.53, "learning_rate": 2.7116790869315583e-06, "logits/chosen": -1.591932773590088, "logits/rejected": -0.9616080522537231, "logps/chosen": -556.04296875, "logps/rejected": -1179.85791015625, "loss": 0.0578, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13644689321517944, "rewards/margins": 0.290695458650589, "rewards/rejected": -0.42714232206344604, "step": 2760 }, { "epoch": 0.53, "learning_rate": 2.695112038494198e-06, "logits/chosen": -1.633966088294983, "logits/rejected": -0.9920031428337097, "logps/chosen": -703.5379028320312, "logps/rejected": -1436.71533203125, "loss": 0.0612, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17566515505313873, "rewards/margins": 0.3405109643936157, "rewards/rejected": -0.516176164150238, "step": 2770 }, { "epoch": 0.53, "learning_rate": 2.6785363646699125e-06, "logits/chosen": -1.5663446187973022, "logits/rejected": -0.9617182612419128, "logps/chosen": -687.4505615234375, "logps/rejected": -1352.730224609375, "loss": 0.0586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1697978675365448, "rewards/margins": 0.31239965558052063, "rewards/rejected": -0.48219752311706543, "step": 2780 }, { "epoch": 0.53, "learning_rate": 2.6619527982253796e-06, "logits/chosen": -1.663469910621643, "logits/rejected": -1.2072179317474365, "logps/chosen": -623.8106689453125, "logps/rejected": -1280.404052734375, "loss": 0.0712, "rewards/accuracies": 0.875, "rewards/chosen": -0.14441026747226715, "rewards/margins": 0.30307531356811523, "rewards/rejected": -0.4474855959415436, "step": 2790 }, { "epoch": 0.53, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -1.5637850761413574, "logits/rejected": -1.064086675643921, "logps/chosen": -672.9819946289062, "logps/rejected": -1308.570556640625, "loss": 0.1009, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10943061113357544, "rewards/margins": 0.3063245117664337, "rewards/rejected": -0.41575512290000916, "step": 2800 }, { "epoch": 0.54, "learning_rate": 2.628764920254435e-06, "logits/chosen": -1.5698572397232056, "logits/rejected": -1.0133156776428223, "logps/chosen": -510.04412841796875, "logps/rejected": -1206.696533203125, "loss": 0.0806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.04279591515660286, "rewards/margins": 0.33799928426742554, "rewards/rejected": -0.3807952404022217, "step": 2810 }, { "epoch": 0.54, "learning_rate": 2.6121620758762877e-06, "logits/chosen": -1.9097235202789307, "logits/rejected": -1.2351771593093872, "logps/chosen": -511.05865478515625, "logps/rejected": -1213.244873046875, "loss": 0.0559, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.04127006232738495, "rewards/margins": 0.349868506193161, "rewards/rejected": -0.39113855361938477, "step": 2820 }, { "epoch": 0.54, "learning_rate": 2.595554273109564e-06, "logits/chosen": -1.3623178005218506, "logits/rejected": -0.9337053298950195, "logps/chosen": -494.78363037109375, "logps/rejected": -1105.956787109375, "loss": 0.0832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06648660451173782, "rewards/margins": 0.2886095941066742, "rewards/rejected": -0.3550961911678314, "step": 2830 }, { "epoch": 0.54, "learning_rate": 2.5789422461412776e-06, "logits/chosen": -1.5411412715911865, "logits/rejected": -1.069968581199646, "logps/chosen": -604.7297973632812, "logps/rejected": -1145.9554443359375, "loss": 0.0999, "rewards/accuracies": 0.75, "rewards/chosen": -0.14811378717422485, "rewards/margins": 0.24691152572631836, "rewards/rejected": -0.3950252830982208, "step": 2840 }, { "epoch": 0.54, "learning_rate": 2.5623267293451827e-06, "logits/chosen": -1.8171898126602173, "logits/rejected": -1.014732003211975, "logps/chosen": -595.6168212890625, "logps/rejected": -1368.0377197265625, "loss": 0.0434, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11283586919307709, "rewards/margins": 0.3873719573020935, "rewards/rejected": -0.5002078413963318, "step": 2850 }, { "epoch": 0.54, "learning_rate": 2.5457084572493094e-06, "logits/chosen": -1.6847915649414062, "logits/rejected": -0.8486756086349487, "logps/chosen": -611.9779663085938, "logps/rejected": -1245.8311767578125, "loss": 0.067, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11453308165073395, "rewards/margins": 0.3070717453956604, "rewards/rejected": -0.42160478234291077, "step": 2860 }, { "epoch": 0.55, "learning_rate": 2.5290881645034932e-06, "logits/chosen": -1.5983669757843018, "logits/rejected": -1.0357117652893066, "logps/chosen": -570.3855590820312, "logps/rejected": -1264.300537109375, "loss": 0.0579, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11936721950769424, "rewards/margins": 0.3358164429664612, "rewards/rejected": -0.45518365502357483, "step": 2870 }, { "epoch": 0.55, "learning_rate": 2.5124665858468956e-06, "logits/chosen": -1.475404977798462, "logits/rejected": -0.9934619069099426, "logps/chosen": -562.2199096679688, "logps/rejected": -1392.272216796875, "loss": 0.0574, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13683505356311798, "rewards/margins": 0.36336854100227356, "rewards/rejected": -0.5002034902572632, "step": 2880 }, { "epoch": 0.55, "learning_rate": 2.4958444560755268e-06, "logits/chosen": -1.543084979057312, "logits/rejected": -0.8719257116317749, "logps/chosen": -697.3746948242188, "logps/rejected": -1420.4222412109375, "loss": 0.0386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.19695594906806946, "rewards/margins": 0.35786157846450806, "rewards/rejected": -0.5548174977302551, "step": 2890 }, { "epoch": 0.55, "learning_rate": 2.479222510009758e-06, "logits/chosen": -1.5835835933685303, "logits/rejected": -0.9517591595649719, "logps/chosen": -674.9082641601562, "logps/rejected": -1285.297119140625, "loss": 0.0815, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1738702654838562, "rewards/margins": 0.2946757674217224, "rewards/rejected": -0.4685460031032562, "step": 2900 }, { "epoch": 0.55, "learning_rate": 2.4626014824618418e-06, "logits/chosen": -1.5905416011810303, "logits/rejected": -0.8818572163581848, "logps/chosen": -649.43505859375, "logps/rejected": -1258.1546630859375, "loss": 0.0676, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13180610537528992, "rewards/margins": 0.34726306796073914, "rewards/rejected": -0.47906917333602905, "step": 2910 }, { "epoch": 0.56, "learning_rate": 2.445982108203422e-06, "logits/chosen": -1.4787397384643555, "logits/rejected": -0.7833604216575623, "logps/chosen": -612.2667846679688, "logps/rejected": -1210.948974609375, "loss": 0.0592, "rewards/accuracies": 0.875, "rewards/chosen": -0.1284160166978836, "rewards/margins": 0.31005439162254333, "rewards/rejected": -0.43847042322158813, "step": 2920 }, { "epoch": 0.56, "learning_rate": 2.4293651219330614e-06, "logits/chosen": -1.5488474369049072, "logits/rejected": -0.9995074272155762, "logps/chosen": -633.7059936523438, "logps/rejected": -1303.328857421875, "loss": 0.0551, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13731206953525543, "rewards/margins": 0.3035878539085388, "rewards/rejected": -0.44089993834495544, "step": 2930 }, { "epoch": 0.56, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -1.6039226055145264, "logits/rejected": -0.9585688710212708, "logps/chosen": -592.0299072265625, "logps/rejected": -1321.1495361328125, "loss": 0.0554, "rewards/accuracies": 0.875, "rewards/chosen": -0.11685065180063248, "rewards/margins": 0.3511469066143036, "rewards/rejected": -0.46799755096435547, "step": 2940 }, { "epoch": 0.56, "learning_rate": 2.3961412515904337e-06, "logits/chosen": -1.3695369958877563, "logits/rejected": -0.9800816774368286, "logps/chosen": -594.6244506835938, "logps/rejected": -1240.9417724609375, "loss": 0.0855, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14591816067695618, "rewards/margins": 0.3062663674354553, "rewards/rejected": -0.4521844983100891, "step": 2950 }, { "epoch": 0.56, "learning_rate": 2.3795358362575618e-06, "logits/chosen": -1.5104422569274902, "logits/rejected": -1.1466343402862549, "logps/chosen": -510.5021057128906, "logps/rejected": -1294.258544921875, "loss": 0.0478, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1302535980939865, "rewards/margins": 0.3396373689174652, "rewards/rejected": -0.4698910117149353, "step": 2960 }, { "epoch": 0.57, "learning_rate": 2.3629357463266e-06, "logits/chosen": -1.6058807373046875, "logits/rejected": -1.0046648979187012, "logps/chosen": -508.4608459472656, "logps/rejected": -1332.2764892578125, "loss": 0.0465, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06275545060634613, "rewards/margins": 0.3741517961025238, "rewards/rejected": -0.43690723180770874, "step": 2970 }, { "epoch": 0.57, "learning_rate": 2.346341715643601e-06, "logits/chosen": -1.4893066883087158, "logits/rejected": -0.8853636980056763, "logps/chosen": -449.1748046875, "logps/rejected": -1121.550537109375, "loss": 0.0551, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.063736692070961, "rewards/margins": 0.3206193447113037, "rewards/rejected": -0.38435599207878113, "step": 2980 }, { "epoch": 0.57, "learning_rate": 2.32975447778675e-06, "logits/chosen": -1.7218694686889648, "logits/rejected": -0.8579978942871094, "logps/chosen": -607.2933349609375, "logps/rejected": -1323.5452880859375, "loss": 0.0698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12219960987567902, "rewards/margins": 0.35332444310188293, "rewards/rejected": -0.47552403807640076, "step": 2990 }, { "epoch": 0.57, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.630824089050293, "logits/rejected": -1.1554635763168335, "logps/chosen": -639.3021240234375, "logps/rejected": -1315.35498046875, "loss": 0.087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14220556616783142, "rewards/margins": 0.30729326605796814, "rewards/rejected": -0.44949883222579956, "step": 3000 }, { "epoch": 0.57, "learning_rate": 2.296603313330355e-06, "logits/chosen": -1.4276154041290283, "logits/rejected": -0.9441580772399902, "logps/chosen": -727.0963134765625, "logps/rejected": -1390.979736328125, "loss": 0.1062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15815845131874084, "rewards/margins": 0.3167917728424072, "rewards/rejected": -0.47495022416114807, "step": 3010 }, { "epoch": 0.58, "learning_rate": 2.280040852256068e-06, "logits/chosen": -1.4897199869155884, "logits/rejected": -1.0120981931686401, "logps/chosen": -607.09521484375, "logps/rejected": -1325.50390625, "loss": 0.0785, "rewards/accuracies": 0.875, "rewards/chosen": -0.11801674216985703, "rewards/margins": 0.3201891779899597, "rewards/rejected": -0.4382059574127197, "step": 3020 }, { "epoch": 0.58, "learning_rate": 2.2634881149936576e-06, "logits/chosen": -1.4906480312347412, "logits/rejected": -1.0610196590423584, "logps/chosen": -488.85565185546875, "logps/rejected": -1112.202392578125, "loss": 0.0616, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0860569030046463, "rewards/margins": 0.2713943421840668, "rewards/rejected": -0.35745126008987427, "step": 3030 }, { "epoch": 0.58, "learning_rate": 2.246945833295836e-06, "logits/chosen": -1.4534862041473389, "logits/rejected": -0.9673159718513489, "logps/chosen": -605.3302001953125, "logps/rejected": -1201.044189453125, "loss": 0.076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.133608877658844, "rewards/margins": 0.2811334431171417, "rewards/rejected": -0.41474229097366333, "step": 3040 }, { "epoch": 0.58, "learning_rate": 2.230414738453104e-06, "logits/chosen": -1.767216682434082, "logits/rejected": -0.8330327868461609, "logps/chosen": -564.5643310546875, "logps/rejected": -1205.147705078125, "loss": 0.054, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09630884975194931, "rewards/margins": 0.3197113871574402, "rewards/rejected": -0.4160202443599701, "step": 3050 }, { "epoch": 0.58, "learning_rate": 2.2138955612614206e-06, "logits/chosen": -1.421764612197876, "logits/rejected": -1.0521514415740967, "logps/chosen": -628.13232421875, "logps/rejected": -1224.50830078125, "loss": 0.0946, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14950081706047058, "rewards/margins": 0.2471323311328888, "rewards/rejected": -0.396633118391037, "step": 3060 }, { "epoch": 0.58, "learning_rate": 2.1973890319898965e-06, "logits/chosen": -1.6031911373138428, "logits/rejected": -1.0214576721191406, "logps/chosen": -562.112060546875, "logps/rejected": -1285.865966796875, "loss": 0.0528, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.08155658096075058, "rewards/margins": 0.31444963812828064, "rewards/rejected": -0.3960062563419342, "step": 3070 }, { "epoch": 0.59, "learning_rate": 2.1808958803485134e-06, "logits/chosen": -1.4915971755981445, "logits/rejected": -1.0089516639709473, "logps/chosen": -610.0130004882812, "logps/rejected": -1197.3531494140625, "loss": 0.0601, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1019025444984436, "rewards/margins": 0.26567164063453674, "rewards/rejected": -0.36757418513298035, "step": 3080 }, { "epoch": 0.59, "learning_rate": 2.1644168354558623e-06, "logits/chosen": -1.6156599521636963, "logits/rejected": -1.1080366373062134, "logps/chosen": -492.7898864746094, "logps/rejected": -1027.3740234375, "loss": 0.0683, "rewards/accuracies": 0.75, "rewards/chosen": -0.05810853838920593, "rewards/margins": 0.2624923884868622, "rewards/rejected": -0.3206009268760681, "step": 3090 }, { "epoch": 0.59, "learning_rate": 2.1479526258069086e-06, "logits/chosen": -1.5177949666976929, "logits/rejected": -0.9924715757369995, "logps/chosen": -546.3245849609375, "logps/rejected": -1170.1224365234375, "loss": 0.0951, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08852513134479523, "rewards/margins": 0.27009642124176025, "rewards/rejected": -0.3586215376853943, "step": 3100 }, { "epoch": 0.59, "learning_rate": 2.1315039792407975e-06, "logits/chosen": -1.3530235290527344, "logits/rejected": -0.7246929407119751, "logps/chosen": -548.1713256835938, "logps/rejected": -1333.8492431640625, "loss": 0.0413, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09849376976490021, "rewards/margins": 0.3531506657600403, "rewards/rejected": -0.4516444206237793, "step": 3110 }, { "epoch": 0.59, "learning_rate": 2.115071622908666e-06, "logits/chosen": -1.4942448139190674, "logits/rejected": -0.7404344081878662, "logps/chosen": -500.0982360839844, "logps/rejected": -1106.1728515625, "loss": 0.0538, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06125213950872421, "rewards/margins": 0.31473299860954285, "rewards/rejected": -0.37598511576652527, "step": 3120 }, { "epoch": 0.6, "learning_rate": 2.0986562832415063e-06, "logits/chosen": -1.6442813873291016, "logits/rejected": -1.1128791570663452, "logps/chosen": -557.3236083984375, "logps/rejected": -1182.487060546875, "loss": 0.0558, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.04658445343375206, "rewards/margins": 0.33271127939224243, "rewards/rejected": -0.3792957663536072, "step": 3130 }, { "epoch": 0.6, "learning_rate": 2.082258685918047e-06, "logits/chosen": -1.6417179107666016, "logits/rejected": -1.0537028312683105, "logps/chosen": -473.1177673339844, "logps/rejected": -1105.520263671875, "loss": 0.0614, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05988591909408569, "rewards/margins": 0.2984519600868225, "rewards/rejected": -0.358337938785553, "step": 3140 }, { "epoch": 0.6, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.6194617748260498, "logits/rejected": -1.0565998554229736, "logps/chosen": -530.3972778320312, "logps/rejected": -1221.812744140625, "loss": 0.0723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07586511969566345, "rewards/margins": 0.324541836977005, "rewards/rejected": -0.40040698647499084, "step": 3150 }, { "epoch": 0.6, "learning_rate": 2.049519617063389e-06, "logits/chosen": -1.5704630613327026, "logits/rejected": -0.9416986703872681, "logps/chosen": -497.73974609375, "logps/rejected": -1204.732666015625, "loss": 0.0386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.07720668613910675, "rewards/margins": 0.3270443081855774, "rewards/rejected": -0.40425100922584534, "step": 3160 }, { "epoch": 0.6, "learning_rate": 2.033179592839792e-06, "logits/chosen": -1.8685951232910156, "logits/rejected": -1.069949984550476, "logps/chosen": -582.2603759765625, "logps/rejected": -1132.173095703125, "loss": 0.0761, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08046362549066544, "rewards/margins": 0.2879236340522766, "rewards/rejected": -0.3683873414993286, "step": 3170 }, { "epoch": 0.61, "learning_rate": 2.0168602055111175e-06, "logits/chosen": -1.5395227670669556, "logits/rejected": -1.046355962753296, "logps/chosen": -564.5604248046875, "logps/rejected": -1171.841064453125, "loss": 0.0757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10129845142364502, "rewards/margins": 0.29600051045417786, "rewards/rejected": -0.3972989022731781, "step": 3180 }, { "epoch": 0.61, "learning_rate": 2.0005621765142942e-06, "logits/chosen": -1.4990097284317017, "logits/rejected": -0.953599750995636, "logps/chosen": -579.7748413085938, "logps/rejected": -1242.611572265625, "loss": 0.0364, "rewards/accuracies": 0.875, "rewards/chosen": -0.08316659927368164, "rewards/margins": 0.32868489623069763, "rewards/rejected": -0.4118514657020569, "step": 3190 }, { "epoch": 0.61, "learning_rate": 1.9842862263420565e-06, "logits/chosen": -1.3561508655548096, "logits/rejected": -0.8739617466926575, "logps/chosen": -579.7095947265625, "logps/rejected": -1226.96728515625, "loss": 0.0817, "rewards/accuracies": 0.875, "rewards/chosen": -0.10601375252008438, "rewards/margins": 0.302290141582489, "rewards/rejected": -0.4083038866519928, "step": 3200 }, { "epoch": 0.61, "learning_rate": 1.9680330745110954e-06, "logits/chosen": -1.5132472515106201, "logits/rejected": -0.9904989004135132, "logps/chosen": -658.9536743164062, "logps/rejected": -1169.8291015625, "loss": 0.0817, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1283619999885559, "rewards/margins": 0.24815258383750916, "rewards/rejected": -0.37651458382606506, "step": 3210 }, { "epoch": 0.61, "learning_rate": 1.9518034395302413e-06, "logits/chosen": -1.6865606307983398, "logits/rejected": -0.724189281463623, "logps/chosen": -647.4058837890625, "logps/rejected": -1245.415771484375, "loss": 0.0529, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09649708122015, "rewards/margins": 0.3382863402366638, "rewards/rejected": -0.434783399105072, "step": 3220 }, { "epoch": 0.62, "learning_rate": 1.9355980388687145e-06, "logits/chosen": -1.61764657497406, "logits/rejected": -0.9387199282646179, "logps/chosen": -669.2235107421875, "logps/rejected": -1221.201416015625, "loss": 0.0648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13963374495506287, "rewards/margins": 0.3037804663181305, "rewards/rejected": -0.44341421127319336, "step": 3230 }, { "epoch": 0.62, "learning_rate": 1.9194175889243942e-06, "logits/chosen": -1.5610865354537964, "logits/rejected": -0.8656753301620483, "logps/chosen": -716.8016967773438, "logps/rejected": -1312.772216796875, "loss": 0.0645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15120446681976318, "rewards/margins": 0.3280597925186157, "rewards/rejected": -0.4792643189430237, "step": 3240 }, { "epoch": 0.62, "learning_rate": 1.903262804992156e-06, "logits/chosen": -1.409865140914917, "logits/rejected": -0.7154837846755981, "logps/chosen": -558.9967651367188, "logps/rejected": -1203.494873046875, "loss": 0.0566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09356357902288437, "rewards/margins": 0.32224196195602417, "rewards/rejected": -0.41580551862716675, "step": 3250 }, { "epoch": 0.62, "learning_rate": 1.8871344012322504e-06, "logits/chosen": -1.3692007064819336, "logits/rejected": -0.7519701719284058, "logps/chosen": -581.4006958007812, "logps/rejected": -1174.0390625, "loss": 0.0703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10192625224590302, "rewards/margins": 0.308377206325531, "rewards/rejected": -0.4103034436702728, "step": 3260 }, { "epoch": 0.62, "learning_rate": 1.8710330906387288e-06, "logits/chosen": -1.7411953210830688, "logits/rejected": -1.087471842765808, "logps/chosen": -503.864501953125, "logps/rejected": -1072.919677734375, "loss": 0.1001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09860959649085999, "rewards/margins": 0.25538548827171326, "rewards/rejected": -0.35399508476257324, "step": 3270 }, { "epoch": 0.62, "learning_rate": 1.8549595850079272e-06, "logits/chosen": -1.6818044185638428, "logits/rejected": -1.2432438135147095, "logps/chosen": -630.8226318359375, "logps/rejected": -1347.3629150390625, "loss": 0.0867, "rewards/accuracies": 0.875, "rewards/chosen": -0.12099182605743408, "rewards/margins": 0.32913532853126526, "rewards/rejected": -0.45012718439102173, "step": 3280 }, { "epoch": 0.63, "learning_rate": 1.8389145949069953e-06, "logits/chosen": -1.6793514490127563, "logits/rejected": -1.0356289148330688, "logps/chosen": -653.2823486328125, "logps/rejected": -1186.5311279296875, "loss": 0.1233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14252005517482758, "rewards/margins": 0.24804458022117615, "rewards/rejected": -0.3905646502971649, "step": 3290 }, { "epoch": 0.63, "learning_rate": 1.8228988296424877e-06, "logits/chosen": -1.4436924457550049, "logits/rejected": -1.0423951148986816, "logps/chosen": -534.2002563476562, "logps/rejected": -1150.822509765625, "loss": 0.0885, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08847598731517792, "rewards/margins": 0.2850678563117981, "rewards/rejected": -0.3735438287258148, "step": 3300 }, { "epoch": 0.63, "learning_rate": 1.806912997229008e-06, "logits/chosen": -1.5583360195159912, "logits/rejected": -0.9240388870239258, "logps/chosen": -547.8662109375, "logps/rejected": -1232.9605712890625, "loss": 0.0587, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09903976321220398, "rewards/margins": 0.3067070543766022, "rewards/rejected": -0.40574678778648376, "step": 3310 }, { "epoch": 0.63, "learning_rate": 1.7909578043579037e-06, "logits/chosen": -1.6759271621704102, "logits/rejected": -1.00883150100708, "logps/chosen": -617.1407470703125, "logps/rejected": -1213.87109375, "loss": 0.0595, "rewards/accuracies": 0.875, "rewards/chosen": -0.10988689959049225, "rewards/margins": 0.28930503129959106, "rewards/rejected": -0.39919185638427734, "step": 3320 }, { "epoch": 0.63, "learning_rate": 1.7750339563660346e-06, "logits/chosen": -1.8040192127227783, "logits/rejected": -1.1109973192214966, "logps/chosen": -591.2498168945312, "logps/rejected": -1209.500732421875, "loss": 0.0738, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09945808351039886, "rewards/margins": 0.2953023612499237, "rewards/rejected": -0.39476045966148376, "step": 3330 }, { "epoch": 0.64, "learning_rate": 1.759142157204583e-06, "logits/chosen": -1.4551546573638916, "logits/rejected": -1.2808643579483032, "logps/chosen": -430.88592529296875, "logps/rejected": -990.6643676757812, "loss": 0.1033, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07554015517234802, "rewards/margins": 0.23506391048431396, "rewards/rejected": -0.3106040358543396, "step": 3340 }, { "epoch": 0.64, "learning_rate": 1.7432831094079357e-06, "logits/chosen": -1.647020936012268, "logits/rejected": -0.9831321835517883, "logps/chosen": -474.423583984375, "logps/rejected": -1153.595947265625, "loss": 0.0561, "rewards/accuracies": 0.875, "rewards/chosen": -0.05415312573313713, "rewards/margins": 0.315816193819046, "rewards/rejected": -0.36996930837631226, "step": 3350 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -1.8123953342437744, "logits/rejected": -1.0507439374923706, "logps/chosen": -510.15374755859375, "logps/rejected": -1266.872802734375, "loss": 0.0711, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04254557937383652, "rewards/margins": 0.3346264958381653, "rewards/rejected": -0.37717205286026, "step": 3360 }, { "epoch": 0.64, "learning_rate": 1.7116660707763637e-06, "logits/chosen": -1.4582722187042236, "logits/rejected": -0.9082058668136597, "logps/chosen": -547.4808959960938, "logps/rejected": -1253.8936767578125, "loss": 0.0398, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05783357471227646, "rewards/margins": 0.33342987298965454, "rewards/rejected": -0.3912634551525116, "step": 3370 }, { "epoch": 0.64, "learning_rate": 1.695909477647054e-06, "logits/chosen": -1.4480509757995605, "logits/rejected": -0.9746176600456238, "logps/chosen": -525.312255859375, "logps/rejected": -1169.0672607421875, "loss": 0.0675, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07152949273586273, "rewards/margins": 0.29411083459854126, "rewards/rejected": -0.36564040184020996, "step": 3380 }, { "epoch": 0.65, "learning_rate": 1.6801884312319893e-06, "logits/chosen": -1.4334437847137451, "logits/rejected": -0.9882246255874634, "logps/chosen": -490.239013671875, "logps/rejected": -1054.856201171875, "loss": 0.0758, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.033695705235004425, "rewards/margins": 0.2827971577644348, "rewards/rejected": -0.31649279594421387, "step": 3390 }, { "epoch": 0.65, "learning_rate": 1.6645036265170314e-06, "logits/chosen": -1.5410234928131104, "logits/rejected": -0.8343310356140137, "logps/chosen": -517.603759765625, "logps/rejected": -1253.4979248046875, "loss": 0.0561, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07378663122653961, "rewards/margins": 0.3355935215950012, "rewards/rejected": -0.40938013792037964, "step": 3400 }, { "epoch": 0.65, "learning_rate": 1.648855756885893e-06, "logits/chosen": -1.642655611038208, "logits/rejected": -1.2012007236480713, "logps/chosen": -450.128173828125, "logps/rejected": -1080.1187744140625, "loss": 0.0879, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05434330180287361, "rewards/margins": 0.27073508501052856, "rewards/rejected": -0.3250783383846283, "step": 3410 }, { "epoch": 0.65, "learning_rate": 1.633245514089482e-06, "logits/chosen": -1.535819411277771, "logits/rejected": -0.8469101190567017, "logps/chosen": -515.0177612304688, "logps/rejected": -1162.944091796875, "loss": 0.0711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08549628406763077, "rewards/margins": 0.2991836369037628, "rewards/rejected": -0.3846798837184906, "step": 3420 }, { "epoch": 0.65, "learning_rate": 1.6176735882153284e-06, "logits/chosen": -1.6216903924942017, "logits/rejected": -0.785152792930603, "logps/chosen": -591.2189331054688, "logps/rejected": -1124.4949951171875, "loss": 0.0772, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11196261644363403, "rewards/margins": 0.2648715376853943, "rewards/rejected": -0.37683412432670593, "step": 3430 }, { "epoch": 0.66, "learning_rate": 1.6021406676570667e-06, "logits/chosen": -1.3090717792510986, "logits/rejected": -0.9269634485244751, "logps/chosen": -579.5616455078125, "logps/rejected": -1204.068359375, "loss": 0.0947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14344973862171173, "rewards/margins": 0.2711476683616638, "rewards/rejected": -0.41459742188453674, "step": 3440 }, { "epoch": 0.66, "learning_rate": 1.5866474390840126e-06, "logits/chosen": -1.6060562133789062, "logits/rejected": -1.0789166688919067, "logps/chosen": -631.2396240234375, "logps/rejected": -1278.0540771484375, "loss": 0.065, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12369465827941895, "rewards/margins": 0.3081647455692291, "rewards/rejected": -0.43185940384864807, "step": 3450 }, { "epoch": 0.66, "learning_rate": 1.5711945874108053e-06, "logits/chosen": -1.5090563297271729, "logits/rejected": -0.8889325261116028, "logps/chosen": -547.3897705078125, "logps/rejected": -1296.255859375, "loss": 0.1008, "rewards/accuracies": 0.875, "rewards/chosen": -0.1164555549621582, "rewards/margins": 0.3054220378398895, "rewards/rejected": -0.4218776226043701, "step": 3460 }, { "epoch": 0.66, "learning_rate": 1.5557827957671249e-06, "logits/chosen": -1.4225951433181763, "logits/rejected": -0.8890473246574402, "logps/chosen": -510.88232421875, "logps/rejected": -1235.585205078125, "loss": 0.0621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09390435367822647, "rewards/margins": 0.33430469036102295, "rewards/rejected": -0.4282090663909912, "step": 3470 }, { "epoch": 0.66, "learning_rate": 1.5404127454674994e-06, "logits/chosen": -1.4549505710601807, "logits/rejected": -0.9507797360420227, "logps/chosen": -447.9923400878906, "logps/rejected": -1089.358642578125, "loss": 0.0869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08430926501750946, "rewards/margins": 0.2995262145996094, "rewards/rejected": -0.38383546471595764, "step": 3480 }, { "epoch": 0.66, "learning_rate": 1.5250851159811809e-06, "logits/chosen": -1.4489725828170776, "logits/rejected": -0.8240424394607544, "logps/chosen": -536.1524658203125, "logps/rejected": -1204.070068359375, "loss": 0.0628, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09031540900468826, "rewards/margins": 0.32029175758361816, "rewards/rejected": -0.41060715913772583, "step": 3490 }, { "epoch": 0.67, "learning_rate": 1.509800584902108e-06, "logits/chosen": -1.6435320377349854, "logits/rejected": -0.9851503372192383, "logps/chosen": -394.79254150390625, "logps/rejected": -1075.1129150390625, "loss": 0.0555, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05022016167640686, "rewards/margins": 0.3278888165950775, "rewards/rejected": -0.37810903787612915, "step": 3500 }, { "epoch": 0.67, "learning_rate": 1.4945598279189565e-06, "logits/chosen": -1.6852319240570068, "logits/rejected": -0.7762208580970764, "logps/chosen": -584.0240478515625, "logps/rejected": -1283.089599609375, "loss": 0.0393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.07786907255649567, "rewards/margins": 0.36127179861068726, "rewards/rejected": -0.4391408860683441, "step": 3510 }, { "epoch": 0.67, "learning_rate": 1.4793635187852622e-06, "logits/chosen": -1.5712199211120605, "logits/rejected": -0.8487402200698853, "logps/chosen": -625.174560546875, "logps/rejected": -1259.09912109375, "loss": 0.0649, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1212628111243248, "rewards/margins": 0.28756266832351685, "rewards/rejected": -0.40882548689842224, "step": 3520 }, { "epoch": 0.67, "learning_rate": 1.4642123292896406e-06, "logits/chosen": -1.7730987071990967, "logits/rejected": -1.187359094619751, "logps/chosen": -506.89727783203125, "logps/rejected": -1099.273193359375, "loss": 0.0719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.055269528180360794, "rewards/margins": 0.2945236563682556, "rewards/rejected": -0.3497931659221649, "step": 3530 }, { "epoch": 0.67, "learning_rate": 1.4491069292260867e-06, "logits/chosen": -1.5588057041168213, "logits/rejected": -0.9192155599594116, "logps/chosen": -591.0391845703125, "logps/rejected": -1198.16015625, "loss": 0.0574, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08599583059549332, "rewards/margins": 0.29813671112060547, "rewards/rejected": -0.3841325342655182, "step": 3540 }, { "epoch": 0.68, "learning_rate": 1.4340479863643658e-06, "logits/chosen": -1.5649362802505493, "logits/rejected": -1.0527899265289307, "logps/chosen": -587.0765380859375, "logps/rejected": -1299.611572265625, "loss": 0.0652, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11504580825567245, "rewards/margins": 0.3227574825286865, "rewards/rejected": -0.43780332803726196, "step": 3550 }, { "epoch": 0.68, "learning_rate": 1.4190361664204936e-06, "logits/chosen": -1.5563275814056396, "logits/rejected": -1.1789706945419312, "logps/chosen": -463.9998474121094, "logps/rejected": -1109.9326171875, "loss": 0.0852, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05924314260482788, "rewards/margins": 0.2901732325553894, "rewards/rejected": -0.3494163751602173, "step": 3560 }, { "epoch": 0.68, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -1.5990869998931885, "logits/rejected": -0.8658841252326965, "logps/chosen": -562.2022705078125, "logps/rejected": -1232.999755859375, "loss": 0.0502, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06985044479370117, "rewards/margins": 0.32169193029403687, "rewards/rejected": -0.39154237508773804, "step": 3570 }, { "epoch": 0.68, "learning_rate": 1.3891565477051242e-06, "logits/chosen": -1.259183645248413, "logits/rejected": -1.056996464729309, "logps/chosen": -360.9638671875, "logps/rejected": -916.8385620117188, "loss": 0.0722, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.060729265213012695, "rewards/margins": 0.2513827681541443, "rewards/rejected": -0.312112033367157, "step": 3580 }, { "epoch": 0.68, "learning_rate": 1.3742900698325034e-06, "logits/chosen": -1.6714311838150024, "logits/rejected": -0.8850634694099426, "logps/chosen": -618.9185791015625, "logps/rejected": -1252.8792724609375, "loss": 0.0481, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13342010974884033, "rewards/margins": 0.3281671404838562, "rewards/rejected": -0.46158725023269653, "step": 3590 }, { "epoch": 0.69, "learning_rate": 1.3594733566170925e-06, "logits/chosen": -1.5407627820968628, "logits/rejected": -1.0272929668426514, "logps/chosen": -562.9507446289062, "logps/rejected": -1212.2821044921875, "loss": 0.0512, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11579425632953644, "rewards/margins": 0.3113517165184021, "rewards/rejected": -0.42714595794677734, "step": 3600 }, { "epoch": 0.69, "learning_rate": 1.3447070630665771e-06, "logits/chosen": -1.5532509088516235, "logits/rejected": -1.0339925289154053, "logps/chosen": -618.67626953125, "logps/rejected": -1198.66796875, "loss": 0.075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12184294313192368, "rewards/margins": 0.2842368483543396, "rewards/rejected": -0.40607982873916626, "step": 3610 }, { "epoch": 0.69, "learning_rate": 1.329991841959717e-06, "logits/chosen": -1.4766697883605957, "logits/rejected": -1.0048980712890625, "logps/chosen": -446.8675231933594, "logps/rejected": -1180.1754150390625, "loss": 0.0837, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.058239441365003586, "rewards/margins": 0.3347640931606293, "rewards/rejected": -0.39300355315208435, "step": 3620 }, { "epoch": 0.69, "learning_rate": 1.3153283438175036e-06, "logits/chosen": -1.6488538980484009, "logits/rejected": -1.1211696863174438, "logps/chosen": -476.58233642578125, "logps/rejected": -1119.6107177734375, "loss": 0.0622, "rewards/accuracies": 0.75, "rewards/chosen": -0.08521339297294617, "rewards/margins": 0.2965385317802429, "rewards/rejected": -0.3817519545555115, "step": 3630 }, { "epoch": 0.69, "learning_rate": 1.3007172168743854e-06, "logits/chosen": -1.5866855382919312, "logits/rejected": -1.1675742864608765, "logps/chosen": -620.083740234375, "logps/rejected": -1222.1119384765625, "loss": 0.0785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11164329200983047, "rewards/margins": 0.2959319055080414, "rewards/rejected": -0.40757519006729126, "step": 3640 }, { "epoch": 0.7, "learning_rate": 1.2861591070496193e-06, "logits/chosen": -1.6165597438812256, "logits/rejected": -0.9629543423652649, "logps/chosen": -572.568359375, "logps/rejected": -1131.6605224609375, "loss": 0.0532, "rewards/accuracies": 0.875, "rewards/chosen": -0.08107302337884903, "rewards/margins": 0.28896981477737427, "rewards/rejected": -0.3700428009033203, "step": 3650 }, { "epoch": 0.7, "learning_rate": 1.271654657918722e-06, "logits/chosen": -1.4744112491607666, "logits/rejected": -1.0718681812286377, "logps/chosen": -544.6527099609375, "logps/rejected": -1227.4991455078125, "loss": 0.0542, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1165962815284729, "rewards/margins": 0.28793099522590637, "rewards/rejected": -0.4045272767543793, "step": 3660 }, { "epoch": 0.7, "learning_rate": 1.2572045106850051e-06, "logits/chosen": -1.5404466390609741, "logits/rejected": -1.0062000751495361, "logps/chosen": -482.5479431152344, "logps/rejected": -1285.21142578125, "loss": 0.057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0981147438287735, "rewards/margins": 0.3251039981842041, "rewards/rejected": -0.4232187867164612, "step": 3670 }, { "epoch": 0.7, "learning_rate": 1.2428093041512418e-06, "logits/chosen": -1.6958799362182617, "logits/rejected": -1.104161024093628, "logps/chosen": -482.8475646972656, "logps/rejected": -1170.7442626953125, "loss": 0.072, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08815275132656097, "rewards/margins": 0.3066135048866272, "rewards/rejected": -0.3947662115097046, "step": 3680 }, { "epoch": 0.7, "learning_rate": 1.2284696746914216e-06, "logits/chosen": -1.5207383632659912, "logits/rejected": -1.0584545135498047, "logps/chosen": -582.7368774414062, "logps/rejected": -1198.0496826171875, "loss": 0.0751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12373187392950058, "rewards/margins": 0.27462083101272583, "rewards/rejected": -0.3983527421951294, "step": 3690 }, { "epoch": 0.7, "learning_rate": 1.2141862562226164e-06, "logits/chosen": -1.727847695350647, "logits/rejected": -0.9943065643310547, "logps/chosen": -509.9764709472656, "logps/rejected": -1151.262451171875, "loss": 0.0466, "rewards/accuracies": 0.875, "rewards/chosen": -0.05588286370038986, "rewards/margins": 0.31229880452156067, "rewards/rejected": -0.36818164587020874, "step": 3700 }, { "epoch": 0.71, "learning_rate": 1.1999596801769617e-06, "logits/chosen": -1.8949705362319946, "logits/rejected": -0.9455671310424805, "logps/chosen": -665.4451904296875, "logps/rejected": -1261.169677734375, "loss": 0.0799, "rewards/accuracies": 0.875, "rewards/chosen": -0.1065397709608078, "rewards/margins": 0.3085806965827942, "rewards/rejected": -0.41512051224708557, "step": 3710 }, { "epoch": 0.71, "learning_rate": 1.185790575473738e-06, "logits/chosen": -1.275059461593628, "logits/rejected": -0.8686116933822632, "logps/chosen": -545.46044921875, "logps/rejected": -1306.507568359375, "loss": 0.0753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09290392696857452, "rewards/margins": 0.314125120639801, "rewards/rejected": -0.40702906250953674, "step": 3720 }, { "epoch": 0.71, "learning_rate": 1.1716795684915728e-06, "logits/chosen": -1.5622496604919434, "logits/rejected": -1.0755438804626465, "logps/chosen": -472.22412109375, "logps/rejected": -1130.0338134765625, "loss": 0.0626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05678543448448181, "rewards/margins": 0.31124743819236755, "rewards/rejected": -0.36803287267684937, "step": 3730 }, { "epoch": 0.71, "learning_rate": 1.1576272830407418e-06, "logits/chosen": -1.4323136806488037, "logits/rejected": -0.9573151469230652, "logps/chosen": -519.433837890625, "logps/rejected": -1087.538818359375, "loss": 0.1109, "rewards/accuracies": 0.75, "rewards/chosen": -0.09503252804279327, "rewards/margins": 0.2525123953819275, "rewards/rejected": -0.34754490852355957, "step": 3740 }, { "epoch": 0.71, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.5816457271575928, "logits/rejected": -1.0379760265350342, "logps/chosen": -583.7271728515625, "logps/rejected": -1389.2052001953125, "loss": 0.0636, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08136254549026489, "rewards/margins": 0.3822881281375885, "rewards/rejected": -0.4636506140232086, "step": 3750 }, { "epoch": 0.72, "learning_rate": 1.129701358967123e-06, "logits/chosen": -1.5716511011123657, "logits/rejected": -0.927304744720459, "logps/chosen": -579.1673583984375, "logps/rejected": -1177.848876953125, "loss": 0.0707, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0907040387392044, "rewards/margins": 0.2821573317050934, "rewards/rejected": -0.372861385345459, "step": 3760 }, { "epoch": 0.72, "learning_rate": 1.11582895487554e-06, "logits/chosen": -1.665675401687622, "logits/rejected": -1.0130513906478882, "logps/chosen": -493.2945861816406, "logps/rejected": -1227.0123291015625, "loss": 0.0586, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05040637403726578, "rewards/margins": 0.33554786443710327, "rewards/rejected": -0.38595423102378845, "step": 3770 }, { "epoch": 0.72, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -1.416183352470398, "logits/rejected": -1.0358549356460571, "logps/chosen": -512.5576782226562, "logps/rejected": -1100.0, "loss": 0.0715, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10818634927272797, "rewards/margins": 0.26378339529037476, "rewards/rejected": -0.3719697594642639, "step": 3780 }, { "epoch": 0.72, "learning_rate": 1.0882683288671041e-06, "logits/chosen": -1.4680287837982178, "logits/rejected": -0.9518529176712036, "logps/chosen": -548.0827026367188, "logps/rejected": -1328.7269287109375, "loss": 0.0503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.07367765158414841, "rewards/margins": 0.3535308241844177, "rewards/rejected": -0.4272085130214691, "step": 3790 }, { "epoch": 0.72, "learning_rate": 1.0745813253325957e-06, "logits/chosen": -1.6463426351547241, "logits/rejected": -0.9568156003952026, "logps/chosen": -647.0377197265625, "logps/rejected": -1093.1588134765625, "loss": 0.0765, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09032820910215378, "rewards/margins": 0.251170814037323, "rewards/rejected": -0.3414990305900574, "step": 3800 }, { "epoch": 0.73, "learning_rate": 1.0609573357858166e-06, "logits/chosen": -1.671026587486267, "logits/rejected": -1.1713273525238037, "logps/chosen": -475.3999938964844, "logps/rejected": -1094.6204833984375, "loss": 0.077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07541962713003159, "rewards/margins": 0.2850693166255951, "rewards/rejected": -0.3604889512062073, "step": 3810 }, { "epoch": 0.73, "learning_rate": 1.0473969625072922e-06, "logits/chosen": -1.8827602863311768, "logits/rejected": -1.1715481281280518, "logps/chosen": -581.587890625, "logps/rejected": -1301.4676513671875, "loss": 0.0463, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09528975188732147, "rewards/margins": 0.347636878490448, "rewards/rejected": -0.44292664527893066, "step": 3820 }, { "epoch": 0.73, "learning_rate": 1.0339008049652427e-06, "logits/chosen": -1.6441001892089844, "logits/rejected": -0.9791193008422852, "logps/chosen": -703.3151245117188, "logps/rejected": -1367.9464111328125, "loss": 0.0797, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14000825583934784, "rewards/margins": 0.32136717438697815, "rewards/rejected": -0.46137547492980957, "step": 3830 }, { "epoch": 0.73, "learning_rate": 1.0204694597890814e-06, "logits/chosen": -1.3856605291366577, "logits/rejected": -0.8451806306838989, "logps/chosen": -538.6572875976562, "logps/rejected": -1143.777587890625, "loss": 0.0825, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12030009925365448, "rewards/margins": 0.28403720259666443, "rewards/rejected": -0.4043373167514801, "step": 3840 }, { "epoch": 0.73, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -1.4096628427505493, "logits/rejected": -0.9546631574630737, "logps/chosen": -471.92523193359375, "logps/rejected": -1200.522216796875, "loss": 0.0552, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10387523472309113, "rewards/margins": 0.31556516885757446, "rewards/rejected": -0.419440358877182, "step": 3850 }, { "epoch": 0.74, "learning_rate": 9.938035786999018e-07, "logits/chosen": -1.5193308591842651, "logits/rejected": -0.9365988969802856, "logps/chosen": -572.107421875, "logps/rejected": -1061.31689453125, "loss": 0.1134, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11436333507299423, "rewards/margins": 0.2368105947971344, "rewards/rejected": -0.35117393732070923, "step": 3860 }, { "epoch": 0.74, "learning_rate": 9.805702216149252e-07, "logits/chosen": -1.5686334371566772, "logits/rejected": -0.9215704202651978, "logps/chosen": -517.8717041015625, "logps/rejected": -1208.834228515625, "loss": 0.0644, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09085571765899658, "rewards/margins": 0.3140665292739868, "rewards/rejected": -0.4049221873283386, "step": 3870 }, { "epoch": 0.74, "learning_rate": 9.674040344998056e-07, "logits/chosen": -1.3506265878677368, "logits/rejected": -0.9446635246276855, "logps/chosen": -512.5413818359375, "logps/rejected": -1294.516357421875, "loss": 0.0453, "rewards/accuracies": 0.875, "rewards/chosen": -0.12554454803466797, "rewards/margins": 0.3319617509841919, "rewards/rejected": -0.45750635862350464, "step": 3880 }, { "epoch": 0.74, "learning_rate": 9.543055993968339e-07, "logits/chosen": -1.6407482624053955, "logits/rejected": -1.1494085788726807, "logps/chosen": -494.1866149902344, "logps/rejected": -1177.568115234375, "loss": 0.0758, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11050983518362045, "rewards/margins": 0.3092818558216095, "rewards/rejected": -0.41979169845581055, "step": 3890 }, { "epoch": 0.74, "learning_rate": 9.412754953531664e-07, "logits/chosen": -1.5899689197540283, "logits/rejected": -0.9618569612503052, "logps/chosen": -592.8374633789062, "logps/rejected": -1161.0634765625, "loss": 0.0644, "rewards/accuracies": 0.875, "rewards/chosen": -0.10876324027776718, "rewards/margins": 0.295836865901947, "rewards/rejected": -0.40460005402565, "step": 3900 }, { "epoch": 0.74, "learning_rate": 9.283142983952231e-07, "logits/chosen": -1.3185956478118896, "logits/rejected": -0.7824636101722717, "logps/chosen": -538.34912109375, "logps/rejected": -1115.861328125, "loss": 0.0765, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14305037260055542, "rewards/margins": 0.2506643235683441, "rewards/rejected": -0.3937147259712219, "step": 3910 }, { "epoch": 0.75, "learning_rate": 9.154225815032242e-07, "logits/chosen": -1.5346519947052002, "logits/rejected": -0.8861868977546692, "logps/chosen": -551.1466064453125, "logps/rejected": -1097.8878173828125, "loss": 0.0767, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10631624609231949, "rewards/margins": 0.27936822175979614, "rewards/rejected": -0.38568446040153503, "step": 3920 }, { "epoch": 0.75, "learning_rate": 9.026009145858608e-07, "logits/chosen": -1.7078043222427368, "logits/rejected": -0.961190402507782, "logps/chosen": -557.4398193359375, "logps/rejected": -1204.511474609375, "loss": 0.054, "rewards/accuracies": 0.875, "rewards/chosen": -0.10399410873651505, "rewards/margins": 0.31865912675857544, "rewards/rejected": -0.4226532578468323, "step": 3930 }, { "epoch": 0.75, "learning_rate": 8.898498644550973e-07, "logits/chosen": -1.577141284942627, "logits/rejected": -0.9125620722770691, "logps/chosen": -578.0216064453125, "logps/rejected": -1236.27392578125, "loss": 0.0674, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1092778891324997, "rewards/margins": 0.3120391368865967, "rewards/rejected": -0.4213170111179352, "step": 3940 }, { "epoch": 0.75, "learning_rate": 8.771699948011203e-07, "logits/chosen": -1.5836879014968872, "logits/rejected": -1.0778744220733643, "logps/chosen": -575.2800903320312, "logps/rejected": -1225.6197509765625, "loss": 0.0894, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10885939747095108, "rewards/margins": 0.3017827272415161, "rewards/rejected": -0.410642147064209, "step": 3950 }, { "epoch": 0.75, "learning_rate": 8.645618661674144e-07, "logits/chosen": -1.499451994895935, "logits/rejected": -0.9791691899299622, "logps/chosen": -474.6846618652344, "logps/rejected": -1006.2005004882812, "loss": 0.1105, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0962987095117569, "rewards/margins": 0.2620392441749573, "rewards/rejected": -0.35833796858787537, "step": 3960 }, { "epoch": 0.76, "learning_rate": 8.520260359259822e-07, "logits/chosen": -1.5831758975982666, "logits/rejected": -1.1838436126708984, "logps/chosen": -551.3702392578125, "logps/rejected": -1213.033447265625, "loss": 0.1138, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.106439970433712, "rewards/margins": 0.28389599919319153, "rewards/rejected": -0.39033594727516174, "step": 3970 }, { "epoch": 0.76, "learning_rate": 8.395630582527075e-07, "logits/chosen": -1.674072265625, "logits/rejected": -0.7699130773544312, "logps/chosen": -510.9281311035156, "logps/rejected": -1112.57763671875, "loss": 0.048, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07025657594203949, "rewards/margins": 0.30410030484199524, "rewards/rejected": -0.37435686588287354, "step": 3980 }, { "epoch": 0.76, "learning_rate": 8.271734841028553e-07, "logits/chosen": -1.3906548023223877, "logits/rejected": -0.737972617149353, "logps/chosen": -568.7453002929688, "logps/rejected": -1248.777099609375, "loss": 0.0676, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13117803633213043, "rewards/margins": 0.30835506319999695, "rewards/rejected": -0.4395330846309662, "step": 3990 }, { "epoch": 0.76, "learning_rate": 8.148578611867114e-07, "logits/chosen": -1.6215349435806274, "logits/rejected": -0.7484738230705261, "logps/chosen": -614.5372314453125, "logps/rejected": -1331.172607421875, "loss": 0.0462, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1255607157945633, "rewards/margins": 0.3478773832321167, "rewards/rejected": -0.4734380841255188, "step": 4000 }, { "epoch": 0.76, "learning_rate": 8.026167339453792e-07, "logits/chosen": -1.3855167627334595, "logits/rejected": -1.0264958143234253, "logps/chosen": -564.0972900390625, "logps/rejected": -1141.349609375, "loss": 0.0905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11972753703594208, "rewards/margins": 0.2558014690876007, "rewards/rejected": -0.3755289912223816, "step": 4010 }, { "epoch": 0.77, "learning_rate": 7.904506435266998e-07, "logits/chosen": -1.6003910303115845, "logits/rejected": -1.0422978401184082, "logps/chosen": -550.9716796875, "logps/rejected": -1115.030517578125, "loss": 0.0766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11674098670482635, "rewards/margins": 0.26603561639785767, "rewards/rejected": -0.3827766180038452, "step": 4020 }, { "epoch": 0.77, "learning_rate": 7.783601277613378e-07, "logits/chosen": -1.427156686782837, "logits/rejected": -1.0377559661865234, "logps/chosen": -507.49102783203125, "logps/rejected": -1206.0267333984375, "loss": 0.0485, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12479288876056671, "rewards/margins": 0.3031744360923767, "rewards/rejected": -0.42796725034713745, "step": 4030 }, { "epoch": 0.77, "learning_rate": 7.66345721139003e-07, "logits/chosen": -1.3637679815292358, "logits/rejected": -1.0013728141784668, "logps/chosen": -511.2125549316406, "logps/rejected": -1133.815185546875, "loss": 0.0831, "rewards/accuracies": 0.75, "rewards/chosen": -0.11282964795827866, "rewards/margins": 0.2991596758365631, "rewards/rejected": -0.4119893014431, "step": 4040 }, { "epoch": 0.77, "learning_rate": 7.544079547848183e-07, "logits/chosen": -1.6487575769424438, "logits/rejected": -1.1513216495513916, "logps/chosen": -475.4693298339844, "logps/rejected": -1217.268310546875, "loss": 0.0694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0943184494972229, "rewards/margins": 0.32787054777145386, "rewards/rejected": -0.42218899726867676, "step": 4050 }, { "epoch": 0.77, "learning_rate": 7.425473564358457e-07, "logits/chosen": -1.631055235862732, "logits/rejected": -0.8726965188980103, "logps/chosen": -664.9032592773438, "logps/rejected": -1289.385498046875, "loss": 0.0738, "rewards/accuracies": 0.875, "rewards/chosen": -0.14752522110939026, "rewards/margins": 0.3137827515602112, "rewards/rejected": -0.46130794286727905, "step": 4060 }, { "epoch": 0.78, "learning_rate": 7.307644504177539e-07, "logits/chosen": -1.548463225364685, "logits/rejected": -1.0570564270019531, "logps/chosen": -530.0663452148438, "logps/rejected": -1171.269775390625, "loss": 0.0673, "rewards/accuracies": 0.875, "rewards/chosen": -0.1084066778421402, "rewards/margins": 0.29972022771835327, "rewards/rejected": -0.40812692046165466, "step": 4070 }, { "epoch": 0.78, "learning_rate": 7.190597576216385e-07, "logits/chosen": -1.524394154548645, "logits/rejected": -0.9965178370475769, "logps/chosen": -584.171142578125, "logps/rejected": -1225.20654296875, "loss": 0.0845, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13969434797763824, "rewards/margins": 0.2867378890514374, "rewards/rejected": -0.4264322817325592, "step": 4080 }, { "epoch": 0.78, "learning_rate": 7.074337954809945e-07, "logits/chosen": -1.565197229385376, "logits/rejected": -1.1793510913848877, "logps/chosen": -513.2730712890625, "logps/rejected": -1081.368408203125, "loss": 0.0747, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11965713649988174, "rewards/margins": 0.256893515586853, "rewards/rejected": -0.3765506446361542, "step": 4090 }, { "epoch": 0.78, "learning_rate": 6.958870779488447e-07, "logits/chosen": -1.3191499710083008, "logits/rejected": -1.0770232677459717, "logps/chosen": -588.6995849609375, "logps/rejected": -1273.339111328125, "loss": 0.0705, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13210487365722656, "rewards/margins": 0.285585880279541, "rewards/rejected": -0.4176907539367676, "step": 4100 }, { "epoch": 0.78, "learning_rate": 6.844201154750176e-07, "logits/chosen": -1.632425308227539, "logits/rejected": -1.1742222309112549, "logps/chosen": -581.7577514648438, "logps/rejected": -1191.5350341796875, "loss": 0.0732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09745045006275177, "rewards/margins": 0.3215068578720093, "rewards/rejected": -0.41895729303359985, "step": 4110 }, { "epoch": 0.78, "learning_rate": 6.730334149835788e-07, "logits/chosen": -1.598053216934204, "logits/rejected": -1.1327468156814575, "logps/chosen": -611.4447631835938, "logps/rejected": -1131.943115234375, "loss": 0.0696, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11202128231525421, "rewards/margins": 0.2611934244632721, "rewards/rejected": -0.3732147216796875, "step": 4120 }, { "epoch": 0.79, "learning_rate": 6.617274798504286e-07, "logits/chosen": -1.5097488164901733, "logits/rejected": -1.096771001815796, "logps/chosen": -632.8778076171875, "logps/rejected": -1227.204345703125, "loss": 0.0667, "rewards/accuracies": 0.875, "rewards/chosen": -0.09271929413080215, "rewards/margins": 0.3107157349586487, "rewards/rejected": -0.4034350514411926, "step": 4130 }, { "epoch": 0.79, "learning_rate": 6.505028098810407e-07, "logits/chosen": -1.4080469608306885, "logits/rejected": -0.762841522693634, "logps/chosen": -522.7771606445312, "logps/rejected": -1254.11181640625, "loss": 0.0451, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10984624922275543, "rewards/margins": 0.32150495052337646, "rewards/rejected": -0.4313511848449707, "step": 4140 }, { "epoch": 0.79, "learning_rate": 6.393599012883709e-07, "logits/chosen": -1.4583604335784912, "logits/rejected": -0.8535671234130859, "logps/chosen": -446.69268798828125, "logps/rejected": -977.5573120117188, "loss": 0.0725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08450157195329666, "rewards/margins": 0.2547328472137451, "rewards/rejected": -0.3392344117164612, "step": 4150 }, { "epoch": 0.79, "learning_rate": 6.282992466709247e-07, "logits/chosen": -1.353318691253662, "logits/rejected": -0.9638460278511047, "logps/chosen": -498.1649475097656, "logps/rejected": -1317.721923828125, "loss": 0.055, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11576496064662933, "rewards/margins": 0.34007635712623596, "rewards/rejected": -0.4558412432670593, "step": 4160 }, { "epoch": 0.79, "learning_rate": 6.17321334990973e-07, "logits/chosen": -1.6091725826263428, "logits/rejected": -1.1610180139541626, "logps/chosen": -663.9533081054688, "logps/rejected": -1272.82470703125, "loss": 0.1, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13398803770542145, "rewards/margins": 0.2794187664985657, "rewards/rejected": -0.4134067893028259, "step": 4170 }, { "epoch": 0.8, "learning_rate": 6.064266515529419e-07, "logits/chosen": -1.58311927318573, "logits/rejected": -1.1838868856430054, "logps/chosen": -530.5707397460938, "logps/rejected": -1247.7623291015625, "loss": 0.0641, "rewards/accuracies": 0.875, "rewards/chosen": -0.1168406754732132, "rewards/margins": 0.30344682931900024, "rewards/rejected": -0.42028751969337463, "step": 4180 }, { "epoch": 0.8, "learning_rate": 5.956156779819586e-07, "logits/chosen": -1.5429699420928955, "logits/rejected": -0.9784320592880249, "logps/chosen": -561.57958984375, "logps/rejected": -1228.4150390625, "loss": 0.0637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1092168539762497, "rewards/margins": 0.3057827055454254, "rewards/rejected": -0.4149995744228363, "step": 4190 }, { "epoch": 0.8, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1.4399769306182861, "logits/rejected": -1.0240304470062256, "logps/chosen": -719.931396484375, "logps/rejected": -1181.3548583984375, "loss": 0.1283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16944709420204163, "rewards/margins": 0.2121535837650299, "rewards/rejected": -0.3816007077693939, "step": 4200 }, { "epoch": 0.8, "learning_rate": 5.742467684175473e-07, "logits/chosen": -1.7172958850860596, "logits/rejected": -0.8663654327392578, "logps/chosen": -643.3468017578125, "logps/rejected": -1167.5980224609375, "loss": 0.0659, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12012696266174316, "rewards/margins": 0.28652092814445496, "rewards/rejected": -0.40664786100387573, "step": 4210 }, { "epoch": 0.8, "learning_rate": 5.636897770870667e-07, "logits/chosen": -1.4587557315826416, "logits/rejected": -0.9724730253219604, "logps/chosen": -552.9734497070312, "logps/rejected": -1146.55322265625, "loss": 0.0693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11410423368215561, "rewards/margins": 0.2997921407222748, "rewards/rejected": -0.4138964116573334, "step": 4220 }, { "epoch": 0.81, "learning_rate": 5.532183849077651e-07, "logits/chosen": -1.3316497802734375, "logits/rejected": -0.9100072979927063, "logps/chosen": -582.8646240234375, "logps/rejected": -1303.5687255859375, "loss": 0.075, "rewards/accuracies": 0.875, "rewards/chosen": -0.09175514429807663, "rewards/margins": 0.33713623881340027, "rewards/rejected": -0.4288913607597351, "step": 4230 }, { "epoch": 0.81, "learning_rate": 5.428330547921809e-07, "logits/chosen": -1.5007743835449219, "logits/rejected": -0.6980355381965637, "logps/chosen": -580.4159545898438, "logps/rejected": -1250.198486328125, "loss": 0.0566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09494878351688385, "rewards/margins": 0.3454148471355438, "rewards/rejected": -0.44036364555358887, "step": 4240 }, { "epoch": 0.81, "learning_rate": 5.32534245848278e-07, "logits/chosen": -1.7881839275360107, "logits/rejected": -0.9309795498847961, "logps/chosen": -604.7741088867188, "logps/rejected": -1202.27880859375, "loss": 0.0643, "rewards/accuracies": 0.875, "rewards/chosen": -0.12172114849090576, "rewards/margins": 0.2990148663520813, "rewards/rejected": -0.42073601484298706, "step": 4250 }, { "epoch": 0.81, "learning_rate": 5.223224133591475e-07, "logits/chosen": -1.5164127349853516, "logits/rejected": -0.9188127517700195, "logps/chosen": -619.7620849609375, "logps/rejected": -1258.8397216796875, "loss": 0.0661, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10920927673578262, "rewards/margins": 0.3095013201236725, "rewards/rejected": -0.4187105596065521, "step": 4260 }, { "epoch": 0.81, "learning_rate": 5.121980087628802e-07, "logits/chosen": -1.653839349746704, "logits/rejected": -1.0274848937988281, "logps/chosen": -611.0381469726562, "logps/rejected": -1282.819091796875, "loss": 0.0573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12401090562343597, "rewards/margins": 0.30650681257247925, "rewards/rejected": -0.4305177330970764, "step": 4270 }, { "epoch": 0.82, "learning_rate": 5.021614796326155e-07, "logits/chosen": -1.6808589696884155, "logits/rejected": -1.0335091352462769, "logps/chosen": -587.5392456054688, "logps/rejected": -1243.9168701171875, "loss": 0.0807, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11875418573617935, "rewards/margins": 0.32050734758377075, "rewards/rejected": -0.4392614960670471, "step": 4280 }, { "epoch": 0.82, "learning_rate": 4.922132696567463e-07, "logits/chosen": -1.5289744138717651, "logits/rejected": -0.8728163838386536, "logps/chosen": -581.1839599609375, "logps/rejected": -1340.7869873046875, "loss": 0.0481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10356955230236053, "rewards/margins": 0.3660332262516022, "rewards/rejected": -0.4696027636528015, "step": 4290 }, { "epoch": 0.82, "learning_rate": 4.823538186193097e-07, "logits/chosen": -1.6917448043823242, "logits/rejected": -1.0616384744644165, "logps/chosen": -584.0740966796875, "logps/rejected": -1156.746826171875, "loss": 0.0751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11137738078832626, "rewards/margins": 0.2847217619419098, "rewards/rejected": -0.3960992097854614, "step": 4300 }, { "epoch": 0.82, "learning_rate": 4.725835623805494e-07, "logits/chosen": -1.7799545526504517, "logits/rejected": -1.0782766342163086, "logps/chosen": -671.3793334960938, "logps/rejected": -1264.038330078125, "loss": 0.0916, "rewards/accuracies": 0.875, "rewards/chosen": -0.15113036334514618, "rewards/margins": 0.29365235567092896, "rewards/rejected": -0.44478267431259155, "step": 4310 }, { "epoch": 0.82, "learning_rate": 4.6290293285763816e-07, "logits/chosen": -1.568554162979126, "logits/rejected": -1.0866026878356934, "logps/chosen": -611.8372802734375, "logps/rejected": -1266.229736328125, "loss": 0.0709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11851924657821655, "rewards/margins": 0.29408353567123413, "rewards/rejected": -0.4126027524471283, "step": 4320 }, { "epoch": 0.82, "learning_rate": 4.533123580055909e-07, "logits/chosen": -1.5841357707977295, "logits/rejected": -0.8760364651679993, "logps/chosen": -622.0775756835938, "logps/rejected": -1131.180908203125, "loss": 0.086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1051706075668335, "rewards/margins": 0.26066774129867554, "rewards/rejected": -0.36583834886550903, "step": 4330 }, { "epoch": 0.83, "learning_rate": 4.438122617983442e-07, "logits/chosen": -1.7880083322525024, "logits/rejected": -1.1745917797088623, "logps/chosen": -556.8238525390625, "logps/rejected": -1160.031982421875, "loss": 0.0511, "rewards/accuracies": 0.875, "rewards/chosen": -0.10341344028711319, "rewards/margins": 0.3007115423679352, "rewards/rejected": -0.4041249752044678, "step": 4340 }, { "epoch": 0.83, "learning_rate": 4.344030642100133e-07, "logits/chosen": -1.4846160411834717, "logits/rejected": -0.7649267315864563, "logps/chosen": -535.6936645507812, "logps/rejected": -1125.717041015625, "loss": 0.0646, "rewards/accuracies": 0.875, "rewards/chosen": -0.08845969289541245, "rewards/margins": 0.31220993399620056, "rewards/rejected": -0.4006696343421936, "step": 4350 }, { "epoch": 0.83, "learning_rate": 4.250851811963236e-07, "logits/chosen": -1.4102054834365845, "logits/rejected": -0.9203283190727234, "logps/chosen": -609.0619506835938, "logps/rejected": -1315.4090576171875, "loss": 0.0778, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14391538500785828, "rewards/margins": 0.2974294126033783, "rewards/rejected": -0.4413447380065918, "step": 4360 }, { "epoch": 0.83, "learning_rate": 4.158590246762278e-07, "logits/chosen": -1.4081202745437622, "logits/rejected": -1.04823899269104, "logps/chosen": -553.0235595703125, "logps/rejected": -1091.3880615234375, "loss": 0.0946, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12480837106704712, "rewards/margins": 0.2424461394548416, "rewards/rejected": -0.36725446581840515, "step": 4370 }, { "epoch": 0.83, "learning_rate": 4.0672500251369204e-07, "logits/chosen": -1.7001903057098389, "logits/rejected": -1.0566041469573975, "logps/chosen": -509.71173095703125, "logps/rejected": -1251.2403564453125, "loss": 0.0554, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09820754826068878, "rewards/margins": 0.3354089856147766, "rewards/rejected": -0.4336165487766266, "step": 4380 }, { "epoch": 0.84, "learning_rate": 3.976835184996644e-07, "logits/chosen": -1.4017540216445923, "logits/rejected": -0.9382025599479675, "logps/chosen": -563.7012939453125, "logps/rejected": -1116.6812744140625, "loss": 0.0802, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11456887423992157, "rewards/margins": 0.2591772973537445, "rewards/rejected": -0.37374621629714966, "step": 4390 }, { "epoch": 0.84, "learning_rate": 3.887349723342304e-07, "logits/chosen": -1.4030841588974, "logits/rejected": -0.8099882006645203, "logps/chosen": -509.5604553222656, "logps/rejected": -1305.5618896484375, "loss": 0.0575, "rewards/accuracies": 0.875, "rewards/chosen": -0.08357210457324982, "rewards/margins": 0.340532124042511, "rewards/rejected": -0.4241042733192444, "step": 4400 }, { "epoch": 0.84, "learning_rate": 3.798797596089351e-07, "logits/chosen": -1.508704662322998, "logits/rejected": -0.7648983001708984, "logps/chosen": -666.0118408203125, "logps/rejected": -1355.562255859375, "loss": 0.0627, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.16047951579093933, "rewards/margins": 0.33138307929039, "rewards/rejected": -0.49186262488365173, "step": 4410 }, { "epoch": 0.84, "learning_rate": 3.711182717893011e-07, "logits/chosen": -1.6102008819580078, "logits/rejected": -0.9493368864059448, "logps/chosen": -687.1583862304688, "logps/rejected": -1226.296630859375, "loss": 0.1014, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.155551940202713, "rewards/margins": 0.2699928879737854, "rewards/rejected": -0.4255448281764984, "step": 4420 }, { "epoch": 0.84, "learning_rate": 3.624508961975215e-07, "logits/chosen": -1.6212804317474365, "logits/rejected": -1.1436877250671387, "logps/chosen": -575.4029541015625, "logps/rejected": -1286.482177734375, "loss": 0.0744, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1249982938170433, "rewards/margins": 0.3159942030906677, "rewards/rejected": -0.4409925043582916, "step": 4430 }, { "epoch": 0.85, "learning_rate": 3.538780159953348e-07, "logits/chosen": -1.238149642944336, "logits/rejected": -0.9261630773544312, "logps/chosen": -583.4049072265625, "logps/rejected": -1233.3751220703125, "loss": 0.0636, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1378888636827469, "rewards/margins": 0.2824627161026001, "rewards/rejected": -0.4203515946865082, "step": 4440 }, { "epoch": 0.85, "learning_rate": 3.454000101670901e-07, "logits/chosen": -1.3475556373596191, "logits/rejected": -0.9213441610336304, "logps/chosen": -625.7487182617188, "logps/rejected": -1175.872314453125, "loss": 0.0997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1552494466304779, "rewards/margins": 0.24734191596508026, "rewards/rejected": -0.40259137749671936, "step": 4450 }, { "epoch": 0.85, "learning_rate": 3.3701725350299143e-07, "logits/chosen": -1.5352661609649658, "logits/rejected": -1.1658331155776978, "logps/chosen": -521.9005126953125, "logps/rejected": -1237.071044921875, "loss": 0.0584, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11491229385137558, "rewards/margins": 0.31515389680862427, "rewards/rejected": -0.43006619811058044, "step": 4460 }, { "epoch": 0.85, "learning_rate": 3.2873011658252796e-07, "logits/chosen": -1.5209187269210815, "logits/rejected": -0.6413207054138184, "logps/chosen": -619.4137573242188, "logps/rejected": -1321.85107421875, "loss": 0.0355, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10239653289318085, "rewards/margins": 0.3683980703353882, "rewards/rejected": -0.47079458832740784, "step": 4470 }, { "epoch": 0.85, "learning_rate": 3.2053896575809426e-07, "logits/chosen": -1.7566978931427002, "logits/rejected": -1.147778868675232, "logps/chosen": -541.6817016601562, "logps/rejected": -1217.3521728515625, "loss": 0.06, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10900872945785522, "rewards/margins": 0.3140162527561188, "rewards/rejected": -0.423024982213974, "step": 4480 }, { "epoch": 0.86, "learning_rate": 3.124441631387931e-07, "logits/chosen": -1.3777107000350952, "logits/rejected": -0.8700397610664368, "logps/chosen": -538.3272094726562, "logps/rejected": -1210.5562744140625, "loss": 0.089, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09773439168930054, "rewards/margins": 0.3020879328250885, "rewards/rejected": -0.39982232451438904, "step": 4490 }, { "epoch": 0.86, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.3997788429260254, "logits/rejected": -0.8937880396842957, "logps/chosen": -535.1387939453125, "logps/rejected": -1171.378662109375, "loss": 0.073, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1069500669836998, "rewards/margins": 0.30229440331459045, "rewards/rejected": -0.40924444794654846, "step": 4500 }, { "epoch": 0.86, "learning_rate": 2.9654502963968575e-07, "logits/chosen": -1.4595518112182617, "logits/rejected": -0.9943321943283081, "logps/chosen": -663.2821655273438, "logps/rejected": -1067.5167236328125, "loss": 0.1362, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14275141060352325, "rewards/margins": 0.21759268641471863, "rewards/rejected": -0.3603441119194031, "step": 4510 }, { "epoch": 0.86, "learning_rate": 2.8874140161849915e-07, "logits/chosen": -1.4665237665176392, "logits/rejected": -0.9472633600234985, "logps/chosen": -534.7359008789062, "logps/rejected": -1252.173583984375, "loss": 0.0576, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10868203639984131, "rewards/margins": 0.32610076665878296, "rewards/rejected": -0.43478280305862427, "step": 4520 }, { "epoch": 0.86, "learning_rate": 2.810355274886148e-07, "logits/chosen": -1.4627294540405273, "logits/rejected": -0.7866867780685425, "logps/chosen": -580.9840087890625, "logps/rejected": -1238.630859375, "loss": 0.056, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06367157399654388, "rewards/margins": 0.33369964361190796, "rewards/rejected": -0.39737120270729065, "step": 4530 }, { "epoch": 0.86, "learning_rate": 2.7342774790633686e-07, "logits/chosen": -1.4155508279800415, "logits/rejected": -0.8982292413711548, "logps/chosen": -569.6241455078125, "logps/rejected": -1352.642333984375, "loss": 0.0385, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.12062004953622818, "rewards/margins": 0.3387775719165802, "rewards/rejected": -0.4593976140022278, "step": 4540 }, { "epoch": 0.87, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -1.459616780281067, "logits/rejected": -1.2273374795913696, "logps/chosen": -510.34149169921875, "logps/rejected": -1148.1497802734375, "loss": 0.0898, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11455903947353363, "rewards/margins": 0.27048274874687195, "rewards/rejected": -0.38504183292388916, "step": 4550 }, { "epoch": 0.87, "learning_rate": 2.58507813312448e-07, "logits/chosen": -1.6770299673080444, "logits/rejected": -1.0825966596603394, "logps/chosen": -490.6595764160156, "logps/rejected": -1214.4444580078125, "loss": 0.0705, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1060783639550209, "rewards/margins": 0.30906233191490173, "rewards/rejected": -0.41514068841934204, "step": 4560 }, { "epoch": 0.87, "learning_rate": 2.511963178716648e-07, "logits/chosen": -1.7969233989715576, "logits/rejected": -1.0599218606948853, "logps/chosen": -517.402587890625, "logps/rejected": -1123.163330078125, "loss": 0.0669, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09941872954368591, "rewards/margins": 0.28655630350112915, "rewards/rejected": -0.38597503304481506, "step": 4570 }, { "epoch": 0.87, "learning_rate": 2.439842360909864e-07, "logits/chosen": -1.678088903427124, "logits/rejected": -0.8970896601676941, "logps/chosen": -642.3028564453125, "logps/rejected": -1249.11767578125, "loss": 0.0721, "rewards/accuracies": 0.875, "rewards/chosen": -0.13762176036834717, "rewards/margins": 0.3055310547351837, "rewards/rejected": -0.4431528151035309, "step": 4580 }, { "epoch": 0.87, "learning_rate": 2.3687188679746314e-07, "logits/chosen": -1.7187156677246094, "logits/rejected": -1.1094046831130981, "logps/chosen": -660.9083862304688, "logps/rejected": -1219.8033447265625, "loss": 0.0702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11426536738872528, "rewards/margins": 0.27515870332717896, "rewards/rejected": -0.38942405581474304, "step": 4590 }, { "epoch": 0.88, "learning_rate": 2.2985958440923772e-07, "logits/chosen": -1.6766948699951172, "logits/rejected": -0.9987794756889343, "logps/chosen": -513.1531372070312, "logps/rejected": -1196.9598388671875, "loss": 0.0741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07514746487140656, "rewards/margins": 0.32911479473114014, "rewards/rejected": -0.4042623043060303, "step": 4600 }, { "epoch": 0.88, "learning_rate": 2.2294763892164284e-07, "logits/chosen": -1.507949948310852, "logits/rejected": -0.9113578796386719, "logps/chosen": -518.02294921875, "logps/rejected": -1056.095947265625, "loss": 0.064, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0894726812839508, "rewards/margins": 0.2744660973548889, "rewards/rejected": -0.3639387786388397, "step": 4610 }, { "epoch": 0.88, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -1.50367271900177, "logits/rejected": -1.0042375326156616, "logps/chosen": -592.36865234375, "logps/rejected": -1257.645751953125, "loss": 0.0786, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11847710609436035, "rewards/margins": 0.3014236092567444, "rewards/rejected": -0.41990071535110474, "step": 4620 }, { "epoch": 0.88, "learning_rate": 2.094260364336026e-07, "logits/chosen": -1.5850324630737305, "logits/rejected": -1.1900346279144287, "logps/chosen": -493.8600158691406, "logps/rejected": -1127.6407470703125, "loss": 0.077, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11056572198867798, "rewards/margins": 0.2895745635032654, "rewards/rejected": -0.40014028549194336, "step": 4630 }, { "epoch": 0.88, "learning_rate": 2.0281697718742333e-07, "logits/chosen": -1.6359741687774658, "logits/rejected": -0.8543532490730286, "logps/chosen": -719.7067260742188, "logps/rejected": -1214.4210205078125, "loss": 0.0717, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11109743267297745, "rewards/margins": 0.2912042737007141, "rewards/rejected": -0.40230169892311096, "step": 4640 }, { "epoch": 0.89, "learning_rate": 1.9630947032398068e-07, "logits/chosen": -1.665879487991333, "logits/rejected": -0.9275639653205872, "logps/chosen": -542.2432861328125, "logps/rejected": -1198.4542236328125, "loss": 0.0642, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11047229915857315, "rewards/margins": 0.3198242485523224, "rewards/rejected": -0.43029651045799255, "step": 4650 }, { "epoch": 0.89, "learning_rate": 1.899038035229342e-07, "logits/chosen": -1.3144588470458984, "logits/rejected": -0.8643208742141724, "logps/chosen": -531.2198486328125, "logps/rejected": -1097.012451171875, "loss": 0.0797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1076936349272728, "rewards/margins": 0.27217811346054077, "rewards/rejected": -0.37987175583839417, "step": 4660 }, { "epoch": 0.89, "learning_rate": 1.8360025996186138e-07, "logits/chosen": -1.4649903774261475, "logits/rejected": -0.9471826553344727, "logps/chosen": -557.9927978515625, "logps/rejected": -1155.4908447265625, "loss": 0.0715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10311124473810196, "rewards/margins": 0.27083897590637207, "rewards/rejected": -0.3739502727985382, "step": 4670 }, { "epoch": 0.89, "learning_rate": 1.7739911830374352e-07, "logits/chosen": -1.4640676975250244, "logits/rejected": -0.7999382019042969, "logps/chosen": -548.3810424804688, "logps/rejected": -1133.371337890625, "loss": 0.0762, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11513851583003998, "rewards/margins": 0.25577312707901, "rewards/rejected": -0.3709116578102112, "step": 4680 }, { "epoch": 0.89, "learning_rate": 1.713006526846439e-07, "logits/chosen": -1.418255090713501, "logits/rejected": -0.7617162466049194, "logps/chosen": -526.6948852539062, "logps/rejected": -1178.375244140625, "loss": 0.0627, "rewards/accuracies": 0.875, "rewards/chosen": -0.10353302955627441, "rewards/margins": 0.3077407777309418, "rewards/rejected": -0.4112738072872162, "step": 4690 }, { "epoch": 0.9, "learning_rate": 1.6530513270159116e-07, "logits/chosen": -1.6661808490753174, "logits/rejected": -1.0216115713119507, "logps/chosen": -575.176513671875, "logps/rejected": -1170.5579833984375, "loss": 0.0789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12035945802927017, "rewards/margins": 0.28512948751449585, "rewards/rejected": -0.40548890829086304, "step": 4700 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -1.348259687423706, "logits/rejected": -0.726833701133728, "logps/chosen": -512.1627197265625, "logps/rejected": -1199.069091796875, "loss": 0.0751, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10438477993011475, "rewards/margins": 0.2982539236545563, "rewards/rejected": -0.4026387333869934, "step": 4710 }, { "epoch": 0.9, "learning_rate": 1.5362398526524463e-07, "logits/chosen": -1.6081886291503906, "logits/rejected": -1.0844703912734985, "logps/chosen": -531.6392822265625, "logps/rejected": -1220.3037109375, "loss": 0.0601, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09040726721286774, "rewards/margins": 0.32025203108787537, "rewards/rejected": -0.4106592535972595, "step": 4720 }, { "epoch": 0.9, "learning_rate": 1.4793887420457008e-07, "logits/chosen": -1.6161870956420898, "logits/rejected": -1.0405725240707397, "logps/chosen": -516.3242797851562, "logps/rejected": -1223.478759765625, "loss": 0.0697, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09660589694976807, "rewards/margins": 0.3125608563423157, "rewards/rejected": -0.40916675329208374, "step": 4730 }, { "epoch": 0.9, "learning_rate": 1.4235774154234855e-07, "logits/chosen": -1.5217710733413696, "logits/rejected": -0.954803466796875, "logps/chosen": -702.4046630859375, "logps/rejected": -1238.7601318359375, "loss": 0.0664, "rewards/accuracies": 0.875, "rewards/chosen": -0.14844849705696106, "rewards/margins": 0.2766263484954834, "rewards/rejected": -0.42507481575012207, "step": 4740 }, { "epoch": 0.9, "learning_rate": 1.368808340056879e-07, "logits/chosen": -1.5613772869110107, "logits/rejected": -0.8760370016098022, "logps/chosen": -583.1823120117188, "logps/rejected": -1393.697998046875, "loss": 0.0325, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1385408341884613, "rewards/margins": 0.35670894384384155, "rewards/rejected": -0.49524980783462524, "step": 4750 }, { "epoch": 0.91, "learning_rate": 1.31508393714177e-07, "logits/chosen": -1.7405493259429932, "logits/rejected": -1.084986686706543, "logps/chosen": -643.45849609375, "logps/rejected": -1322.990478515625, "loss": 0.0696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12244760990142822, "rewards/margins": 0.32172149419784546, "rewards/rejected": -0.44416913390159607, "step": 4760 }, { "epoch": 0.91, "learning_rate": 1.2624065816918414e-07, "logits/chosen": -1.6162086725234985, "logits/rejected": -1.1142741441726685, "logps/chosen": -675.887451171875, "logps/rejected": -1231.4742431640625, "loss": 0.1102, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13799907267093658, "rewards/margins": 0.2717846930027008, "rewards/rejected": -0.4097837507724762, "step": 4770 }, { "epoch": 0.91, "learning_rate": 1.210778602433596e-07, "logits/chosen": -1.5767656564712524, "logits/rejected": -0.9563184976577759, "logps/chosen": -629.3272705078125, "logps/rejected": -1314.0963134765625, "loss": 0.0657, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12461809068918228, "rewards/margins": 0.34095823764801025, "rewards/rejected": -0.46557626128196716, "step": 4780 }, { "epoch": 0.91, "learning_rate": 1.1602022817033709e-07, "logits/chosen": -1.5052716732025146, "logits/rejected": -1.0084514617919922, "logps/chosen": -554.8811645507812, "logps/rejected": -1199.1683349609375, "loss": 0.0681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11769001185894012, "rewards/margins": 0.27790355682373047, "rewards/rejected": -0.3955935835838318, "step": 4790 }, { "epoch": 0.91, "learning_rate": 1.1106798553464804e-07, "logits/chosen": -1.4443817138671875, "logits/rejected": -0.8818323016166687, "logps/chosen": -675.2034301757812, "logps/rejected": -1283.4957275390625, "loss": 0.0803, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1482025682926178, "rewards/margins": 0.3057909905910492, "rewards/rejected": -0.4539934992790222, "step": 4800 }, { "epoch": 0.92, "learning_rate": 1.0622135126183514e-07, "logits/chosen": -1.4301539659500122, "logits/rejected": -0.8788467645645142, "logps/chosen": -489.4832458496094, "logps/rejected": -1205.6776123046875, "loss": 0.0697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10816861689090729, "rewards/margins": 0.31940537691116333, "rewards/rejected": -0.4275740087032318, "step": 4810 }, { "epoch": 0.92, "learning_rate": 1.0148053960877396e-07, "logits/chosen": -1.4718639850616455, "logits/rejected": -1.015871286392212, "logps/chosen": -622.8069458007812, "logps/rejected": -1381.413818359375, "loss": 0.0532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14863041043281555, "rewards/margins": 0.34109631180763245, "rewards/rejected": -0.4897266924381256, "step": 4820 }, { "epoch": 0.92, "learning_rate": 9.684576015420277e-08, "logits/chosen": -1.6295974254608154, "logits/rejected": -0.9986993074417114, "logps/chosen": -497.78411865234375, "logps/rejected": -1245.800048828125, "loss": 0.0478, "rewards/accuracies": 0.875, "rewards/chosen": -0.11491765081882477, "rewards/margins": 0.3258935213088989, "rewards/rejected": -0.4408111572265625, "step": 4830 }, { "epoch": 0.92, "learning_rate": 9.23172177894574e-08, "logits/chosen": -1.498579502105713, "logits/rejected": -0.9404312372207642, "logps/chosen": -580.7014770507812, "logps/rejected": -1169.010498046875, "loss": 0.0828, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.146192267537117, "rewards/margins": 0.2631181478500366, "rewards/rejected": -0.40931040048599243, "step": 4840 }, { "epoch": 0.92, "learning_rate": 8.78951127094127e-08, "logits/chosen": -1.4492034912109375, "logits/rejected": -0.902155876159668, "logps/chosen": -580.7665405273438, "logps/rejected": -1362.78125, "loss": 0.0515, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1426132470369339, "rewards/margins": 0.3427208364009857, "rewards/rejected": -0.4853340983390808, "step": 4850 }, { "epoch": 0.93, "learning_rate": 8.357964040363209e-08, "logits/chosen": -1.699753999710083, "logits/rejected": -0.9447946548461914, "logps/chosen": -643.726318359375, "logps/rejected": -1170.8514404296875, "loss": 0.1054, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14456409215927124, "rewards/margins": 0.26414671540260315, "rewards/rejected": -0.408710777759552, "step": 4860 }, { "epoch": 0.93, "learning_rate": 7.937099164772699e-08, "logits/chosen": -1.4378997087478638, "logits/rejected": -1.1139097213745117, "logps/chosen": -568.4095458984375, "logps/rejected": -1186.299072265625, "loss": 0.0868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13183560967445374, "rewards/margins": 0.2873497009277344, "rewards/rejected": -0.4191853404045105, "step": 4870 }, { "epoch": 0.93, "learning_rate": 7.526935249492245e-08, "logits/chosen": -1.39249587059021, "logits/rejected": -0.9399245381355286, "logps/chosen": -592.54541015625, "logps/rejected": -1305.0413818359375, "loss": 0.0652, "rewards/accuracies": 0.875, "rewards/chosen": -0.13843494653701782, "rewards/margins": 0.3276015818119049, "rewards/rejected": -0.4660365581512451, "step": 4880 }, { "epoch": 0.93, "learning_rate": 7.127490426783124e-08, "logits/chosen": -1.7015079259872437, "logits/rejected": -0.9024487733840942, "logps/chosen": -660.8875122070312, "logps/rejected": -1300.248291015625, "loss": 0.0513, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1391044408082962, "rewards/margins": 0.32587337493896484, "rewards/rejected": -0.46497783064842224, "step": 4890 }, { "epoch": 0.93, "learning_rate": 6.738782355044048e-08, "logits/chosen": -1.6943657398223877, "logits/rejected": -0.9999350309371948, "logps/chosen": -548.2307739257812, "logps/rejected": -1201.4068603515625, "loss": 0.0649, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09915059804916382, "rewards/margins": 0.3060828745365143, "rewards/rejected": -0.4052334725856781, "step": 4900 }, { "epoch": 0.94, "learning_rate": 6.360828218030191e-08, "logits/chosen": -1.4893453121185303, "logits/rejected": -1.2656629085540771, "logps/chosen": -637.3431396484375, "logps/rejected": -1251.2078857421875, "loss": 0.0828, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1782352179288864, "rewards/margins": 0.25589674711227417, "rewards/rejected": -0.4341319501399994, "step": 4910 }, { "epoch": 0.94, "learning_rate": 5.993644724093889e-08, "logits/chosen": -1.530643343925476, "logits/rejected": -1.0829918384552002, "logps/chosen": -568.8175048828125, "logps/rejected": -1262.355224609375, "loss": 0.0742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12133932113647461, "rewards/margins": 0.3102935254573822, "rewards/rejected": -0.4316328465938568, "step": 4920 }, { "epoch": 0.94, "learning_rate": 5.637248105445775e-08, "logits/chosen": -1.4310568571090698, "logits/rejected": -0.8825603723526001, "logps/chosen": -502.86248779296875, "logps/rejected": -1256.2763671875, "loss": 0.0616, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1074456200003624, "rewards/margins": 0.3345298767089844, "rewards/rejected": -0.44197553396224976, "step": 4930 }, { "epoch": 0.94, "learning_rate": 5.291654117437262e-08, "logits/chosen": -1.4048588275909424, "logits/rejected": -0.9273044466972351, "logps/chosen": -544.2005004882812, "logps/rejected": -1113.27490234375, "loss": 0.0786, "rewards/accuracies": 0.75, "rewards/chosen": -0.12635964155197144, "rewards/margins": 0.265400230884552, "rewards/rejected": -0.39175987243652344, "step": 4940 }, { "epoch": 0.94, "learning_rate": 4.956878037864044e-08, "logits/chosen": -1.393763780593872, "logits/rejected": -0.8130865097045898, "logps/chosen": -647.5671997070312, "logps/rejected": -1257.166015625, "loss": 0.0689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15376783907413483, "rewards/margins": 0.29182344675064087, "rewards/rejected": -0.4455912709236145, "step": 4950 }, { "epoch": 0.94, "learning_rate": 4.632934666290778e-08, "logits/chosen": -1.3255221843719482, "logits/rejected": -0.9536614418029785, "logps/chosen": -657.3424682617188, "logps/rejected": -1044.6119384765625, "loss": 0.099, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15257620811462402, "rewards/margins": 0.20966574549674988, "rewards/rejected": -0.3622419536113739, "step": 4960 }, { "epoch": 0.95, "learning_rate": 4.319838323396691e-08, "logits/chosen": -1.6361057758331299, "logits/rejected": -0.7931776642799377, "logps/chosen": -626.0015869140625, "logps/rejected": -1315.7972412109375, "loss": 0.0431, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13260582089424133, "rewards/margins": 0.3334971070289612, "rewards/rejected": -0.4661029279232025, "step": 4970 }, { "epoch": 0.95, "learning_rate": 4.017602850342584e-08, "logits/chosen": -1.3285863399505615, "logits/rejected": -0.7825818061828613, "logps/chosen": -550.2575073242188, "logps/rejected": -1211.78125, "loss": 0.0628, "rewards/accuracies": 0.875, "rewards/chosen": -0.10753561556339264, "rewards/margins": 0.3216678500175476, "rewards/rejected": -0.42920345067977905, "step": 4980 }, { "epoch": 0.95, "learning_rate": 3.7262416081589866e-08, "logits/chosen": -1.5621405839920044, "logits/rejected": -0.8195871114730835, "logps/chosen": -659.1685791015625, "logps/rejected": -1270.6734619140625, "loss": 0.0554, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12191258370876312, "rewards/margins": 0.34549933671951294, "rewards/rejected": -0.46741190552711487, "step": 4990 }, { "epoch": 0.95, "learning_rate": 3.445767477155443e-08, "logits/chosen": -1.5442430973052979, "logits/rejected": -1.2246438264846802, "logps/chosen": -547.2069091796875, "logps/rejected": -1145.769287109375, "loss": 0.0771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13483698666095734, "rewards/margins": 0.2825453281402588, "rewards/rejected": -0.41738229990005493, "step": 5000 }, { "epoch": 0.95, "learning_rate": 3.1761928563510956e-08, "logits/chosen": -1.7152458429336548, "logits/rejected": -0.9394499063491821, "logps/chosen": -649.7903442382812, "logps/rejected": -1298.006591796875, "loss": 0.0552, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1411394327878952, "rewards/margins": 0.32097527384757996, "rewards/rejected": -0.46211472153663635, "step": 5010 }, { "epoch": 0.96, "learning_rate": 2.917529662926549e-08, "logits/chosen": -1.520808219909668, "logits/rejected": -0.945067286491394, "logps/chosen": -517.6978759765625, "logps/rejected": -1175.654541015625, "loss": 0.0584, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1118912473320961, "rewards/margins": 0.30338698625564575, "rewards/rejected": -0.41527828574180603, "step": 5020 }, { "epoch": 0.96, "learning_rate": 2.669789331697148e-08, "logits/chosen": -1.5593969821929932, "logits/rejected": -1.092525839805603, "logps/chosen": -691.4847412109375, "logps/rejected": -1298.0960693359375, "loss": 0.0926, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16525861620903015, "rewards/margins": 0.2925161123275757, "rewards/rejected": -0.45777472853660583, "step": 5030 }, { "epoch": 0.96, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -1.4827836751937866, "logits/rejected": -1.064502477645874, "logps/chosen": -419.036376953125, "logps/rejected": -958.3380737304688, "loss": 0.1329, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10691438615322113, "rewards/margins": 0.2166411131620407, "rewards/rejected": -0.32355549931526184, "step": 5040 }, { "epoch": 0.96, "learning_rate": 2.20712058024683e-08, "logits/chosen": -1.5136808156967163, "logits/rejected": -0.7604056000709534, "logps/chosen": -545.261474609375, "logps/rejected": -1140.312255859375, "loss": 0.0594, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1134977787733078, "rewards/margins": 0.29956376552581787, "rewards/rejected": -0.41306155920028687, "step": 5050 }, { "epoch": 0.96, "learning_rate": 1.9922126133870568e-08, "logits/chosen": -1.685162901878357, "logits/rejected": -0.8499481081962585, "logps/chosen": -608.2676391601562, "logps/rejected": -1121.2872314453125, "loss": 0.0749, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11034168303012848, "rewards/margins": 0.2738109230995178, "rewards/rejected": -0.3841525912284851, "step": 5060 }, { "epoch": 0.97, "learning_rate": 1.7882684145406616e-08, "logits/chosen": -1.9093729257583618, "logits/rejected": -1.2209051847457886, "logps/chosen": -613.359375, "logps/rejected": -1183.755615234375, "loss": 0.0905, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1341812163591385, "rewards/margins": 0.27863994240760803, "rewards/rejected": -0.4128211438655853, "step": 5070 }, { "epoch": 0.97, "learning_rate": 1.595296999541057e-08, "logits/chosen": -1.5866085290908813, "logits/rejected": -1.153424620628357, "logps/chosen": -531.4694213867188, "logps/rejected": -1303.758056640625, "loss": 0.0664, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12967099249362946, "rewards/margins": 0.3269729018211365, "rewards/rejected": -0.45664387941360474, "step": 5080 }, { "epoch": 0.97, "learning_rate": 1.4133068991437903e-08, "logits/chosen": -1.5420364141464233, "logits/rejected": -0.7891443371772766, "logps/chosen": -577.220947265625, "logps/rejected": -1330.06640625, "loss": 0.0463, "rewards/accuracies": 0.875, "rewards/chosen": -0.1222272664308548, "rewards/margins": 0.3585565686225891, "rewards/rejected": -0.4807838499546051, "step": 5090 }, { "epoch": 0.97, "learning_rate": 1.2423061586496476e-08, "logits/chosen": -1.3690813779830933, "logits/rejected": -0.7128039598464966, "logps/chosen": -578.891357421875, "logps/rejected": -1208.0849609375, "loss": 0.0761, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13187679648399353, "rewards/margins": 0.2918475270271301, "rewards/rejected": -0.42372435331344604, "step": 5100 }, { "epoch": 0.97, "learning_rate": 1.0823023375489128e-08, "logits/chosen": -1.5114414691925049, "logits/rejected": -0.9036266207695007, "logps/chosen": -595.9454345703125, "logps/rejected": -1167.0740966796875, "loss": 0.0934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14127907156944275, "rewards/margins": 0.25682133436203003, "rewards/rejected": -0.39810046553611755, "step": 5110 }, { "epoch": 0.98, "learning_rate": 9.333025091870507e-09, "logits/chosen": -1.3770151138305664, "logits/rejected": -1.0183017253875732, "logps/chosen": -571.3338623046875, "logps/rejected": -1342.66455078125, "loss": 0.0561, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12807521224021912, "rewards/margins": 0.32246822118759155, "rewards/rejected": -0.45054346323013306, "step": 5120 }, { "epoch": 0.98, "learning_rate": 7.95313260452263e-09, "logits/chosen": -1.7575418949127197, "logits/rejected": -1.016129970550537, "logps/chosen": -574.4557495117188, "logps/rejected": -1338.8140869140625, "loss": 0.0543, "rewards/accuracies": 0.875, "rewards/chosen": -0.1301496922969818, "rewards/margins": 0.3553154766559601, "rewards/rejected": -0.4854651093482971, "step": 5130 }, { "epoch": 0.98, "learning_rate": 6.683406914840818e-09, "logits/chosen": -1.581937313079834, "logits/rejected": -0.8572310209274292, "logps/chosen": -605.4105834960938, "logps/rejected": -1300.188720703125, "loss": 0.0532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1113300696015358, "rewards/margins": 0.3524828255176544, "rewards/rejected": -0.4638128876686096, "step": 5140 }, { "epoch": 0.98, "learning_rate": 5.523904154037529e-09, "logits/chosen": -1.8323339223861694, "logits/rejected": -1.1476819515228271, "logps/chosen": -612.7574462890625, "logps/rejected": -1313.646728515625, "loss": 0.0479, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11692248284816742, "rewards/margins": 0.3401055634021759, "rewards/rejected": -0.45702800154685974, "step": 5150 }, { "epoch": 0.98, "learning_rate": 4.474675580662113e-09, "logits/chosen": -1.613720178604126, "logits/rejected": -0.799970269203186, "logps/chosen": -609.1049194335938, "logps/rejected": -1300.5469970703125, "loss": 0.0586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12533780932426453, "rewards/margins": 0.3247934877872467, "rewards/rejected": -0.4501313269138336, "step": 5160 }, { "epoch": 0.98, "learning_rate": 3.5357675783331823e-09, "logits/chosen": -1.4680449962615967, "logits/rejected": -0.8810006976127625, "logps/chosen": -550.5829467773438, "logps/rejected": -1324.5640869140625, "loss": 0.0451, "rewards/accuracies": 0.875, "rewards/chosen": -0.09199674427509308, "rewards/margins": 0.3480994701385498, "rewards/rejected": -0.4400961995124817, "step": 5170 }, { "epoch": 0.99, "learning_rate": 2.7072216536885855e-09, "logits/chosen": -1.4214956760406494, "logits/rejected": -1.0915638208389282, "logps/chosen": -592.4363403320312, "logps/rejected": -1323.13720703125, "loss": 0.0455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11093990504741669, "rewards/margins": 0.3408183157444, "rewards/rejected": -0.4517582058906555, "step": 5180 }, { "epoch": 0.99, "learning_rate": 1.989074434551874e-09, "logits/chosen": -1.5371973514556885, "logits/rejected": -0.8911060094833374, "logps/chosen": -626.3933715820312, "logps/rejected": -1309.9378662109375, "loss": 0.0631, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13457807898521423, "rewards/margins": 0.3284324109554291, "rewards/rejected": -0.4630104601383209, "step": 5190 }, { "epoch": 0.99, "learning_rate": 1.3813576683111007e-09, "logits/chosen": -1.585335373878479, "logits/rejected": -0.9166946411132812, "logps/chosen": -645.6207885742188, "logps/rejected": -1415.727783203125, "loss": 0.0712, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.13262395560741425, "rewards/margins": 0.3447038233280182, "rewards/rejected": -0.47732776403427124, "step": 5200 }, { "epoch": 0.99, "learning_rate": 8.840982205160498e-10, "logits/chosen": -1.6240886449813843, "logits/rejected": -0.658467710018158, "logps/chosen": -574.8680419921875, "logps/rejected": -1388.4107666015625, "loss": 0.0295, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09021851420402527, "rewards/margins": 0.39147868752479553, "rewards/rejected": -0.4816971719264984, "step": 5210 }, { "epoch": 0.99, "learning_rate": 4.973180736911332e-10, "logits/chosen": -1.6752641201019287, "logits/rejected": -1.2019436359405518, "logps/chosen": -498.8638610839844, "logps/rejected": -1144.5714111328125, "loss": 0.079, "rewards/accuracies": 0.875, "rewards/chosen": -0.1074768528342247, "rewards/margins": 0.2879261374473572, "rewards/rejected": -0.3954029977321625, "step": 5220 }, { "epoch": 1.0, "learning_rate": 2.2103432636366718e-10, "logits/chosen": -1.4439948797225952, "logits/rejected": -1.0657399892807007, "logps/chosen": -604.0296630859375, "logps/rejected": -1356.142578125, "loss": 0.0609, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12612642347812653, "rewards/margins": 0.3389315605163574, "rewards/rejected": -0.46505799889564514, "step": 5230 }, { "epoch": 1.0, "learning_rate": 5.525919230670029e-11, "logits/chosen": -1.5597736835479736, "logits/rejected": -1.0183484554290771, "logps/chosen": -586.085205078125, "logps/rejected": -1289.2353515625, "loss": 0.0449, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11757595837116241, "rewards/margins": 0.3439788818359375, "rewards/rejected": -0.4615548551082611, "step": 5240 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -1.2492470741271973, "logits/rejected": -0.795202374458313, "logps/chosen": -545.5097045898438, "logps/rejected": -1238.634033203125, "loss": 0.0598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12360592931509018, "rewards/margins": 0.28936389088630676, "rewards/rejected": -0.41296982765197754, "step": 5250 }, { "epoch": 1.0, "step": 5250, "total_flos": 0.0, "train_loss": 0.07767095326809656, "train_runtime": 21725.9611, "train_samples_per_second": 0.967, "train_steps_per_second": 0.242 } ], "logging_steps": 10, "max_steps": 5250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }