{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 500, "global_step": 656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.575757575757576e-08, "logits/chosen": 0.040165986865758896, "logits/rejected": 0.1715753823518753, "logps/chosen": -294.844482421875, "logps/rejected": -361.2099914550781, "loss": 0.3581, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.575757575757576e-07, "logits/chosen": 0.08047256618738174, "logits/rejected": 0.3207971751689911, "logps/chosen": -393.2153625488281, "logps/rejected": -318.74615478515625, "loss": 0.3404, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": -0.00022995664039626718, "rewards/margins": 2.2277235984802246e-05, "rewards/rejected": -0.000252234167419374, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.07466734945774078, "logits/rejected": 0.23236870765686035, "logps/chosen": -332.4886169433594, "logps/rejected": -281.853515625, "loss": 0.3468, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.00011478399392217398, "rewards/margins": -0.0012641319772228599, "rewards/rejected": 0.0013789159711450338, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.07185273617506027, "logits/rejected": 0.2604687213897705, "logps/chosen": -358.33782958984375, "logps/rejected": -292.1524963378906, "loss": 0.3502, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0017305829096585512, "rewards/margins": 0.001660021604038775, "rewards/rejected": 7.056114554870874e-05, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.1199118122458458, "logits/rejected": 0.2392597496509552, "logps/chosen": -347.05926513671875, "logps/rejected": -286.26080322265625, "loss": 0.3353, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005740107037127018, "rewards/margins": 0.006530737970024347, "rewards/rejected": -0.0007906301179900765, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.7878787878787882e-06, "logits/chosen": 0.0741933211684227, "logits/rejected": 0.3117237091064453, "logps/chosen": -343.31170654296875, "logps/rejected": -287.1304931640625, "loss": 0.3286, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016505183652043343, "rewards/margins": 0.016020886600017548, "rewards/rejected": 0.0004842969647143036, "step": 50 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.0762203261256218, "logits/rejected": 0.2750852704048157, "logps/chosen": -370.8611755371094, "logps/rejected": -302.7222595214844, "loss": 0.2924, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.032092947512865067, "rewards/margins": 0.04518315941095352, "rewards/rejected": -0.013090210035443306, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.999432965739786e-06, "logits/chosen": 0.07090188562870026, "logits/rejected": 0.25236833095550537, "logps/chosen": -321.5715026855469, "logps/rejected": -301.0986022949219, "loss": 0.2803, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.05498770996928215, "rewards/margins": 0.0691528171300888, "rewards/rejected": -0.014165110886096954, "step": 70 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.10318852961063385, "logits/rejected": 0.2712559401988983, "logps/chosen": -358.31500244140625, "logps/rejected": -302.6133117675781, "loss": 0.2346, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.08596866577863693, "rewards/margins": 0.1306326687335968, "rewards/rejected": -0.044663988053798676, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.979613761906212e-06, "logits/chosen": 0.1258704960346222, "logits/rejected": 0.2653108239173889, "logps/chosen": -316.4566345214844, "logps/rejected": -285.59442138671875, "loss": 0.2181, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.10472643375396729, "rewards/margins": 0.16386187076568604, "rewards/rejected": -0.05913544446229935, "step": 90 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.14865969121456146, "logits/rejected": 0.2514886260032654, "logps/chosen": -316.22650146484375, "logps/rejected": -298.5392150878906, "loss": 0.2379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09984966367483139, "rewards/margins": 0.20773954689502716, "rewards/rejected": -0.10788986831903458, "step": 100 }, { "epoch": 0.17, "learning_rate": 4.931699543346854e-06, "logits/chosen": 0.1114228144288063, "logits/rejected": 0.2948494553565979, "logps/chosen": -338.43450927734375, "logps/rejected": -285.25494384765625, "loss": 0.2098, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1445818692445755, "rewards/margins": 0.19616642594337463, "rewards/rejected": -0.05158457159996033, "step": 110 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.10053505003452301, "logits/rejected": 0.27514562010765076, "logps/chosen": -338.99151611328125, "logps/rejected": -297.2850646972656, "loss": 0.2161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14088504016399384, "rewards/margins": 0.18905261158943176, "rewards/rejected": -0.048167549073696136, "step": 120 }, { "epoch": 0.2, "learning_rate": 4.8562331973035396e-06, "logits/chosen": 0.08438269048929214, "logits/rejected": 0.23364977538585663, "logps/chosen": -322.02117919921875, "logps/rejected": -303.7391357421875, "loss": 0.2438, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.146646648645401, "rewards/margins": 0.1478952169418335, "rewards/rejected": -0.0012485686456784606, "step": 130 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.09936638176441193, "logits/rejected": 0.24736297130584717, "logps/chosen": -310.3603515625, "logps/rejected": -278.47320556640625, "loss": 0.222, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.18732748925685883, "rewards/margins": 0.1890837401151657, "rewards/rejected": -0.0017562557477504015, "step": 140 }, { "epoch": 0.23, "learning_rate": 4.754069787631761e-06, "logits/chosen": 0.13391128182411194, "logits/rejected": 0.2741778492927551, "logps/chosen": -366.7373046875, "logps/rejected": -295.2054138183594, "loss": 0.2071, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.23995013535022736, "rewards/margins": 0.24667362868785858, "rewards/rejected": -0.006723466329276562, "step": 150 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.08330532908439636, "logits/rejected": 0.23301962018013, "logps/chosen": -295.6157531738281, "logps/rejected": -277.6016540527344, "loss": 0.2514, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19748535752296448, "rewards/margins": 0.15948796272277832, "rewards/rejected": 0.03799740970134735, "step": 160 }, { "epoch": 0.26, "learning_rate": 4.626366866585528e-06, "logits/chosen": 0.17037127912044525, "logits/rejected": 0.3279545307159424, "logps/chosen": -371.6747131347656, "logps/rejected": -300.25250244140625, "loss": 0.226, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2174214869737625, "rewards/margins": 0.1945430338382721, "rewards/rejected": 0.022878441959619522, "step": 170 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.12852030992507935, "logits/rejected": 0.23063895106315613, "logps/chosen": -339.1563720703125, "logps/rejected": -277.93841552734375, "loss": 0.1992, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20981720089912415, "rewards/margins": 0.22972619533538818, "rewards/rejected": -0.019909001886844635, "step": 180 }, { "epoch": 0.29, "learning_rate": 4.474571359287791e-06, "logits/chosen": 0.08759725093841553, "logits/rejected": 0.26223450899124146, "logps/chosen": -315.0005187988281, "logps/rejected": -266.51715087890625, "loss": 0.2166, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19832256436347961, "rewards/margins": 0.2093769758939743, "rewards/rejected": -0.011054400354623795, "step": 190 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.1616448611021042, "logits/rejected": 0.30854135751724243, "logps/chosen": -382.2667236328125, "logps/rejected": -286.1771240234375, "loss": 0.2198, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.19347664713859558, "rewards/margins": 0.20408186316490173, "rewards/rejected": -0.010605214163661003, "step": 200 }, { "epoch": 0.32, "learning_rate": 4.30040316949064e-06, "logits/chosen": 0.14471155405044556, "logits/rejected": 0.26751500368118286, "logps/chosen": -300.742919921875, "logps/rejected": -251.2882080078125, "loss": 0.23, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1704670488834381, "rewards/margins": 0.18817836046218872, "rewards/rejected": -0.0177113339304924, "step": 210 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.16468700766563416, "logits/rejected": 0.24206213653087616, "logps/chosen": -294.42767333984375, "logps/rejected": -272.92010498046875, "loss": 0.2529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.17376992106437683, "rewards/margins": 0.16116927564144135, "rewards/rejected": 0.012600669637322426, "step": 220 }, { "epoch": 0.35, "learning_rate": 4.105835692378557e-06, "logits/chosen": 0.10714814811944962, "logits/rejected": 0.2699413597583771, "logps/chosen": -304.6961364746094, "logps/rejected": -285.6314392089844, "loss": 0.1973, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.1876644492149353, "rewards/margins": 0.20867836475372314, "rewards/rejected": -0.02101389318704605, "step": 230 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.06082786247134209, "logits/rejected": 0.29382848739624023, "logps/chosen": -378.81732177734375, "logps/rejected": -309.0159912109375, "loss": 0.1942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16840167343616486, "rewards/margins": 0.24110493063926697, "rewards/rejected": -0.0727032721042633, "step": 240 }, { "epoch": 0.38, "learning_rate": 3.893073455212438e-06, "logits/chosen": 0.1116786003112793, "logits/rejected": 0.2517862915992737, "logps/chosen": -320.56365966796875, "logps/rejected": -278.5521545410156, "loss": 0.2346, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17214366793632507, "rewards/margins": 0.2073363959789276, "rewards/rejected": -0.03519275039434433, "step": 250 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.11448470503091812, "logits/rejected": 0.22813239693641663, "logps/chosen": -279.91162109375, "logps/rejected": -260.7430114746094, "loss": 0.2402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1783895492553711, "rewards/margins": 0.17652130126953125, "rewards/rejected": 0.001868226332589984, "step": 260 }, { "epoch": 0.41, "learning_rate": 3.6645271391548542e-06, "logits/chosen": 0.06586723029613495, "logits/rejected": 0.21862812340259552, "logps/chosen": -302.5594787597656, "logps/rejected": -270.1592712402344, "loss": 0.212, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.1828652024269104, "rewards/margins": 0.2052091658115387, "rewards/rejected": -0.022343963384628296, "step": 270 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.1570790708065033, "logits/rejected": 0.288757860660553, "logps/chosen": -347.58013916015625, "logps/rejected": -294.15716552734375, "loss": 0.1946, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21899041533470154, "rewards/margins": 0.2574598491191864, "rewards/rejected": -0.03846944123506546, "step": 280 }, { "epoch": 0.44, "learning_rate": 3.4227862652892106e-06, "logits/chosen": 0.10818709433078766, "logits/rejected": 0.23891910910606384, "logps/chosen": -348.48370361328125, "logps/rejected": -307.1477966308594, "loss": 0.201, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1834907829761505, "rewards/margins": 0.22308149933815002, "rewards/rejected": -0.03959069401025772, "step": 290 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.11096982657909393, "logits/rejected": 0.24753287434577942, "logps/chosen": -296.8543701171875, "logps/rejected": -281.43310546875, "loss": 0.2306, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.19204005599021912, "rewards/margins": 0.18743662536144257, "rewards/rejected": 0.0046034445986151695, "step": 300 }, { "epoch": 0.47, "learning_rate": 3.1705898543111576e-06, "logits/chosen": 0.07924026995897293, "logits/rejected": 0.2654581665992737, "logps/chosen": -327.4444274902344, "logps/rejected": -319.9097595214844, "loss": 0.2183, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.1851603388786316, "rewards/margins": 0.23894396424293518, "rewards/rejected": -0.05378361791372299, "step": 310 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.09540507942438126, "logits/rejected": 0.22888918220996857, "logps/chosen": -334.81689453125, "logps/rejected": -270.1705017089844, "loss": 0.2204, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15298177301883698, "rewards/margins": 0.2034742832183838, "rewards/rejected": -0.05049251392483711, "step": 320 }, { "epoch": 0.5, "learning_rate": 2.910795392329649e-06, "logits/chosen": 0.11527317762374878, "logits/rejected": 0.21196472644805908, "logps/chosen": -315.2773132324219, "logps/rejected": -283.607421875, "loss": 0.1907, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.19066214561462402, "rewards/margins": 0.22805961966514587, "rewards/rejected": -0.037397462874650955, "step": 330 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.054509587585926056, "logits/rejected": 0.21440072357654572, "logps/chosen": -327.4574279785156, "logps/rejected": -277.25958251953125, "loss": 0.201, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.16702668368816376, "rewards/margins": 0.22345618903636932, "rewards/rejected": -0.05642951279878616, "step": 340 }, { "epoch": 0.53, "learning_rate": 2.6463464544075344e-06, "logits/chosen": 0.0780135840177536, "logits/rejected": 0.2536531686782837, "logps/chosen": -305.2668151855469, "logps/rejected": -291.6106872558594, "loss": 0.1948, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.21040305495262146, "rewards/margins": 0.27535635232925415, "rewards/rejected": -0.06495330482721329, "step": 350 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.12684503197669983, "logits/rejected": 0.23648087680339813, "logps/chosen": -299.1307067871094, "logps/rejected": -288.98614501953125, "loss": 0.2112, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.18101270496845245, "rewards/margins": 0.24970810115337372, "rewards/rejected": -0.06869538873434067, "step": 360 }, { "epoch": 0.56, "learning_rate": 2.380239352679908e-06, "logits/chosen": 0.050291478633880615, "logits/rejected": 0.2234155833721161, "logps/chosen": -296.54595947265625, "logps/rejected": -269.51702880859375, "loss": 0.2095, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17090751230716705, "rewards/margins": 0.19086746871471405, "rewards/rejected": -0.019959963858127594, "step": 370 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.10957841575145721, "logits/rejected": 0.21787157654762268, "logps/chosen": -305.3380432128906, "logps/rejected": -283.4737854003906, "loss": 0.2027, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17482581734657288, "rewards/margins": 0.22671441733837128, "rewards/rejected": -0.0518886037170887, "step": 380 }, { "epoch": 0.59, "learning_rate": 2.1154891869403436e-06, "logits/chosen": 0.1384754180908203, "logits/rejected": 0.25176170468330383, "logps/chosen": -354.0063171386719, "logps/rejected": -307.35455322265625, "loss": 0.2073, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.2005975991487503, "rewards/margins": 0.27450570464134216, "rewards/rejected": -0.07390810549259186, "step": 390 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.09188776463270187, "logits/rejected": 0.2664358913898468, "logps/chosen": -328.11083984375, "logps/rejected": -286.30419921875, "loss": 0.1895, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.1963554322719574, "rewards/margins": 0.23746831715106964, "rewards/rejected": -0.04111289232969284, "step": 400 }, { "epoch": 0.62, "learning_rate": 1.8550956823554708e-06, "logits/chosen": 0.055462319403886795, "logits/rejected": 0.18411260843276978, "logps/chosen": -326.14703369140625, "logps/rejected": -275.77069091796875, "loss": 0.235, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.1863834410905838, "rewards/margins": 0.22951778769493103, "rewards/rejected": -0.043134383857250214, "step": 410 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.09742958843708038, "logits/rejected": 0.23133966326713562, "logps/chosen": -334.5967712402344, "logps/rejected": -302.8537902832031, "loss": 0.2048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.21493402123451233, "rewards/margins": 0.26205217838287354, "rewards/rejected": -0.047118157148361206, "step": 420 }, { "epoch": 0.66, "learning_rate": 1.6020092013802002e-06, "logits/chosen": 0.061180900782346725, "logits/rejected": 0.22411946952342987, "logps/chosen": -326.1264953613281, "logps/rejected": -270.6374206542969, "loss": 0.2211, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.19443106651306152, "rewards/margins": 0.2181394100189209, "rewards/rejected": -0.023708324879407883, "step": 430 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.11679482460021973, "logits/rejected": 0.20075193047523499, "logps/chosen": -313.3033752441406, "logps/rejected": -301.5423278808594, "loss": 0.1941, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18806949257850647, "rewards/margins": 0.22217002511024475, "rewards/rejected": -0.03410057723522186, "step": 440 }, { "epoch": 0.69, "learning_rate": 1.3590973149722103e-06, "logits/chosen": 0.11450199782848358, "logits/rejected": 0.2375846803188324, "logps/chosen": -324.9442443847656, "logps/rejected": -282.5760498046875, "loss": 0.2164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1744813621044159, "rewards/margins": 0.21334946155548096, "rewards/rejected": -0.03886810690164566, "step": 450 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.10633231699466705, "logits/rejected": 0.2275623381137848, "logps/chosen": -359.95428466796875, "logps/rejected": -296.4113464355469, "loss": 0.1969, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21253260970115662, "rewards/margins": 0.24374982714653015, "rewards/rejected": -0.03121720813214779, "step": 460 }, { "epoch": 0.72, "learning_rate": 1.1291123118671665e-06, "logits/chosen": 0.03388429060578346, "logits/rejected": 0.17446021735668182, "logps/chosen": -308.4808349609375, "logps/rejected": -263.65283203125, "loss": 0.2128, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.21648752689361572, "rewards/margins": 0.22755059599876404, "rewards/rejected": -0.011063081212341785, "step": 470 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.12791678309440613, "logits/rejected": 0.28256458044052124, "logps/chosen": -379.8475341796875, "logps/rejected": -303.40545654296875, "loss": 0.182, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.22206160426139832, "rewards/margins": 0.2580808699131012, "rewards/rejected": -0.036019258201122284, "step": 480 }, { "epoch": 0.75, "learning_rate": 9.146600140475945e-07, "logits/chosen": 0.11331765353679657, "logits/rejected": 0.2163075953722, "logps/chosen": -366.92926025390625, "logps/rejected": -308.08148193359375, "loss": 0.2333, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1822303682565689, "rewards/margins": 0.232683464884758, "rewards/rejected": -0.050453104078769684, "step": 490 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.1049843281507492, "logits/rejected": 0.20622961223125458, "logps/chosen": -283.49822998046875, "logps/rejected": -262.69561767578125, "loss": 0.2244, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.19038459658622742, "rewards/margins": 0.19227764010429382, "rewards/rejected": -0.001893045729957521, "step": 500 }, { "epoch": 0.78, "learning_rate": 7.181702517385789e-07, "logits/chosen": 0.06920811533927917, "logits/rejected": 0.22420334815979004, "logps/chosen": -313.43670654296875, "logps/rejected": -280.72442626953125, "loss": 0.2521, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1772725135087967, "rewards/margins": 0.1995859146118164, "rewards/rejected": -0.022313417866826057, "step": 510 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.0857834741473198, "logits/rejected": 0.1759118139743805, "logps/chosen": -308.9149475097656, "logps/rejected": -283.34747314453125, "loss": 0.1955, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2155129462480545, "rewards/margins": 0.22527408599853516, "rewards/rejected": -0.009761162102222443, "step": 520 }, { "epoch": 0.81, "learning_rate": 5.418693324604082e-07, "logits/chosen": 0.057937733829021454, "logits/rejected": 0.24161191284656525, "logps/chosen": -333.49603271484375, "logps/rejected": -280.8927917480469, "loss": 0.2266, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.21429701149463654, "rewards/margins": 0.22269944846630096, "rewards/rejected": -0.008402440696954727, "step": 530 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.07015545666217804, "logits/rejected": 0.2190779447555542, "logps/chosen": -344.3935852050781, "logps/rejected": -303.2052917480469, "loss": 0.2045, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.21076282858848572, "rewards/margins": 0.25355157256126404, "rewards/rejected": -0.04278876259922981, "step": 540 }, { "epoch": 0.84, "learning_rate": 3.877548160747768e-07, "logits/chosen": 0.08561773598194122, "logits/rejected": 0.2999951243400574, "logps/chosen": -330.68475341796875, "logps/rejected": -281.82275390625, "loss": 0.2013, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20333750545978546, "rewards/margins": 0.23657293617725372, "rewards/rejected": -0.033235400915145874, "step": 550 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.13458076119422913, "logits/rejected": 0.27361050248146057, "logps/chosen": -318.90203857421875, "logps/rejected": -284.5428466796875, "loss": 0.2178, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19831877946853638, "rewards/margins": 0.23214980959892273, "rewards/rejected": -0.03383101895451546, "step": 560 }, { "epoch": 0.87, "learning_rate": 2.5757288163336806e-07, "logits/chosen": 0.11606297641992569, "logits/rejected": 0.22765600681304932, "logps/chosen": -313.04278564453125, "logps/rejected": -281.8597717285156, "loss": 0.1959, "rewards/accuracies": 0.78125, "rewards/chosen": 0.21330790221691132, "rewards/margins": 0.25364676117897034, "rewards/rejected": -0.04033887758851051, "step": 570 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.057523488998413086, "logits/rejected": 0.19763953983783722, "logps/chosen": -299.98626708984375, "logps/rejected": -252.2355194091797, "loss": 0.2448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2040693461894989, "rewards/margins": 0.18340806663036346, "rewards/rejected": 0.020661287009716034, "step": 580 }, { "epoch": 0.9, "learning_rate": 1.5279854247146703e-07, "logits/chosen": 0.10874730348587036, "logits/rejected": 0.26722806692123413, "logps/chosen": -326.7654724121094, "logps/rejected": -274.41162109375, "loss": 0.2041, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2091568410396576, "rewards/margins": 0.2396632879972458, "rewards/rejected": -0.030506467446684837, "step": 590 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.12037558853626251, "logits/rejected": 0.19973725080490112, "logps/chosen": -314.14910888671875, "logps/rejected": -270.01043701171875, "loss": 0.2167, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.21347804367542267, "rewards/margins": 0.22940710186958313, "rewards/rejected": -0.015929043292999268, "step": 600 }, { "epoch": 0.93, "learning_rate": 7.46189337174788e-08, "logits/chosen": 0.08730605989694595, "logits/rejected": 0.2364271879196167, "logps/chosen": -289.814453125, "logps/rejected": -251.5435791015625, "loss": 0.2221, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19059757888317108, "rewards/margins": 0.19377049803733826, "rewards/rejected": -0.0031729289330542088, "step": 610 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.13279291987419128, "logits/rejected": 0.2206091433763504, "logps/chosen": -342.04498291015625, "logps/rejected": -276.22613525390625, "loss": 0.1984, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19695451855659485, "rewards/margins": 0.2266121655702591, "rewards/rejected": -0.029657626524567604, "step": 620 }, { "epoch": 0.96, "learning_rate": 2.3919861577572924e-08, "logits/chosen": 0.0992569848895073, "logits/rejected": 0.26812419295310974, "logps/chosen": -336.68817138671875, "logps/rejected": -261.6818542480469, "loss": 0.2111, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21192236244678497, "rewards/margins": 0.2388623207807541, "rewards/rejected": -0.026939954608678818, "step": 630 }, { "epoch": 0.98, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.04493387043476105, "logits/rejected": 0.1637609452009201, "logps/chosen": -332.1258544921875, "logps/rejected": -283.83709716796875, "loss": 0.1939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.20134110748767853, "rewards/margins": 0.23969101905822754, "rewards/rejected": -0.03834990784525871, "step": 640 }, { "epoch": 0.99, "learning_rate": 1.2757667974155896e-09, "logits/chosen": 0.1145804151892662, "logits/rejected": 0.23529252409934998, "logps/chosen": -351.08197021484375, "logps/rejected": -293.947998046875, "loss": 0.2055, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.2066545933485031, "rewards/margins": 0.22757765650749207, "rewards/rejected": -0.020923063158988953, "step": 650 }, { "epoch": 1.0, "step": 656, "total_flos": 0.0, "train_loss": 0.22703545004492853, "train_runtime": 7888.8763, "train_samples_per_second": 2.662, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 656, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }