{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": 0.09486827999353409, "logits/rejected": 0.17880678176879883, "logps/chosen": -404.6722717285156, "logps/rejected": -393.01068115234375, "loss": 0.2923, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.3109264671802521, "logits/rejected": 0.1413353830575943, "logps/chosen": -451.60137939453125, "logps/rejected": -439.4466857910156, "loss": 0.3143, "rewards/accuracies": 0.3472222089767456, "rewards/chosen": -0.0008689137175679207, "rewards/margins": 9.278658399125561e-05, "rewards/rejected": -0.0009617002215236425, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.16002216935157776, "logits/rejected": 0.29283756017684937, "logps/chosen": -446.17352294921875, "logps/rejected": -447.1424865722656, "loss": 0.3012, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0012470056535676122, "rewards/margins": 7.514755270676687e-05, "rewards/rejected": -0.0013221531407907605, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.16278645396232605, "logits/rejected": 0.23292532563209534, "logps/chosen": -390.2185974121094, "logps/rejected": -385.9638977050781, "loss": 0.3074, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.001785513712093234, "rewards/margins": -4.0346338209928945e-05, "rewards/rejected": -0.0017451674211770296, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.21185627579689026, "logits/rejected": 0.16483844816684723, "logps/chosen": -396.40234375, "logps/rejected": -413.7225036621094, "loss": 0.2927, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.003013796405866742, "rewards/margins": 0.0003060643211938441, "rewards/rejected": -0.003319860901683569, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.13742004334926605, "logits/rejected": 0.31375652551651, "logps/chosen": -412.4908142089844, "logps/rejected": -392.556640625, "loss": 0.3019, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.005482032895088196, "rewards/margins": 0.00019160615920554847, "rewards/rejected": -0.005673639010637999, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.22858476638793945, "logits/rejected": 0.2066006362438202, "logps/chosen": -394.327880859375, "logps/rejected": -392.41656494140625, "loss": 0.32, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.009965803474187851, "rewards/margins": 0.00048381154192611575, "rewards/rejected": -0.010449616238474846, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.29500612616539, "logits/rejected": 0.1506035029888153, "logps/chosen": -392.02960205078125, "logps/rejected": -427.17596435546875, "loss": 0.3328, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01748177967965603, "rewards/margins": 0.0010660603875294328, "rewards/rejected": -0.018547840416431427, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.1481747180223465, "logits/rejected": 0.2276589423418045, "logps/chosen": -396.8224182128906, "logps/rejected": -373.9072570800781, "loss": 0.3068, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.019837453961372375, "rewards/margins": 0.0006632342119701207, "rewards/rejected": -0.020500686019659042, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.13089337944984436, "logits/rejected": 0.19030170142650604, "logps/chosen": -379.7206115722656, "logps/rejected": -391.56512451171875, "loss": 0.2966, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.018794242292642593, "rewards/margins": 0.0023801042698323727, "rewards/rejected": -0.021174345165491104, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": 0.13803163170814514, "logits/rejected": 0.24431411921977997, "logps/chosen": -408.5091247558594, "logps/rejected": -396.9778747558594, "loss": 0.3097, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.017420392483472824, "rewards/margins": 0.0018956039566546679, "rewards/rejected": -0.019315997138619423, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": 0.155339315533638, "logits/rejected": 0.1211571916937828, "logps/chosen": -430.47344970703125, "logps/rejected": -437.48956298828125, "loss": 0.2946, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.018004143610596657, "rewards/margins": 0.004096529446542263, "rewards/rejected": -0.022100670263171196, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": 0.1062009185552597, "logits/rejected": 0.26274779438972473, "logps/chosen": -413.7748107910156, "logps/rejected": -401.53778076171875, "loss": 0.3079, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01474347896873951, "rewards/margins": 0.00652940571308136, "rewards/rejected": -0.02127288654446602, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": 0.13843606412410736, "logits/rejected": 0.2405589520931244, "logps/chosen": -445.410888671875, "logps/rejected": -441.92486572265625, "loss": 0.3126, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.016456641256809235, "rewards/margins": 0.006972718983888626, "rewards/rejected": -0.02342936024069786, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": 0.1316756308078766, "logits/rejected": 0.0917954295873642, "logps/chosen": -441.77911376953125, "logps/rejected": -474.7543029785156, "loss": 0.3121, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.030273189768195152, "rewards/margins": 0.006867046467959881, "rewards/rejected": -0.03714023157954216, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": 0.07779763638973236, "logits/rejected": 0.16235283017158508, "logps/chosen": -468.13873291015625, "logps/rejected": -508.16888427734375, "loss": 0.2735, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.042156465351581573, "rewards/margins": 0.025058995932340622, "rewards/rejected": -0.0672154575586319, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": 0.013025517575442791, "logits/rejected": 0.05558537319302559, "logps/chosen": -490.2015686035156, "logps/rejected": -526.5242919921875, "loss": 0.296, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07817033678293228, "rewards/margins": 0.030566949397325516, "rewards/rejected": -0.1087372750043869, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": 0.005712971091270447, "logits/rejected": -0.005013291724026203, "logps/chosen": -577.2238159179688, "logps/rejected": -608.1441650390625, "loss": 0.2962, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.12602511048316956, "rewards/margins": 0.029870549216866493, "rewards/rejected": -0.1558956503868103, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": 0.0872177928686142, "logits/rejected": 0.047161780297756195, "logps/chosen": -466.25384521484375, "logps/rejected": -508.01190185546875, "loss": 0.2824, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.06323900073766708, "rewards/margins": 0.03396384045481682, "rewards/rejected": -0.0972028374671936, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": 0.040699899196624756, "logits/rejected": 0.08730391412973404, "logps/chosen": -451.14190673828125, "logps/rejected": -483.7627868652344, "loss": 0.269, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.05393596366047859, "rewards/margins": 0.02897338569164276, "rewards/rejected": -0.08290934562683105, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": -0.029990673065185547, "logits/rejected": 0.0431547686457634, "logps/chosen": -511.7386779785156, "logps/rejected": -536.8895263671875, "loss": 0.2937, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.06965841352939606, "rewards/margins": 0.030940961092710495, "rewards/rejected": -0.10059938579797745, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": -0.017718762159347534, "logits/rejected": 0.0021002888679504395, "logps/chosen": -475.02398681640625, "logps/rejected": -524.1427001953125, "loss": 0.2823, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.09790189564228058, "rewards/margins": 0.04916772618889809, "rewards/rejected": -0.14706961810588837, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": 0.039277154952287674, "logits/rejected": 0.08032336086034775, "logps/chosen": -542.1201782226562, "logps/rejected": -580.0626220703125, "loss": 0.2699, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11237654834985733, "rewards/margins": 0.05318045616149902, "rewards/rejected": -0.16555699706077576, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": -0.040498532354831696, "logits/rejected": 0.038641445338726044, "logps/chosen": -530.233642578125, "logps/rejected": -566.8374633789062, "loss": 0.3008, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11718879640102386, "rewards/margins": 0.04862760379910469, "rewards/rejected": -0.16581639647483826, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": 0.03174019977450371, "logits/rejected": 0.028316298499703407, "logps/chosen": -544.1737060546875, "logps/rejected": -595.1419677734375, "loss": 0.2807, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1217833161354065, "rewards/margins": 0.05592575669288635, "rewards/rejected": -0.17770907282829285, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": -0.03996685892343521, "logits/rejected": 0.0781441181898117, "logps/chosen": -561.5933227539062, "logps/rejected": -588.9613037109375, "loss": 0.289, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12694257497787476, "rewards/margins": 0.05903642624616623, "rewards/rejected": -0.18597903847694397, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.07835476100444794, "logits/rejected": -0.022311905398964882, "logps/chosen": -542.6710815429688, "logps/rejected": -574.5667114257812, "loss": 0.2814, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.13178928196430206, "rewards/margins": 0.04299298673868179, "rewards/rejected": -0.17478224635124207, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": -0.08058448135852814, "logits/rejected": 0.07340067625045776, "logps/chosen": -508.29901123046875, "logps/rejected": -560.2567138671875, "loss": 0.2709, "rewards/accuracies": 0.46875, "rewards/chosen": -0.11642781645059586, "rewards/margins": 0.05028604343533516, "rewards/rejected": -0.1667138636112213, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": -0.060424257069826126, "logits/rejected": -0.05060155317187309, "logps/chosen": -522.2492065429688, "logps/rejected": -580.984375, "loss": 0.2702, "rewards/accuracies": 0.5, "rewards/chosen": -0.1186426654458046, "rewards/margins": 0.05852733179926872, "rewards/rejected": -0.17716999351978302, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.0831371396780014, "logits/rejected": 0.1073642149567604, "logps/chosen": -578.5635986328125, "logps/rejected": -601.4964599609375, "loss": 0.2702, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14065107703208923, "rewards/margins": 0.04782631993293762, "rewards/rejected": -0.18847739696502686, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": -0.034952230751514435, "logits/rejected": -0.033971935510635376, "logps/chosen": -521.9036865234375, "logps/rejected": -568.0286865234375, "loss": 0.281, "rewards/accuracies": 0.5, "rewards/chosen": -0.10558326542377472, "rewards/margins": 0.05633324384689331, "rewards/rejected": -0.16191650927066803, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.05012059211730957, "logits/rejected": -0.07782770693302155, "logps/chosen": -489.470458984375, "logps/rejected": -555.35302734375, "loss": 0.2723, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09179838001728058, "rewards/margins": 0.06358543783426285, "rewards/rejected": -0.15538384020328522, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.19141286611557007, "logits/rejected": -0.10618031024932861, "logps/chosen": -450.0037536621094, "logps/rejected": -540.8576049804688, "loss": 0.2691, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09631234407424927, "rewards/margins": 0.0666104406118393, "rewards/rejected": -0.16292276978492737, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.05697326734662056, "logits/rejected": -0.13873888552188873, "logps/chosen": -481.6494140625, "logps/rejected": -599.4074096679688, "loss": 0.2811, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12175127118825912, "rewards/margins": 0.0913892462849617, "rewards/rejected": -0.21314053237438202, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": -0.08442874252796173, "logits/rejected": -0.0031359121203422546, "logps/chosen": -474.0492248535156, "logps/rejected": -555.919189453125, "loss": 0.2772, "rewards/accuracies": 0.46875, "rewards/chosen": -0.11865799129009247, "rewards/margins": 0.05636826157569885, "rewards/rejected": -0.17502623796463013, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": -0.0918760672211647, "logits/rejected": 0.022966912016272545, "logps/chosen": -555.8822021484375, "logps/rejected": -670.1365356445312, "loss": 0.2612, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1380760669708252, "rewards/margins": 0.0869637131690979, "rewards/rejected": -0.2250397652387619, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.1127350777387619, "logits/rejected": -0.15412607789039612, "logps/chosen": -516.8878784179688, "logps/rejected": -592.467041015625, "loss": 0.2765, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12640734016895294, "rewards/margins": 0.06364301592111588, "rewards/rejected": -0.19005033373832703, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.1551699936389923, "logits/rejected": -0.055733174085617065, "logps/chosen": -500.0525817871094, "logps/rejected": -564.5525512695312, "loss": 0.271, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.11618797481060028, "rewards/margins": 0.05627555400133133, "rewards/rejected": -0.17246350646018982, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.15336255729198456, "logits/rejected": -0.09010852128267288, "logps/chosen": -532.9185180664062, "logps/rejected": -579.7677612304688, "loss": 0.2518, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09618903696537018, "rewards/margins": 0.059428442269563675, "rewards/rejected": -0.15561747550964355, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.14842228591442108, "logits/rejected": -0.18628902733325958, "logps/chosen": -499.26495361328125, "logps/rejected": -566.7688598632812, "loss": 0.2847, "rewards/accuracies": 0.5, "rewards/chosen": -0.10898256301879883, "rewards/margins": 0.05999414250254631, "rewards/rejected": -0.16897672414779663, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": -0.16273057460784912, "logits/rejected": -0.10563422739505768, "logps/chosen": -554.4479370117188, "logps/rejected": -563.3797607421875, "loss": 0.2849, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10144559293985367, "rewards/margins": 0.04638112708926201, "rewards/rejected": -0.14782671630382538, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": -0.11332042515277863, "logits/rejected": -0.11062748730182648, "logps/chosen": -518.7150268554688, "logps/rejected": -573.0471801757812, "loss": 0.259, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10492125898599625, "rewards/margins": 0.046738140285015106, "rewards/rejected": -0.15165939927101135, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.158113032579422, "logits/rejected": -0.11588151752948761, "logps/chosen": -462.90423583984375, "logps/rejected": -516.649658203125, "loss": 0.2722, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10597708076238632, "rewards/margins": 0.0523579902946949, "rewards/rejected": -0.15833505988121033, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.21510057151317596, "logits/rejected": -0.08049353212118149, "logps/chosen": -583.40283203125, "logps/rejected": -586.7947998046875, "loss": 0.2929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.11892227083444595, "rewards/margins": 0.05448797345161438, "rewards/rejected": -0.17341025173664093, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.12544101476669312, "logits/rejected": -0.11414500325918198, "logps/chosen": -459.9356384277344, "logps/rejected": -505.929931640625, "loss": 0.2729, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.11049865186214447, "rewards/margins": 0.047077327966690063, "rewards/rejected": -0.15757599472999573, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.10923869907855988, "logits/rejected": -0.1209484338760376, "logps/chosen": -453.79071044921875, "logps/rejected": -520.7780151367188, "loss": 0.2685, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09267580509185791, "rewards/margins": 0.06151905655860901, "rewards/rejected": -0.15419486165046692, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": -0.20092570781707764, "logits/rejected": -0.04523925110697746, "logps/chosen": -501.44183349609375, "logps/rejected": -541.0614624023438, "loss": 0.2738, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1075231060385704, "rewards/margins": 0.054855745285749435, "rewards/rejected": -0.16237884759902954, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.18216080963611603, "logits/rejected": -0.09222938120365143, "logps/chosen": -543.4312744140625, "logps/rejected": -569.2079467773438, "loss": 0.2854, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11925343424081802, "rewards/margins": 0.03793327510356903, "rewards/rejected": -0.15718670189380646, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": -0.12351751327514648, "logits/rejected": -0.06279022991657257, "logps/chosen": -496.760498046875, "logps/rejected": -529.7518310546875, "loss": 0.29, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0903807058930397, "rewards/margins": 0.060529064387083054, "rewards/rejected": -0.15090976655483246, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": -0.19289804995059967, "logits/rejected": -0.1217590793967247, "logps/chosen": -558.1929321289062, "logps/rejected": -592.0021362304688, "loss": 0.2771, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11521999537944794, "rewards/margins": 0.05031859874725342, "rewards/rejected": -0.16553862392902374, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.033294953405857086, "logits/rejected": -0.07580285519361496, "logps/chosen": -491.3495178222656, "logps/rejected": -552.3511352539062, "loss": 0.2827, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09923096001148224, "rewards/margins": 0.07192887365818024, "rewards/rejected": -0.17115983366966248, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.2156916856765747, "logits/rejected": -0.06504158675670624, "logps/chosen": -521.3781127929688, "logps/rejected": -539.2896728515625, "loss": 0.2831, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10902263969182968, "rewards/margins": 0.04854800924658775, "rewards/rejected": -0.15757066011428833, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.12651291489601135, "logits/rejected": -0.11372752487659454, "logps/chosen": -529.92529296875, "logps/rejected": -584.7510986328125, "loss": 0.2821, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.11373905837535858, "rewards/margins": 0.055428702384233475, "rewards/rejected": -0.16916777193546295, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.17947567999362946, "logits/rejected": -0.15643848478794098, "logps/chosen": -505.3380432128906, "logps/rejected": -594.7916259765625, "loss": 0.2675, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10552126169204712, "rewards/margins": 0.07201484590768814, "rewards/rejected": -0.17753610014915466, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.1622428297996521, "logits/rejected": -0.08399353176355362, "logps/chosen": -583.8826904296875, "logps/rejected": -565.4668579101562, "loss": 0.2736, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11298346519470215, "rewards/margins": 0.0552997961640358, "rewards/rejected": -0.16828325390815735, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.15642212331295013, "logits/rejected": -0.11185960471630096, "logps/chosen": -478.8877868652344, "logps/rejected": -511.4723205566406, "loss": 0.2606, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09287136793136597, "rewards/margins": 0.05674201250076294, "rewards/rejected": -0.1496133804321289, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": -0.19819292426109314, "logits/rejected": -0.08049633353948593, "logps/chosen": -502.8023986816406, "logps/rejected": -527.1845703125, "loss": 0.2718, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.11550305783748627, "rewards/margins": 0.045039866119623184, "rewards/rejected": -0.16054292023181915, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.19029836356639862, "logits/rejected": -0.12178380787372589, "logps/chosen": -487.24029541015625, "logps/rejected": -522.4542846679688, "loss": 0.2977, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08964806795120239, "rewards/margins": 0.05422767996788025, "rewards/rejected": -0.14387574791908264, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.11798320710659027, "logits/rejected": -0.06398233026266098, "logps/chosen": -555.4868774414062, "logps/rejected": -606.1224365234375, "loss": 0.2674, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.10348886251449585, "rewards/margins": 0.07239896804094315, "rewards/rejected": -0.1758878380060196, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": -0.1332792490720749, "logits/rejected": -0.1340535581111908, "logps/chosen": -518.5892944335938, "logps/rejected": -602.2269287109375, "loss": 0.253, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09993033111095428, "rewards/margins": 0.06545485556125641, "rewards/rejected": -0.1653851717710495, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.24548590183258057, "logits/rejected": -0.07364498823881149, "logps/chosen": -528.7620849609375, "logps/rejected": -581.3148803710938, "loss": 0.2575, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10455663502216339, "rewards/margins": 0.05815718695521355, "rewards/rejected": -0.16271382570266724, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.17857643961906433, "logits/rejected": -0.05776941031217575, "logps/chosen": -495.97613525390625, "logps/rejected": -546.7186279296875, "loss": 0.2811, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09525806456804276, "rewards/margins": 0.07870879769325256, "rewards/rejected": -0.17396686971187592, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.1720505952835083, "logits/rejected": -0.25772562623023987, "logps/chosen": -486.467529296875, "logps/rejected": -584.5782470703125, "loss": 0.2582, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09170956909656525, "rewards/margins": 0.09501364082098007, "rewards/rejected": -0.18672320246696472, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.21994712948799133, "logits/rejected": -0.1386212706565857, "logps/chosen": -567.0380859375, "logps/rejected": -595.9439697265625, "loss": 0.2555, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.11543399095535278, "rewards/margins": 0.06394585222005844, "rewards/rejected": -0.17937985062599182, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.17482668161392212, "logits/rejected": -0.11466997861862183, "logps/chosen": -497.3523864746094, "logps/rejected": -575.636962890625, "loss": 0.2737, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10399528592824936, "rewards/margins": 0.07409018278121948, "rewards/rejected": -0.17808546125888824, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": -0.1267072856426239, "logits/rejected": -0.22677993774414062, "logps/chosen": -562.6024169921875, "logps/rejected": -630.305419921875, "loss": 0.2679, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.11934264004230499, "rewards/margins": 0.08058986812829971, "rewards/rejected": -0.1999325305223465, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.2002047747373581, "logits/rejected": -0.1947084367275238, "logps/chosen": -541.3032836914062, "logps/rejected": -579.1988525390625, "loss": 0.2595, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.10852652788162231, "rewards/margins": 0.0627359002828598, "rewards/rejected": -0.17126242816448212, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": -0.1807963103055954, "logits/rejected": -0.1801833063364029, "logps/chosen": -545.267333984375, "logps/rejected": -617.2647705078125, "loss": 0.2531, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.10741935670375824, "rewards/margins": 0.07981701195240021, "rewards/rejected": -0.18723638355731964, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": -0.16296163201332092, "logits/rejected": -0.16335263848304749, "logps/chosen": -568.46923828125, "logps/rejected": -639.3665771484375, "loss": 0.2827, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1257370114326477, "rewards/margins": 0.07859646528959274, "rewards/rejected": -0.20433346927165985, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": -0.15190322697162628, "logits/rejected": -0.023175863549113274, "logps/chosen": -568.4057006835938, "logps/rejected": -605.7794189453125, "loss": 0.2618, "rewards/accuracies": 0.5, "rewards/chosen": -0.1031126156449318, "rewards/margins": 0.07158243656158447, "rewards/rejected": -0.17469504475593567, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.2676157057285309, "logits/rejected": -0.15279750525951385, "logps/chosen": -566.8143310546875, "logps/rejected": -579.0692138671875, "loss": 0.284, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11574618518352509, "rewards/margins": 0.03857272118330002, "rewards/rejected": -0.1543188989162445, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.13174203038215637, "logits/rejected": -0.1824689358472824, "logps/chosen": -478.6060485839844, "logps/rejected": -531.3140869140625, "loss": 0.2583, "rewards/accuracies": 0.5, "rewards/chosen": -0.08994091302156448, "rewards/margins": 0.053567446768283844, "rewards/rejected": -0.14350834488868713, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.08020667731761932, "logits/rejected": -0.13351663947105408, "logps/chosen": -520.2698974609375, "logps/rejected": -585.7660522460938, "loss": 0.2855, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.10003659874200821, "rewards/margins": 0.06318075209856033, "rewards/rejected": -0.16321733593940735, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.18519246578216553, "logits/rejected": -0.13630035519599915, "logps/chosen": -504.2767639160156, "logps/rejected": -569.7481689453125, "loss": 0.2545, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09188510477542877, "rewards/margins": 0.06354068219661713, "rewards/rejected": -0.1554257869720459, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.19004374742507935, "logits/rejected": -0.1609114110469818, "logps/chosen": -557.939208984375, "logps/rejected": -546.6970825195312, "loss": 0.2965, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10281065851449966, "rewards/margins": 0.04500015825033188, "rewards/rejected": -0.14781081676483154, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.17647784948349, "logits/rejected": -0.13355641067028046, "logps/chosen": -567.4881591796875, "logps/rejected": -612.3013305664062, "loss": 0.2652, "rewards/accuracies": 0.5, "rewards/chosen": -0.11634314060211182, "rewards/margins": 0.061061274260282516, "rewards/rejected": -0.17740443348884583, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.2159070074558258, "logits/rejected": -0.10073345899581909, "logps/chosen": -459.47149658203125, "logps/rejected": -529.6015014648438, "loss": 0.2666, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08737196773290634, "rewards/margins": 0.06437215954065323, "rewards/rejected": -0.15174412727355957, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.1441185027360916, "logits/rejected": -0.027431348338723183, "logps/chosen": -476.61798095703125, "logps/rejected": -507.6756286621094, "loss": 0.2678, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09565510600805283, "rewards/margins": 0.05930342152714729, "rewards/rejected": -0.15495853126049042, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": -0.14351201057434082, "logits/rejected": -0.13506287336349487, "logps/chosen": -515.0467529296875, "logps/rejected": -579.8074951171875, "loss": 0.2695, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.10201451927423477, "rewards/margins": 0.05492178350687027, "rewards/rejected": -0.15693630278110504, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.12005716562271118, "logits/rejected": -0.13143005967140198, "logps/chosen": -480.64166259765625, "logps/rejected": -542.0699462890625, "loss": 0.2681, "rewards/accuracies": 0.5, "rewards/chosen": -0.10447077453136444, "rewards/margins": 0.0592375211417675, "rewards/rejected": -0.16370829939842224, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.12967665493488312, "logits/rejected": -0.09862512350082397, "logps/chosen": -504.17559814453125, "logps/rejected": -561.1007690429688, "loss": 0.2608, "rewards/accuracies": 0.5, "rewards/chosen": -0.10162999480962753, "rewards/margins": 0.052912771701812744, "rewards/rejected": -0.15454277396202087, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.12466283142566681, "logits/rejected": -0.18476256728172302, "logps/chosen": -521.7217407226562, "logps/rejected": -606.356689453125, "loss": 0.2598, "rewards/accuracies": 0.5, "rewards/chosen": -0.11335861682891846, "rewards/margins": 0.0591861791908741, "rewards/rejected": -0.17254477739334106, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.17083628475666046, "logits/rejected": -0.12109515815973282, "logps/chosen": -456.31787109375, "logps/rejected": -514.2985229492188, "loss": 0.2716, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.09381003677845001, "rewards/margins": 0.06866031140089035, "rewards/rejected": -0.16247034072875977, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.16086441278457642, "logits/rejected": -0.13963501155376434, "logps/chosen": -485.7345275878906, "logps/rejected": -566.4594116210938, "loss": 0.2668, "rewards/accuracies": 0.5, "rewards/chosen": -0.08713211864233017, "rewards/margins": 0.0719505250453949, "rewards/rejected": -0.15908263623714447, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.08465079963207245, "logits/rejected": -0.21077242493629456, "logps/chosen": -507.953369140625, "logps/rejected": -587.9386596679688, "loss": 0.2522, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09845003485679626, "rewards/margins": 0.0746922716498375, "rewards/rejected": -0.17314231395721436, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.12216424942016602, "logits/rejected": -0.10754810273647308, "logps/chosen": -534.802734375, "logps/rejected": -569.3134155273438, "loss": 0.2789, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1041802167892456, "rewards/margins": 0.06386038661003113, "rewards/rejected": -0.16804060339927673, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.2511650621891022, "logits/rejected": -0.1288149058818817, "logps/chosen": -511.1424255371094, "logps/rejected": -537.6439208984375, "loss": 0.2629, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.08994197845458984, "rewards/margins": 0.051044024527072906, "rewards/rejected": -0.14098599553108215, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.2117297351360321, "logits/rejected": -0.17164471745491028, "logps/chosen": -505.76715087890625, "logps/rejected": -567.2267456054688, "loss": 0.2535, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0884295180439949, "rewards/margins": 0.07234706729650497, "rewards/rejected": -0.16077657043933868, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": -0.13743457198143005, "logits/rejected": -0.13362528383731842, "logps/chosen": -507.50946044921875, "logps/rejected": -544.0374755859375, "loss": 0.2857, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10853898525238037, "rewards/margins": 0.044584743678569794, "rewards/rejected": -0.15312373638153076, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": -0.16523101925849915, "logits/rejected": -0.1837397962808609, "logps/chosen": -475.5501403808594, "logps/rejected": -527.5878295898438, "loss": 0.2617, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.08936251699924469, "rewards/margins": 0.06296424567699432, "rewards/rejected": -0.15232674777507782, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.23205402493476868, "logits/rejected": -0.2261538952589035, "logps/chosen": -500.764892578125, "logps/rejected": -598.1412963867188, "loss": 0.2395, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09027452021837234, "rewards/margins": 0.07599581778049469, "rewards/rejected": -0.16627033054828644, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.18639487028121948, "logits/rejected": -0.2467382252216339, "logps/chosen": -519.2239379882812, "logps/rejected": -602.8407592773438, "loss": 0.2546, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09007105976343155, "rewards/margins": 0.07813303172588348, "rewards/rejected": -0.16820409893989563, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": -0.09497157484292984, "logits/rejected": -0.20631170272827148, "logps/chosen": -507.2242126464844, "logps/rejected": -574.6315307617188, "loss": 0.2819, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10080975294113159, "rewards/margins": 0.06279795616865158, "rewards/rejected": -0.16360768675804138, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": -0.13831308484077454, "logits/rejected": -0.15870514512062073, "logps/chosen": -443.75421142578125, "logps/rejected": -540.8892822265625, "loss": 0.2713, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09177975356578827, "rewards/margins": 0.07154536247253418, "rewards/rejected": -0.16332513093948364, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.2777036651094288, "train_runtime": 6958.035, "train_samples_per_second": 4.312, "train_steps_per_second": 0.135 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }