{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": -0.1885092854499817, "logits/rejected": -0.3158565163612366, "logps/chosen": -579.0175170898438, "logps/rejected": -485.2366638183594, "loss": 0.2285, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": -0.14487330615520477, "logits/rejected": -0.07772153615951538, "logps/chosen": -490.6266174316406, "logps/rejected": -480.7708740234375, "loss": 0.21, "rewards/accuracies": 0.3263888955116272, "rewards/chosen": -0.00022973844897933304, "rewards/margins": -7.616530638188124e-05, "rewards/rejected": -0.0001535731425974518, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": -0.17436349391937256, "logits/rejected": -0.20474335551261902, "logps/chosen": -491.7076721191406, "logps/rejected": -509.813232421875, "loss": 0.2025, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -9.073010733118281e-05, "rewards/margins": 9.670343570178375e-05, "rewards/rejected": -0.00018743352848105133, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": -0.13612958788871765, "logits/rejected": -0.13220253586769104, "logps/chosen": -529.9112548828125, "logps/rejected": -542.8663940429688, "loss": 0.2153, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0004884627414867282, "rewards/margins": 0.00024191811098717153, "rewards/rejected": -0.0007303808815777302, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": -0.10267192125320435, "logits/rejected": -0.09803201258182526, "logps/chosen": -578.48291015625, "logps/rejected": -594.6197509765625, "loss": 0.2087, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0006708616274408996, "rewards/margins": 0.0006775633082725108, "rewards/rejected": -0.0013484249357134104, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": -0.10303981602191925, "logits/rejected": -0.133346289396286, "logps/chosen": -511.7806091308594, "logps/rejected": -534.8740234375, "loss": 0.21, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0018150504911318421, "rewards/margins": 0.0009007491171360016, "rewards/rejected": -0.0027157997246831656, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": -0.22180834412574768, "logits/rejected": -0.08587469905614853, "logps/chosen": -550.4360961914062, "logps/rejected": -563.314697265625, "loss": 0.212, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.005344606935977936, "rewards/margins": 0.0011552043724805117, "rewards/rejected": -0.006499811075627804, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": -0.1192946806550026, "logits/rejected": -0.10370689630508423, "logps/chosen": -537.7994384765625, "logps/rejected": -544.24951171875, "loss": 0.2132, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010287364944815636, "rewards/margins": 0.003959923516958952, "rewards/rejected": -0.0142472879961133, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": -0.12416684627532959, "logits/rejected": -0.1819939911365509, "logps/chosen": -515.0903930664062, "logps/rejected": -531.8945922851562, "loss": 0.21, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.015505559742450714, "rewards/margins": 0.003083501709625125, "rewards/rejected": -0.018589060753583908, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": -0.2256217896938324, "logits/rejected": -0.21886661648750305, "logps/chosen": -503.46875, "logps/rejected": -517.8480224609375, "loss": 0.2086, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.02677309513092041, "rewards/margins": 0.004604019224643707, "rewards/rejected": -0.03137711435556412, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": -0.14829424023628235, "logits/rejected": -0.23846416175365448, "logps/chosen": -551.840087890625, "logps/rejected": -553.8361206054688, "loss": 0.1984, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.04528447985649109, "rewards/margins": 0.004657699726521969, "rewards/rejected": -0.04994218051433563, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": -0.25657883286476135, "logits/rejected": -0.28871434926986694, "logps/chosen": -600.0863647460938, "logps/rejected": -634.4995727539062, "loss": 0.1992, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0750977173447609, "rewards/margins": 0.020321359857916832, "rewards/rejected": -0.09541907161474228, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": -0.2847597897052765, "logits/rejected": -0.2475430965423584, "logps/chosen": -621.210693359375, "logps/rejected": -635.7638549804688, "loss": 0.2029, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08808945119380951, "rewards/margins": 0.016084905713796616, "rewards/rejected": -0.10417436063289642, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": -0.19638505578041077, "logits/rejected": -0.2249457836151123, "logps/chosen": -586.533203125, "logps/rejected": -623.6920166015625, "loss": 0.1987, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09554356336593628, "rewards/margins": 0.026452088728547096, "rewards/rejected": -0.12199564278125763, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": -0.2038092166185379, "logits/rejected": -0.2346273958683014, "logps/chosen": -569.3733520507812, "logps/rejected": -591.7291259765625, "loss": 0.1968, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.08128508180379868, "rewards/margins": 0.024774957448244095, "rewards/rejected": -0.10606005042791367, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": -0.26792481541633606, "logits/rejected": -0.16466854512691498, "logps/chosen": -599.0369262695312, "logps/rejected": -654.7034912109375, "loss": 0.2017, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.08535520732402802, "rewards/margins": 0.025586510077118874, "rewards/rejected": -0.11094172298908234, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": -0.1864621341228485, "logits/rejected": -0.2081078737974167, "logps/chosen": -529.7960205078125, "logps/rejected": -589.953857421875, "loss": 0.199, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.06579065322875977, "rewards/margins": 0.027334023267030716, "rewards/rejected": -0.09312467277050018, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": -0.26899057626724243, "logits/rejected": -0.22697623074054718, "logps/chosen": -629.7769165039062, "logps/rejected": -715.6519775390625, "loss": 0.1984, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08880215883255005, "rewards/margins": 0.038344450294971466, "rewards/rejected": -0.1271466165781021, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": -0.2153758555650711, "logits/rejected": -0.09847669303417206, "logps/chosen": -593.626953125, "logps/rejected": -643.131591796875, "loss": 0.2048, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08825898915529251, "rewards/margins": 0.031147807836532593, "rewards/rejected": -0.1194067969918251, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": -0.12460646778345108, "logits/rejected": -0.21595044434070587, "logps/chosen": -633.2433471679688, "logps/rejected": -660.8668212890625, "loss": 0.2002, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09575756639242172, "rewards/margins": 0.027992555871605873, "rewards/rejected": -0.12375012785196304, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": -0.14323315024375916, "logits/rejected": -0.2139279544353485, "logps/chosen": -578.2418212890625, "logps/rejected": -605.6033325195312, "loss": 0.2023, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.08724905550479889, "rewards/margins": 0.01588294841349125, "rewards/rejected": -0.1031319871544838, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": -0.15528282523155212, "logits/rejected": -0.15554766356945038, "logps/chosen": -628.6737060546875, "logps/rejected": -688.3468017578125, "loss": 0.1956, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08062631636857986, "rewards/margins": 0.026851017028093338, "rewards/rejected": -0.1074773296713829, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": -0.18030421435832977, "logits/rejected": -0.06279157847166061, "logps/chosen": -651.3761596679688, "logps/rejected": -689.3553466796875, "loss": 0.1887, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09219054132699966, "rewards/margins": 0.028040152043104172, "rewards/rejected": -0.12023069709539413, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": -0.16874362528324127, "logits/rejected": -0.1639997959136963, "logps/chosen": -555.4114990234375, "logps/rejected": -628.8604736328125, "loss": 0.1989, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1045011430978775, "rewards/margins": 0.02383086457848549, "rewards/rejected": -0.1283320039510727, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": -0.12631872296333313, "logits/rejected": -0.13256661593914032, "logps/chosen": -553.3485717773438, "logps/rejected": -603.0968627929688, "loss": 0.1915, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.08940593898296356, "rewards/margins": 0.04212746024131775, "rewards/rejected": -0.1315334141254425, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": -0.1083054393529892, "logits/rejected": -0.0742143839597702, "logps/chosen": -599.7116088867188, "logps/rejected": -652.4061279296875, "loss": 0.1884, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09931150823831558, "rewards/margins": 0.03657541796565056, "rewards/rejected": -0.13588692247867584, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.11171998083591461, "logits/rejected": -0.16903842985630035, "logps/chosen": -593.5734252929688, "logps/rejected": -672.16552734375, "loss": 0.1929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09612639248371124, "rewards/margins": 0.05510964244604111, "rewards/rejected": -0.15123602747917175, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": -0.09803778678178787, "logits/rejected": -0.08885441720485687, "logps/chosen": -665.0200805664062, "logps/rejected": -690.63818359375, "loss": 0.2074, "rewards/accuracies": 0.40625, "rewards/chosen": -0.08989432454109192, "rewards/margins": 0.02165227010846138, "rewards/rejected": -0.111546590924263, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": -0.005570247769355774, "logits/rejected": -0.13455268740653992, "logps/chosen": -609.916748046875, "logps/rejected": -643.1370849609375, "loss": 0.1925, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.06449146568775177, "rewards/margins": 0.022723758593201637, "rewards/rejected": -0.08721521496772766, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.027301525697112083, "logits/rejected": -0.0810176208615303, "logps/chosen": -541.7484130859375, "logps/rejected": -617.4515380859375, "loss": 0.1958, "rewards/accuracies": 0.53125, "rewards/chosen": -0.062103599309921265, "rewards/margins": 0.0201987586915493, "rewards/rejected": -0.08230235427618027, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": 0.005954580847173929, "logits/rejected": -0.03331022337079048, "logps/chosen": -575.2536010742188, "logps/rejected": -629.2679443359375, "loss": 0.1965, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.07110811769962311, "rewards/margins": 0.021345119923353195, "rewards/rejected": -0.0924532413482666, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.05120337754487991, "logits/rejected": -0.07742851972579956, "logps/chosen": -555.53662109375, "logps/rejected": -598.7825927734375, "loss": 0.1928, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07844728976488113, "rewards/margins": 0.03324751555919647, "rewards/rejected": -0.11169479787349701, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.10243277251720428, "logits/rejected": -0.03584013134241104, "logps/chosen": -573.73046875, "logps/rejected": -552.7764892578125, "loss": 0.206, "rewards/accuracies": 0.40625, "rewards/chosen": -0.06825848668813705, "rewards/margins": 0.009243585169315338, "rewards/rejected": -0.0775020569562912, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.028592532500624657, "logits/rejected": 0.022961582988500595, "logps/chosen": -559.4745483398438, "logps/rejected": -626.19482421875, "loss": 0.1891, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.05221911519765854, "rewards/margins": 0.037141233682632446, "rewards/rejected": -0.08936034888029099, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": 0.02807508036494255, "logits/rejected": -0.025245526805520058, "logps/chosen": -576.9495239257812, "logps/rejected": -612.4227905273438, "loss": 0.1949, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.04981788620352745, "rewards/margins": 0.029762808233499527, "rewards/rejected": -0.07958068698644638, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": 0.014679011888802052, "logits/rejected": 0.05258216708898544, "logps/chosen": -537.677978515625, "logps/rejected": -593.5046997070312, "loss": 0.1982, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.063970185816288, "rewards/margins": 0.030909085646271706, "rewards/rejected": -0.09487926214933395, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.10759621858596802, "logits/rejected": -0.07801146060228348, "logps/chosen": -577.882568359375, "logps/rejected": -652.2096557617188, "loss": 0.1919, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07402776181697845, "rewards/margins": 0.038211267441511154, "rewards/rejected": -0.1122390404343605, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.02245343290269375, "logits/rejected": 0.03868962079286575, "logps/chosen": -616.5308837890625, "logps/rejected": -642.6898193359375, "loss": 0.1978, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08398912847042084, "rewards/margins": 0.02132398448884487, "rewards/rejected": -0.10531310737133026, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.07354136556386948, "logits/rejected": -0.04209035634994507, "logps/chosen": -616.562744140625, "logps/rejected": -685.1181030273438, "loss": 0.2001, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0954984650015831, "rewards/margins": 0.04605044052004814, "rewards/rejected": -0.14154890179634094, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.11906121671199799, "logits/rejected": -0.056663453578948975, "logps/chosen": -596.8269653320312, "logps/rejected": -644.7449340820312, "loss": 0.1951, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08853301405906677, "rewards/margins": 0.0294196717441082, "rewards/rejected": -0.11795268207788467, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": 0.013636293821036816, "logits/rejected": -0.04586212337017059, "logps/chosen": -599.9013671875, "logps/rejected": -657.8106079101562, "loss": 0.2035, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09106438606977463, "rewards/margins": 0.04611728712916374, "rewards/rejected": -0.13718166947364807, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": 0.08837394416332245, "logits/rejected": -0.07880507409572601, "logps/chosen": -598.9387817382812, "logps/rejected": -617.7493286132812, "loss": 0.2047, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.07388485968112946, "rewards/margins": 0.019855108112096786, "rewards/rejected": -0.09373997151851654, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.005928731057792902, "logits/rejected": -0.030420657247304916, "logps/chosen": -558.4488525390625, "logps/rejected": -602.2744750976562, "loss": 0.2045, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.06963202357292175, "rewards/margins": 0.03080623783171177, "rewards/rejected": -0.10043825954198837, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.06051339581608772, "logits/rejected": 0.0010267883772030473, "logps/chosen": -594.7051391601562, "logps/rejected": -662.65625, "loss": 0.1935, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07640456408262253, "rewards/margins": 0.04043372720479965, "rewards/rejected": -0.11683829128742218, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.07280877977609634, "logits/rejected": -0.03391597419977188, "logps/chosen": -590.2994384765625, "logps/rejected": -659.5791015625, "loss": 0.1965, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.068049356341362, "rewards/margins": 0.03281210735440254, "rewards/rejected": -0.10086147487163544, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.00242437282577157, "logits/rejected": -0.059494733810424805, "logps/chosen": -576.4191284179688, "logps/rejected": -637.6677856445312, "loss": 0.189, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.06889624148607254, "rewards/margins": 0.041952721774578094, "rewards/rejected": -0.11084897816181183, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": 0.023568499833345413, "logits/rejected": -0.002035862300544977, "logps/chosen": -549.110107421875, "logps/rejected": -576.5599365234375, "loss": 0.1978, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06544467061758041, "rewards/margins": 0.02002917230129242, "rewards/rejected": -0.08547384291887283, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.07891889661550522, "logits/rejected": 0.012334518134593964, "logps/chosen": -603.6246337890625, "logps/rejected": -670.80615234375, "loss": 0.1957, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07147827744483948, "rewards/margins": 0.032703179866075516, "rewards/rejected": -0.10418146848678589, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": 0.04133855551481247, "logits/rejected": -0.06480925530195236, "logps/chosen": -609.1380004882812, "logps/rejected": -640.5394287109375, "loss": 0.1896, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07929681241512299, "rewards/margins": 0.025448182597756386, "rewards/rejected": -0.10474499315023422, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": 0.053513627499341965, "logits/rejected": -0.0014798849588260055, "logps/chosen": -560.655517578125, "logps/rejected": -680.7060546875, "loss": 0.1833, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06273628771305084, "rewards/margins": 0.05317532271146774, "rewards/rejected": -0.11591160297393799, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.02448629029095173, "logits/rejected": 0.0629236251115799, "logps/chosen": -592.8604125976562, "logps/rejected": -623.6454467773438, "loss": 0.188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07425443828105927, "rewards/margins": 0.044731441885232925, "rewards/rejected": -0.1189858689904213, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.08944495767354965, "logits/rejected": 0.10957720130681992, "logps/chosen": -582.0028076171875, "logps/rejected": -623.3414306640625, "loss": 0.1852, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08551887422800064, "rewards/margins": 0.04520031064748764, "rewards/rejected": -0.1307191699743271, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.06701700389385223, "logits/rejected": -0.07697690278291702, "logps/chosen": -553.2176513671875, "logps/rejected": -593.4279174804688, "loss": 0.199, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09521619975566864, "rewards/margins": 0.033401116728782654, "rewards/rejected": -0.1286173164844513, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.0331454835832119, "logits/rejected": -0.10423725843429565, "logps/chosen": -543.7550048828125, "logps/rejected": -637.5558471679688, "loss": 0.1833, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07857955992221832, "rewards/margins": 0.043745510280132294, "rewards/rejected": -0.12232507765293121, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.04820919781923294, "logits/rejected": -0.08856736123561859, "logps/chosen": -550.280517578125, "logps/rejected": -695.6846313476562, "loss": 0.182, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0971294417977333, "rewards/margins": 0.06545992940664291, "rewards/rejected": -0.16258935630321503, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.06888549029827118, "logits/rejected": -0.019281355664134026, "logps/chosen": -631.0794067382812, "logps/rejected": -681.6578369140625, "loss": 0.1999, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0913894921541214, "rewards/margins": 0.034925952553749084, "rewards/rejected": -0.12631544470787048, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": 0.05780111625790596, "logits/rejected": -0.08857734501361847, "logps/chosen": -554.0203857421875, "logps/rejected": -672.1288452148438, "loss": 0.1885, "rewards/accuracies": 0.5, "rewards/chosen": -0.08766164630651474, "rewards/margins": 0.05516546964645386, "rewards/rejected": -0.1428271234035492, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.07425542920827866, "logits/rejected": -0.14232522249221802, "logps/chosen": -590.5303955078125, "logps/rejected": -634.2200317382812, "loss": 0.1922, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.08620771020650864, "rewards/margins": 0.02542172744870186, "rewards/rejected": -0.1116294413805008, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.04322206228971481, "logits/rejected": 0.028172429651021957, "logps/chosen": -600.7364501953125, "logps/rejected": -676.2041625976562, "loss": 0.1917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07297880947589874, "rewards/margins": 0.03739452734589577, "rewards/rejected": -0.11037333309650421, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": 0.03321009874343872, "logits/rejected": -0.012934012338519096, "logps/chosen": -562.2665405273438, "logps/rejected": -677.3361206054688, "loss": 0.1969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06490649282932281, "rewards/margins": 0.03305616229772568, "rewards/rejected": -0.09796266257762909, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.0036629363894462585, "logits/rejected": 0.04252059385180473, "logps/chosen": -574.91064453125, "logps/rejected": -599.8869018554688, "loss": 0.1941, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.06440480798482895, "rewards/margins": 0.02049572765827179, "rewards/rejected": -0.08490053564310074, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.028811585158109665, "logits/rejected": -0.0507902130484581, "logps/chosen": -614.2153930664062, "logps/rejected": -697.7538452148438, "loss": 0.1922, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07627642154693604, "rewards/margins": 0.028858337551355362, "rewards/rejected": -0.1051347628235817, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.06321687996387482, "logits/rejected": 0.04666703939437866, "logps/chosen": -562.6372680664062, "logps/rejected": -623.3756713867188, "loss": 0.1964, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07641318440437317, "rewards/margins": 0.02390345185995102, "rewards/rejected": -0.10031662881374359, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.03132300823926926, "logits/rejected": 0.05715359374880791, "logps/chosen": -556.3076171875, "logps/rejected": -603.981689453125, "loss": 0.188, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0778612419962883, "rewards/margins": 0.032999925315380096, "rewards/rejected": -0.1108611598610878, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.0160093754529953, "logits/rejected": -0.033678505569696426, "logps/chosen": -561.7026977539062, "logps/rejected": -632.2767333984375, "loss": 0.1998, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08461053669452667, "rewards/margins": 0.027624454349279404, "rewards/rejected": -0.11223499476909637, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": 0.033716317266225815, "logits/rejected": -0.05850861221551895, "logps/chosen": -576.638916015625, "logps/rejected": -658.7437133789062, "loss": 0.1877, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07561091333627701, "rewards/margins": 0.035250525921583176, "rewards/rejected": -0.11086144298315048, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.0035046476405113935, "logits/rejected": -0.01969769224524498, "logps/chosen": -594.3090209960938, "logps/rejected": -659.2559814453125, "loss": 0.1831, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07316828519105911, "rewards/margins": 0.05110269784927368, "rewards/rejected": -0.12427099049091339, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": 0.015228897333145142, "logits/rejected": -0.07918987423181534, "logps/chosen": -573.0462646484375, "logps/rejected": -665.521728515625, "loss": 0.1846, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.060265183448791504, "rewards/margins": 0.053003955632448196, "rewards/rejected": -0.1132691353559494, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": 0.04909176006913185, "logits/rejected": 0.0019339825958013535, "logps/chosen": -591.9546508789062, "logps/rejected": -646.3989868164062, "loss": 0.1872, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07109365612268448, "rewards/margins": 0.050017982721328735, "rewards/rejected": -0.12111164629459381, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": -0.06201664358377457, "logits/rejected": 0.021130381152033806, "logps/chosen": -599.9959716796875, "logps/rejected": -649.0542602539062, "loss": 0.1803, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07730476558208466, "rewards/margins": 0.035370949655771255, "rewards/rejected": -0.11267571151256561, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.060487646609544754, "logits/rejected": -0.031666141003370285, "logps/chosen": -546.1062622070312, "logps/rejected": -622.996337890625, "loss": 0.1898, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07208921015262604, "rewards/margins": 0.048208087682724, "rewards/rejected": -0.12029729783535004, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.043236102908849716, "logits/rejected": 0.010083493776619434, "logps/chosen": -565.9916381835938, "logps/rejected": -666.7619018554688, "loss": 0.1877, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07324135303497314, "rewards/margins": 0.0542600154876709, "rewards/rejected": -0.12750136852264404, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.03003951907157898, "logits/rejected": -0.023655174300074577, "logps/chosen": -593.333740234375, "logps/rejected": -679.3707275390625, "loss": 0.1821, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07741405069828033, "rewards/margins": 0.040637485682964325, "rewards/rejected": -0.11805154383182526, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.10231657326221466, "logits/rejected": -0.00636244285851717, "logps/chosen": -564.58154296875, "logps/rejected": -615.8031005859375, "loss": 0.1959, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.07730068266391754, "rewards/margins": 0.038891423493623734, "rewards/rejected": -0.11619208753108978, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.12688763439655304, "logits/rejected": 0.07261992990970612, "logps/chosen": -589.0942993164062, "logps/rejected": -604.7650756835938, "loss": 0.193, "rewards/accuracies": 0.5, "rewards/chosen": -0.09481467306613922, "rewards/margins": 0.03096696175634861, "rewards/rejected": -0.12578162550926208, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.0817611813545227, "logits/rejected": -0.010524662211537361, "logps/chosen": -537.4359130859375, "logps/rejected": -617.6580810546875, "loss": 0.1898, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08204906433820724, "rewards/margins": 0.05059989541769028, "rewards/rejected": -0.13264895975589752, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.08540595322847366, "logits/rejected": -0.019884133711457253, "logps/chosen": -569.3452758789062, "logps/rejected": -623.3587646484375, "loss": 0.1916, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07342539727687836, "rewards/margins": 0.03685514256358147, "rewards/rejected": -0.11028053611516953, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.0825069323182106, "logits/rejected": 0.03379444032907486, "logps/chosen": -560.5318603515625, "logps/rejected": -629.8617553710938, "loss": 0.1859, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.07376088947057724, "rewards/margins": 0.036222193390131, "rewards/rejected": -0.10998308658599854, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": 0.015585740096867085, "logits/rejected": -0.09954878687858582, "logps/chosen": -536.1882934570312, "logps/rejected": -646.0011596679688, "loss": 0.1864, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08902934193611145, "rewards/margins": 0.054770153015851974, "rewards/rejected": -0.14379946887493134, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.0846419557929039, "logits/rejected": -0.027601266279816628, "logps/chosen": -524.1680297851562, "logps/rejected": -655.0399169921875, "loss": 0.1905, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.06788254529237747, "rewards/margins": 0.07369416952133179, "rewards/rejected": -0.14157672226428986, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.04564080387353897, "logits/rejected": -0.056423623114824295, "logps/chosen": -556.4688720703125, "logps/rejected": -622.6414794921875, "loss": 0.1852, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0697588101029396, "rewards/margins": 0.047328755259513855, "rewards/rejected": -0.11708758026361465, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.00408951798453927, "logits/rejected": -0.05295505374670029, "logps/chosen": -588.3592529296875, "logps/rejected": -639.141845703125, "loss": 0.1884, "rewards/accuracies": 0.5, "rewards/chosen": -0.08244818449020386, "rewards/margins": 0.04398036748170853, "rewards/rejected": -0.1264285445213318, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.023698529228568077, "logits/rejected": -0.033614348620176315, "logps/chosen": -550.3116455078125, "logps/rejected": -649.8366088867188, "loss": 0.1813, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.06834039092063904, "rewards/margins": 0.056421488523483276, "rewards/rejected": -0.12476189434528351, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.020034242421388626, "logits/rejected": -0.053077150136232376, "logps/chosen": -530.62744140625, "logps/rejected": -573.3870239257812, "loss": 0.1829, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.07050155103206635, "rewards/margins": 0.03381948173046112, "rewards/rejected": -0.10432104766368866, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.03530939295887947, "logits/rejected": -0.05062105506658554, "logps/chosen": -612.7421875, "logps/rejected": -640.69580078125, "loss": 0.1912, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08575752377510071, "rewards/margins": 0.034512124955654144, "rewards/rejected": -0.12026965618133545, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.02685430273413658, "logits/rejected": -0.050860174000263214, "logps/chosen": -561.8297119140625, "logps/rejected": -601.8411865234375, "loss": 0.1938, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07358418405056, "rewards/margins": 0.03662911430001259, "rewards/rejected": -0.11021329462528229, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.0493302047252655, "logits/rejected": 0.006892223842442036, "logps/chosen": -599.8060913085938, "logps/rejected": -692.0012817382812, "loss": 0.1973, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07736798375844955, "rewards/margins": 0.050996191799640656, "rewards/rejected": -0.1283641755580902, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.02599761262536049, "logits/rejected": -0.013713346794247627, "logps/chosen": -580.1739501953125, "logps/rejected": -671.7718505859375, "loss": 0.1925, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07879254966974258, "rewards/margins": 0.05689109489321709, "rewards/rejected": -0.13568365573883057, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": 0.02115291729569435, "logits/rejected": -0.052051056176424026, "logps/chosen": -584.3345336914062, "logps/rejected": -611.1611328125, "loss": 0.1831, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0652504563331604, "rewards/margins": 0.03656047582626343, "rewards/rejected": -0.10181091725826263, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": 0.001644585281610489, "logits/rejected": -0.010257053188979626, "logps/chosen": -632.9583740234375, "logps/rejected": -683.1160888671875, "loss": 0.204, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08045943081378937, "rewards/margins": 0.047462526708841324, "rewards/rejected": -0.1279219686985016, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.13961699604988098, "logits/rejected": -0.039934299886226654, "logps/chosen": -542.2841186523438, "logps/rejected": -602.2158813476562, "loss": 0.1903, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.08329685032367706, "rewards/margins": 0.034699175506830215, "rewards/rejected": -0.11799603700637817, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.06002987176179886, "logits/rejected": -0.00015243441157508641, "logps/chosen": -576.7723388671875, "logps/rejected": -622.7515869140625, "loss": 0.1949, "rewards/accuracies": 0.5, "rewards/chosen": -0.0770721584558487, "rewards/margins": 0.03859367594122887, "rewards/rejected": -0.11566583812236786, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": 0.030324691906571388, "logits/rejected": -0.10050855576992035, "logps/chosen": -561.7693481445312, "logps/rejected": -647.3151245117188, "loss": 0.1875, "rewards/accuracies": 0.5, "rewards/chosen": -0.07844243943691254, "rewards/margins": 0.04898718744516373, "rewards/rejected": -0.12742963433265686, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": 0.004024127032607794, "logits/rejected": -0.07468675822019577, "logps/chosen": -574.8939208984375, "logps/rejected": -618.0397338867188, "loss": 0.1861, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08176033198833466, "rewards/margins": 0.037909943610429764, "rewards/rejected": -0.11967027187347412, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.19473119514220044, "train_runtime": 7963.8235, "train_samples_per_second": 3.767, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }