diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1019 +1,2349 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.9950825430277486, + "epoch": 1.9988002399520095, "eval_steps": 10000, - "global_step": 710, + "global_step": 1666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03, - "learning_rate": 1.4084507042253522e-07, - "logits/chosen": -0.018687758594751358, - "logits/rejected": 0.044545598328113556, - "logps/chosen": -322.3531799316406, - "logps/rejected": -218.51559448242188, - "loss": 0.6929, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": 0.0003283830883447081, - "rewards/margins": 0.0007339473813772202, - "rewards/rejected": -0.000405564351240173, + "epoch": 0.01, + "learning_rate": 5.988023952095808e-08, + "logits/chosen": 0.08604730665683746, + "logits/rejected": 0.14735615253448486, + "logps/chosen": -306.490966796875, + "logps/rejected": -284.1272277832031, + "loss": 0.3265, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00030196673469617963, + "rewards/margins": 0.0003302523400634527, + "rewards/rejected": -2.8285570806474425e-05, "step": 10 }, { - "epoch": 0.06, - "learning_rate": 2.8169014084507043e-07, - "logits/chosen": -0.04173532500863075, - "logits/rejected": -0.03870970755815506, - "logps/chosen": -334.37646484375, - "logps/rejected": -199.8443603515625, - "loss": 0.6921, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -0.0004360534658189863, - "rewards/margins": 0.0012426268076524138, - "rewards/rejected": -0.0016786803025752306, + "epoch": 0.02, + "learning_rate": 1.1976047904191617e-07, + "logits/chosen": 0.07679140567779541, + "logits/rejected": 0.13602732121944427, + "logps/chosen": -270.78839111328125, + "logps/rejected": -239.93063354492188, + "loss": 0.3261, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0004934846656396985, + "rewards/margins": 0.0003515507560223341, + "rewards/rejected": 0.00014193373499438167, "step": 20 }, { - "epoch": 0.08, - "learning_rate": 4.225352112676056e-07, - "logits/chosen": -0.10300469398498535, - "logits/rejected": -0.022432664409279823, - "logps/chosen": -429.2939453125, - "logps/rejected": -253.77828979492188, - "loss": 0.6849, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.012739419937133789, - "rewards/margins": 0.02927268110215664, - "rewards/rejected": -0.016533263027668, + "epoch": 0.04, + "learning_rate": 1.7964071856287425e-07, + "logits/chosen": 0.08353379368782043, + "logits/rejected": 0.18339572846889496, + "logps/chosen": -350.9549255371094, + "logps/rejected": -297.22540283203125, + "loss": 0.3184, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0014796562027186155, + "rewards/margins": 0.0019894675351679325, + "rewards/rejected": -0.0005098110414110124, "step": 30 }, { - "epoch": 0.11, - "learning_rate": 5.633802816901409e-07, - "logits/chosen": 0.0008204568293876946, - "logits/rejected": 0.04422558471560478, - "logps/chosen": -365.8589782714844, - "logps/rejected": -268.4306640625, - "loss": 0.6741, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.012456291355192661, - "rewards/margins": 0.04383957386016846, - "rewards/rejected": -0.056295864284038544, + "epoch": 0.05, + "learning_rate": 2.3952095808383233e-07, + "logits/chosen": 0.019822608679533005, + "logits/rejected": 0.08296267688274384, + "logps/chosen": -319.4927062988281, + "logps/rejected": -283.9751892089844, + "loss": 0.3256, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002856344450265169, + "rewards/margins": 0.002463708631694317, + "rewards/rejected": 0.00039263576036319137, "step": 40 }, { - "epoch": 0.14, - "learning_rate": 7.04225352112676e-07, - "logits/chosen": 0.043348558247089386, - "logits/rejected": 0.06186369061470032, - "logps/chosen": -278.3343200683594, - "logps/rejected": -209.2032012939453, - "loss": 0.6625, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.03386519104242325, - "rewards/margins": 0.07294665277004242, - "rewards/rejected": -0.10681183636188507, + "epoch": 0.06, + "learning_rate": 2.9940119760479036e-07, + "logits/chosen": 0.08549543470144272, + "logits/rejected": 0.17756062746047974, + "logps/chosen": -323.5517272949219, + "logps/rejected": -260.97430419921875, + "loss": 0.334, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.009697018191218376, + "rewards/margins": 0.006611070595681667, + "rewards/rejected": 0.003085947362706065, "step": 50 }, { - "epoch": 0.17, - "learning_rate": 8.450704225352112e-07, - "logits/chosen": 0.10327181965112686, - "logits/rejected": 0.16977280378341675, - "logps/chosen": -364.6358947753906, - "logps/rejected": -203.13235473632812, - "loss": 0.6482, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.0220925435423851, - "rewards/margins": 0.20718447864055634, - "rewards/rejected": -0.22927701473236084, + "epoch": 0.07, + "learning_rate": 3.592814371257485e-07, + "logits/chosen": 0.06589554995298386, + "logits/rejected": 0.1306503564119339, + "logps/chosen": -298.95758056640625, + "logps/rejected": -270.60491943359375, + "loss": 0.3363, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.013030062429606915, + "rewards/margins": 0.015959080308675766, + "rewards/rejected": -0.002929018810391426, "step": 60 }, { - "epoch": 0.2, - "learning_rate": 9.859154929577465e-07, - "logits/chosen": 0.05509650707244873, - "logits/rejected": 0.08397762477397919, - "logps/chosen": -291.2886657714844, - "logps/rejected": -232.84286499023438, - "loss": 0.645, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.1501120775938034, - "rewards/margins": 0.1249283105134964, - "rewards/rejected": -0.2750404179096222, + "epoch": 0.08, + "learning_rate": 4.191616766467065e-07, + "logits/chosen": 0.12031495571136475, + "logits/rejected": 0.1805482804775238, + "logps/chosen": -324.9089050292969, + "logps/rejected": -258.1722106933594, + "loss": 0.35, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.01291816495358944, + "rewards/margins": 0.024499254301190376, + "rewards/rejected": -0.011581086553633213, "step": 70 }, { - "epoch": 0.22, - "learning_rate": 9.995106132599868e-07, - "logits/chosen": -0.11236193031072617, - "logits/rejected": -0.041074078530073166, - "logps/chosen": -377.6411437988281, - "logps/rejected": -217.063232421875, - "loss": 0.6538, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.00241122767329216, - "rewards/margins": 0.28361231088638306, - "rewards/rejected": -0.2860235273838043, + "epoch": 0.1, + "learning_rate": 4.790419161676647e-07, + "logits/chosen": 0.07129839807748795, + "logits/rejected": 0.06257729232311249, + "logps/chosen": -348.3440856933594, + "logps/rejected": -301.13201904296875, + "loss": 0.3534, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.020938226953148842, + "rewards/margins": 0.04414420947432518, + "rewards/rejected": -0.023205982521176338, "step": 80 }, { - "epoch": 0.25, - "learning_rate": 9.978201358980644e-07, - "logits/chosen": -0.04335172474384308, - "logits/rejected": -0.035887934267520905, - "logps/chosen": -270.10919189453125, - "logps/rejected": -228.088623046875, - "loss": 0.6478, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.03529727831482887, - "rewards/margins": 0.09024782478809357, - "rewards/rejected": -0.12554509937763214, + "epoch": 0.11, + "learning_rate": 5.389221556886228e-07, + "logits/chosen": 0.08929282426834106, + "logits/rejected": 0.17325028777122498, + "logps/chosen": -316.1700439453125, + "logps/rejected": -239.6669158935547, + "loss": 0.3401, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.029331078752875328, + "rewards/margins": 0.04176971688866615, + "rewards/rejected": -0.012438638135790825, "step": 90 }, { - "epoch": 0.28, - "learning_rate": 9.949266103908894e-07, - "logits/chosen": -0.07351367175579071, - "logits/rejected": -0.02050161361694336, - "logps/chosen": -325.0459899902344, - "logps/rejected": -215.1256103515625, - "loss": 0.6513, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.040706828236579895, - "rewards/margins": 0.17307811975479126, - "rewards/rejected": -0.13237129151821136, + "epoch": 0.12, + "learning_rate": 5.988023952095807e-07, + "logits/chosen": 0.08235646784305573, + "logits/rejected": 0.20089676976203918, + "logps/chosen": -346.5120849609375, + "logps/rejected": -244.7434844970703, + "loss": 0.3598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.024305399507284164, + "rewards/margins": 0.05784136801958084, + "rewards/rejected": -0.03353596478700638, "step": 100 }, { - "epoch": 0.31, - "learning_rate": 9.908370293252287e-07, - "logits/chosen": -0.13664308190345764, - "logits/rejected": -0.05858312174677849, - "logps/chosen": -328.02001953125, - "logps/rejected": -232.35107421875, - "loss": 0.6396, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.0018805682193487883, - "rewards/margins": 0.1877610981464386, - "rewards/rejected": -0.18964168429374695, + "epoch": 0.13, + "learning_rate": 6.586826347305389e-07, + "logits/chosen": 0.09147349745035172, + "logits/rejected": 0.14871755242347717, + "logps/chosen": -320.4288024902344, + "logps/rejected": -296.7561950683594, + "loss": 0.3659, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.02884674072265625, + "rewards/margins": 0.07738146930932999, + "rewards/rejected": -0.04853471741080284, "step": 110 }, { - "epoch": 0.34, - "learning_rate": 9.855612757141654e-07, - "logits/chosen": -0.12235263735055923, - "logits/rejected": -0.08722386509180069, - "logps/chosen": -303.6057434082031, - "logps/rejected": -228.93896484375, - "loss": 0.645, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.06580052524805069, - "rewards/margins": 0.1427757292985916, - "rewards/rejected": -0.2085762470960617, + "epoch": 0.14, + "learning_rate": 7.18562874251497e-07, + "logits/chosen": 0.11941643804311752, + "logits/rejected": 0.19315120577812195, + "logps/chosen": -307.4304504394531, + "logps/rejected": -267.32354736328125, + "loss": 0.3478, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.017546942457556725, + "rewards/margins": 0.04876334220170975, + "rewards/rejected": -0.031216394156217575, "step": 120 }, { - "epoch": 0.37, - "learning_rate": 9.791120991134902e-07, - "logits/chosen": -0.0988820344209671, - "logits/rejected": -0.03470475226640701, - "logps/chosen": -342.01495361328125, - "logps/rejected": -231.0585479736328, - "loss": 0.6373, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.02199043706059456, - "rewards/margins": 0.23313036561012268, - "rewards/rejected": -0.21113994717597961, + "epoch": 0.16, + "learning_rate": 7.784431137724551e-07, + "logits/chosen": 0.029125332832336426, + "logits/rejected": 0.13452430069446564, + "logps/chosen": -312.5350646972656, + "logps/rejected": -282.9906921386719, + "loss": 0.3476, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00010833759006345645, + "rewards/margins": 0.07449211925268173, + "rewards/rejected": -0.074600450694561, "step": 130 }, { - "epoch": 0.39, - "learning_rate": 9.715050848107168e-07, - "logits/chosen": -0.19416412711143494, - "logits/rejected": -0.09932310879230499, - "logps/chosen": -318.74029541015625, - "logps/rejected": -201.91317749023438, - "loss": 0.6316, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.029886353760957718, - "rewards/margins": 0.26950255036354065, - "rewards/rejected": -0.2993888854980469, + "epoch": 0.17, + "learning_rate": 8.38323353293413e-07, + "logits/chosen": 0.07874087244272232, + "logits/rejected": 0.1653667837381363, + "logps/chosen": -323.27349853515625, + "logps/rejected": -291.1534729003906, + "loss": 0.3508, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.02968321368098259, + "rewards/margins": 0.08127189427614212, + "rewards/rejected": -0.05158866569399834, "step": 140 }, { - "epoch": 0.42, - "learning_rate": 9.627586161611731e-07, - "logits/chosen": -0.13512900471687317, - "logits/rejected": -0.08841085433959961, - "logps/chosen": -317.9823913574219, - "logps/rejected": -250.9694366455078, - "loss": 0.6308, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.10612215101718903, - "rewards/margins": 0.17752480506896973, - "rewards/rejected": -0.28364697098731995, + "epoch": 0.18, + "learning_rate": 8.982035928143712e-07, + "logits/chosen": 0.08668439090251923, + "logits/rejected": 0.2019130289554596, + "logps/chosen": -331.7438049316406, + "logps/rejected": -282.2138671875, + "loss": 0.3408, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.01081451028585434, + "rewards/margins": 0.11802507936954498, + "rewards/rejected": -0.10721053928136826, "step": 150 }, { - "epoch": 0.45, - "learning_rate": 9.528938301621955e-07, - "logits/chosen": -0.17586150765419006, - "logits/rejected": -0.10572761297225952, - "logps/chosen": -319.16729736328125, - "logps/rejected": -229.08065795898438, - "loss": 0.6376, + "epoch": 0.19, + "learning_rate": 9.580838323353293e-07, + "logits/chosen": 0.17280864715576172, + "logits/rejected": 0.2055545598268509, + "logps/chosen": -274.8600769042969, + "logps/rejected": -270.06292724609375, + "loss": 0.3387, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.06883593648672104, - "rewards/margins": 0.24380502104759216, - "rewards/rejected": -0.3126409649848938, + "rewards/chosen": -0.0733034759759903, + "rewards/margins": 0.10196540504693985, + "rewards/rejected": -0.17526885867118835, "step": 160 }, { - "epoch": 0.48, - "learning_rate": 9.419345663727804e-07, - "logits/chosen": -0.21701748669147491, - "logits/rejected": -0.14746910333633423, - "logps/chosen": -377.66748046875, - "logps/rejected": -237.4300079345703, - "loss": 0.6355, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.0431918129324913, - "rewards/margins": 0.2865811288356781, - "rewards/rejected": -0.3297729790210724, + "epoch": 0.2, + "learning_rate": 9.999901172555115e-07, + "logits/chosen": 0.11693236976861954, + "logits/rejected": 0.22318188846111298, + "logps/chosen": -295.5730895996094, + "logps/rejected": -241.427978515625, + "loss": 0.3278, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.10353277623653412, + "rewards/margins": 0.08885890245437622, + "rewards/rejected": -0.19239167869091034, "step": 170 }, { - "epoch": 0.51, - "learning_rate": 9.299073093021404e-07, - "logits/chosen": -0.23954620957374573, - "logits/rejected": -0.17175036668777466, - "logps/chosen": -351.08447265625, - "logps/rejected": -240.8822021484375, - "loss": 0.636, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.13297706842422485, - "rewards/margins": 0.2579553723335266, - "rewards/rejected": -0.39093244075775146, + "epoch": 0.22, + "learning_rate": 9.998144348880984e-07, + "logits/chosen": 0.13397441804409027, + "logits/rejected": 0.17609907686710358, + "logps/chosen": -315.9989013671875, + "logps/rejected": -284.2611999511719, + "loss": 0.2989, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13113589584827423, + "rewards/margins": 0.20865917205810547, + "rewards/rejected": -0.3397950530052185, "step": 180 }, { - "epoch": 0.53, - "learning_rate": 9.168411244063861e-07, - "logits/chosen": -0.1812850534915924, - "logits/rejected": -0.14350393414497375, - "logps/chosen": -278.08203125, - "logps/rejected": -241.28976440429688, - "loss": 0.6317, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.23226889967918396, - "rewards/margins": 0.09439550340175629, - "rewards/rejected": -0.32666438817977905, + "epoch": 0.23, + "learning_rate": 9.994192247951515e-07, + "logits/chosen": 0.0798071026802063, + "logits/rejected": 0.19820377230644226, + "logps/chosen": -370.124755859375, + "logps/rejected": -300.3252258300781, + "loss": 0.2864, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1878439486026764, + "rewards/margins": 0.19493091106414795, + "rewards/rejected": -0.38277485966682434, "step": 190 }, { - "epoch": 0.56, - "learning_rate": 9.02767587848013e-07, - "logits/chosen": -0.17360162734985352, - "logits/rejected": -0.1402386575937271, - "logps/chosen": -288.13519287109375, - "logps/rejected": -250.974853515625, - "loss": 0.6343, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.1756954938173294, - "rewards/margins": 0.19079844653606415, - "rewards/rejected": -0.36649391055107117, + "epoch": 0.24, + "learning_rate": 9.988046605602389e-07, + "logits/chosen": 0.062232185155153275, + "logits/rejected": 0.10917310416698456, + "logps/chosen": -371.1097106933594, + "logps/rejected": -357.1819763183594, + "loss": 0.2606, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.22029490768909454, + "rewards/margins": 0.28260841965675354, + "rewards/rejected": -0.5029032230377197, "step": 200 }, { - "epoch": 0.59, - "learning_rate": 8.877207101879301e-07, - "logits/chosen": -0.2661494314670563, - "logits/rejected": -0.1918991506099701, - "logps/chosen": -313.1727294921875, - "logps/rejected": -244.7296905517578, - "loss": 0.6282, + "epoch": 0.25, + "learning_rate": 9.979710121113161e-07, + "logits/chosen": 0.034885063767433167, + "logits/rejected": 0.21861211955547333, + "logps/chosen": -342.1944885253906, + "logps/rejected": -289.22650146484375, + "loss": 0.2492, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.20293402671813965, - "rewards/margins": 0.18547190725803375, - "rewards/rejected": -0.3884059190750122, + "rewards/chosen": -0.2956531047821045, + "rewards/margins": 0.1267615407705307, + "rewards/rejected": -0.422414630651474, "step": 210 }, { - "epoch": 0.62, - "learning_rate": 8.717368541944452e-07, - "logits/chosen": -0.20634086430072784, - "logits/rejected": -0.16375333070755005, - "logps/chosen": -359.44525146484375, - "logps/rejected": -258.64605712890625, - "loss": 0.6301, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.20741090178489685, - "rewards/margins": 0.23171374201774597, - "rewards/rejected": -0.4391246736049652, + "epoch": 0.26, + "learning_rate": 9.969186456021698e-07, + "logits/chosen": 0.16665050387382507, + "logits/rejected": 0.2314036637544632, + "logps/chosen": -332.11944580078125, + "logps/rejected": -352.77984619140625, + "loss": 0.2453, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5235787630081177, + "rewards/margins": 0.11343353986740112, + "rewards/rejected": -0.637012243270874, "step": 220 }, { - "epoch": 0.65, - "learning_rate": 8.54854646967831e-07, - "logits/chosen": -0.17849591374397278, - "logits/rejected": -0.12133362144231796, - "logps/chosen": -307.7841796875, - "logps/rejected": -271.978759765625, - "loss": 0.6423, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.20799310505390167, - "rewards/margins": 0.2731212377548218, - "rewards/rejected": -0.48111432790756226, + "epoch": 0.28, + "learning_rate": 9.956480232515958e-07, + "logits/chosen": 0.08058114349842072, + "logits/rejected": 0.1627165824174881, + "logps/chosen": -358.8059997558594, + "logps/rejected": -323.8888854980469, + "loss": 0.2318, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4380015432834625, + "rewards/margins": 0.17821909487247467, + "rewards/rejected": -0.6162205934524536, "step": 230 }, { - "epoch": 0.67, - "learning_rate": 8.371148865928318e-07, - "logits/chosen": -0.17793336510658264, - "logits/rejected": -0.17101266980171204, - "logps/chosen": -347.5501403808594, - "logps/rejected": -269.15826416015625, - "loss": 0.6228, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.22561614215373993, - "rewards/margins": 0.22716352343559265, - "rewards/rejected": -0.452779620885849, + "epoch": 0.29, + "learning_rate": 9.941597031403838e-07, + "logits/chosen": 0.006361488252878189, + "logits/rejected": 0.14310383796691895, + "logps/chosen": -389.48565673828125, + "logps/rejected": -322.10211181640625, + "loss": 0.2383, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4093681275844574, + "rewards/margins": 0.3130263388156891, + "rewards/rejected": -0.7223945260047913, "step": 240 }, { - "epoch": 0.7, - "learning_rate": 8.185604435447001e-07, - "logits/chosen": -0.2426186501979828, - "logits/rejected": -0.1688891351222992, - "logps/chosen": -385.1275329589844, - "logps/rejected": -280.0343017578125, - "loss": 0.6233, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.2962805926799774, - "rewards/margins": 0.21590442955493927, - "rewards/rejected": -0.5121850371360779, + "epoch": 0.3, + "learning_rate": 9.924543389661986e-07, + "logits/chosen": -0.02250669337809086, + "logits/rejected": 0.07530005276203156, + "logps/chosen": -340.83258056640625, + "logps/rejected": -296.439208984375, + "loss": 0.2014, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5589783787727356, + "rewards/margins": 0.13796114921569824, + "rewards/rejected": -0.6969395279884338, "step": 250 }, { - "epoch": 0.73, - "learning_rate": 7.992361570870287e-07, - "logits/chosen": -0.18071404099464417, - "logits/rejected": -0.1167706698179245, - "logps/chosen": -386.547607421875, - "logps/rejected": -306.0770568847656, - "loss": 0.6234, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.3630081117153168, - "rewards/margins": 0.305530309677124, - "rewards/rejected": -0.6685384511947632, + "epoch": 0.31, + "learning_rate": 9.905326797564637e-07, + "logits/chosen": 0.0301060788333416, + "logits/rejected": 0.040543533861637115, + "logps/chosen": -358.6849365234375, + "logps/rejected": -356.5751037597656, + "loss": 0.2045, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4870468080043793, + "rewards/margins": 0.35052934288978577, + "rewards/rejected": -0.8375760912895203, "step": 260 }, { - "epoch": 0.76, - "learning_rate": 7.791887269117441e-07, - "logits/chosen": -0.13100236654281616, - "logits/rejected": -0.08122432976961136, - "logps/chosen": -360.1916198730469, - "logps/rejected": -292.900390625, - "loss": 0.6281, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4012362062931061, - "rewards/margins": 0.26358842849731445, - "rewards/rejected": -0.6648246645927429, + "epoch": 0.32, + "learning_rate": 9.883955695393743e-07, + "logits/chosen": 0.000649239111226052, + "logits/rejected": 0.022925155237317085, + "logps/chosen": -414.7745666503906, + "logps/rejected": -382.8134765625, + "loss": 0.1942, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5404749512672424, + "rewards/margins": 0.33123961091041565, + "rewards/rejected": -0.8717145919799805, "step": 270 }, { - "epoch": 0.79, - "learning_rate": 7.584666002831294e-07, - "logits/chosen": -0.20774011313915253, - "logits/rejected": -0.16246415674686432, - "logps/chosen": -364.76934814453125, - "logps/rejected": -267.95953369140625, - "loss": 0.6164, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.3360297977924347, - "rewards/margins": 0.29080110788345337, - "rewards/rejected": -0.6268308162689209, + "epoch": 0.34, + "learning_rate": 9.860439469731857e-07, + "logits/chosen": -0.09281344711780548, + "logits/rejected": -0.03658398985862732, + "logps/chosen": -405.03375244140625, + "logps/rejected": -366.6494140625, + "loss": 0.185, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6410386562347412, + "rewards/margins": 0.3488469123840332, + "rewards/rejected": -0.9898855090141296, "step": 280 }, { - "epoch": 0.81, - "learning_rate": 7.37119854958609e-07, - "logits/chosen": -0.22421498596668243, - "logits/rejected": -0.15666626393795013, - "logps/chosen": -363.0233459472656, - "logps/rejected": -278.25421142578125, - "loss": 0.6114, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.29455628991127014, - "rewards/margins": 0.39645588397979736, - "rewards/rejected": -0.6910120844841003, + "epoch": 0.35, + "learning_rate": 9.834788449339357e-07, + "logits/chosen": -0.08095192164182663, + "logits/rejected": 0.01947859302163124, + "logps/chosen": -394.8659362792969, + "logps/rejected": -363.68896484375, + "loss": 0.1967, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6716563701629639, + "rewards/margins": 0.35823893547058105, + "rewards/rejected": -1.029895305633545, "step": 290 }, { - "epoch": 0.84, - "learning_rate": 7.152000781692285e-07, - "logits/chosen": -0.2969985902309418, - "logits/rejected": -0.19818410277366638, - "logps/chosen": -392.032470703125, - "logps/rejected": -303.8256530761719, - "loss": 0.6055, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.46714216470718384, - "rewards/margins": 0.38615185022354126, - "rewards/rejected": -0.8532940149307251, + "epoch": 0.36, + "learning_rate": 9.807013900617874e-07, + "logits/chosen": -0.0883181020617485, + "logits/rejected": -0.04594338685274124, + "logps/chosen": -368.65673828125, + "logps/rejected": -391.3207092285156, + "loss": 0.172, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6309670805931091, + "rewards/margins": 0.35369187593460083, + "rewards/rejected": -0.9846588969230652, "step": 300 }, { - "epoch": 0.87, - "learning_rate": 6.927602419522946e-07, - "logits/chosen": -0.32732582092285156, - "logits/rejected": -0.2507998049259186, - "logps/chosen": -403.8778381347656, - "logps/rejected": -287.19976806640625, - "loss": 0.6042, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5833036303520203, - "rewards/margins": 0.34751299023628235, - "rewards/rejected": -0.9308164715766907, + "epoch": 0.37, + "learning_rate": 9.777128022661876e-07, + "logits/chosen": -0.09762546420097351, + "logits/rejected": -0.0024875595699995756, + "logps/chosen": -399.7887878417969, + "logps/rejected": -358.4689025878906, + "loss": 0.1668, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8196405172348022, + "rewards/margins": 0.29697737097740173, + "rewards/rejected": -1.1166179180145264, "step": 310 }, { - "epoch": 0.9, - "learning_rate": 6.698545751374463e-07, - "logits/chosen": -0.30765649676322937, - "logits/rejected": -0.22963877022266388, - "logps/chosen": -404.23016357421875, - "logps/rejected": -302.5799560546875, - "loss": 0.6112, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6336126923561096, - "rewards/margins": 0.31230220198631287, - "rewards/rejected": -0.9459150433540344, + "epoch": 0.38, + "learning_rate": 9.745143941900607e-07, + "logits/chosen": 0.008178139105439186, + "logits/rejected": 0.06296094506978989, + "logps/chosen": -318.7600402832031, + "logps/rejected": -331.10064697265625, + "loss": 0.1882, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7780025005340576, + "rewards/margins": 0.26487836241722107, + "rewards/rejected": -1.0428807735443115, "step": 320 }, { - "epoch": 0.93, - "learning_rate": 6.465384322955224e-07, - "logits/chosen": -0.23409101366996765, - "logits/rejected": -0.18529504537582397, - "logps/chosen": -368.0924072265625, - "logps/rejected": -304.32781982421875, - "loss": 0.6115, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5948143601417542, - "rewards/margins": 0.30781102180480957, - "rewards/rejected": -0.9026253819465637, + "epoch": 0.4, + "learning_rate": 9.711075706332709e-07, + "logits/chosen": 0.013791674748063087, + "logits/rejected": 0.0160694383084774, + "logps/chosen": -388.42034912109375, + "logps/rejected": -389.6010437011719, + "loss": 0.2253, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7162211537361145, + "rewards/margins": 0.36372461915016174, + "rewards/rejected": -1.0799458026885986, "step": 330 }, { - "epoch": 0.96, - "learning_rate": 6.228681599669248e-07, - "logits/chosen": -0.2893804907798767, - "logits/rejected": -0.20403006672859192, - "logps/chosen": -332.33050537109375, - "logps/rejected": -249.17416381835938, - "loss": 0.6212, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.47543925046920776, - "rewards/margins": 0.2542230486869812, - "rewards/rejected": -0.729662299156189, + "epoch": 0.41, + "learning_rate": 9.674938279356085e-07, + "logits/chosen": -0.03776586428284645, + "logits/rejected": 0.06259050965309143, + "logps/chosen": -374.6064453125, + "logps/rejected": -347.88470458984375, + "loss": 0.205, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.534817099571228, + "rewards/margins": 0.38740724325180054, + "rewards/rejected": -0.9222243428230286, "step": 340 }, { - "epoch": 0.98, - "learning_rate": 5.989009604927586e-07, - "logits/chosen": -0.28099551796913147, - "logits/rejected": -0.2452818602323532, - "logps/chosen": -360.80859375, - "logps/rejected": -303.64117431640625, - "loss": 0.6229, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.4838181138038635, - "rewards/margins": 0.25123196840286255, - "rewards/rejected": -0.7350500822067261, + "epoch": 0.42, + "learning_rate": 9.636747533195696e-07, + "logits/chosen": -0.01885460875928402, + "logits/rejected": 0.02142554149031639, + "logps/chosen": -394.37554931640625, + "logps/rejected": -375.2349853515625, + "loss": 0.1826, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6455094218254089, + "rewards/margins": 0.3467075526714325, + "rewards/rejected": -0.992216944694519, "step": 350 }, { - "epoch": 1.01, - "learning_rate": 5.74694753777815e-07, - "logits/chosen": -0.2772447466850281, - "logits/rejected": -0.22402259707450867, - "logps/chosen": -353.03729248046875, - "logps/rejected": -280.85357666015625, - "loss": 0.5672, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.36207324266433716, - "rewards/margins": 0.49078360199928284, - "rewards/rejected": -0.8528569340705872, + "epoch": 0.43, + "learning_rate": 9.596520241932198e-07, + "logits/chosen": -0.017878394573926926, + "logits/rejected": 0.017517492175102234, + "logps/chosen": -352.70306396484375, + "logps/rejected": -338.069091796875, + "loss": 0.19, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.691541314125061, + "rewards/margins": 0.2788214087486267, + "rewards/rejected": -0.970362663269043, "step": 360 }, { - "epoch": 1.04, - "learning_rate": 5.503080373194666e-07, - "logits/chosen": -0.25274351239204407, - "logits/rejected": -0.20375871658325195, - "logps/chosen": -331.82098388671875, - "logps/rejected": -328.4645690917969, - "loss": 0.48, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.42001858353614807, - "rewards/margins": 0.6619771122932434, - "rewards/rejected": -1.0819956064224243, + "epoch": 0.44, + "learning_rate": 9.554274074134438e-07, + "logits/chosen": 0.021682120859622955, + "logits/rejected": -0.029350418597459793, + "logps/chosen": -371.61688232421875, + "logps/rejected": -414.383056640625, + "loss": 0.1841, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7872886657714844, + "rewards/margins": 0.4845043122768402, + "rewards/rejected": -1.271793007850647, "step": 370 }, { - "epoch": 1.07, - "learning_rate": 5.257997448407366e-07, - "logits/chosen": -0.22535817325115204, - "logits/rejected": -0.19036546349525452, - "logps/chosen": -421.0411071777344, - "logps/rejected": -368.4337158203125, - "loss": 0.4831, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.645516037940979, - "rewards/margins": 0.8011385202407837, - "rewards/rejected": -1.4466546773910522, + "epoch": 0.46, + "learning_rate": 9.510027585099106e-07, + "logits/chosen": -0.12592732906341553, + "logits/rejected": -0.046247534453868866, + "logps/chosen": -406.028564453125, + "logps/rejected": -401.7203674316406, + "loss": 0.163, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9996460676193237, + "rewards/margins": 0.5138468742370605, + "rewards/rejected": -1.5134929418563843, "step": 380 }, { - "epoch": 1.1, - "learning_rate": 5.012291038691665e-07, - "logits/chosen": -0.22669723629951477, - "logits/rejected": -0.14135774970054626, - "logps/chosen": -512.1949462890625, - "logps/rejected": -392.8013916015625, - "loss": 0.4652, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.9111388921737671, - "rewards/margins": 0.8151243925094604, - "rewards/rejected": -1.726263403892517, + "epoch": 0.47, + "learning_rate": 9.463800208700903e-07, + "logits/chosen": -0.08930721133947372, + "logits/rejected": -0.05070197582244873, + "logps/chosen": -451.42279052734375, + "logps/rejected": -448.52239990234375, + "loss": 0.1745, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8742043375968933, + "rewards/margins": 0.6940380334854126, + "rewards/rejected": -1.5682423114776611, "step": 390 }, { - "epoch": 1.12, - "learning_rate": 4.7665549260567063e-07, - "logits/chosen": -0.2706853747367859, - "logits/rejected": -0.16077610850334167, - "logps/chosen": -381.7259826660156, - "logps/rejected": -356.19769287109375, - "loss": 0.4675, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.8826516270637512, - "rewards/margins": 0.6549713611602783, - "rewards/rejected": -1.5376229286193848, + "epoch": 0.48, + "learning_rate": 9.415612248856824e-07, + "logits/chosen": -0.05381837487220764, + "logits/rejected": 0.003485634922981262, + "logps/chosen": -368.0176086425781, + "logps/rejected": -357.1371765136719, + "loss": 0.1836, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7229073643684387, + "rewards/margins": 0.3449392020702362, + "rewards/rejected": -1.067846655845642, "step": 400 }, { - "epoch": 1.15, - "learning_rate": 4.521382964292663e-07, - "logits/chosen": -0.19228239357471466, - "logits/rejected": -0.15899647772312164, - "logps/chosen": -374.1214599609375, - "logps/rejected": -365.5899658203125, - "loss": 0.462, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.8202232122421265, - "rewards/margins": 0.694760262966156, - "rewards/rejected": -1.5149835348129272, + "epoch": 0.49, + "learning_rate": 9.365484870608296e-07, + "logits/chosen": 0.029063940048217773, + "logits/rejected": 0.097762331366539, + "logps/chosen": -411.65362548828125, + "logps/rejected": -373.55780029296875, + "loss": 0.1899, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8145805597305298, + "rewards/margins": 0.2862567603588104, + "rewards/rejected": -1.100837230682373, "step": 410 }, { - "epoch": 1.18, - "learning_rate": 4.277367643844574e-07, - "logits/chosen": -0.2522595226764679, - "logits/rejected": -0.15804943442344666, - "logps/chosen": -440.37957763671875, - "logps/rejected": -419.84808349609375, - "loss": 0.4635, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.0633289813995361, - "rewards/margins": 0.8434053659439087, - "rewards/rejected": -1.9067341089248657, + "epoch": 0.5, + "learning_rate": 9.313440090825118e-07, + "logits/chosen": -0.05664276331663132, + "logits/rejected": -0.03467511385679245, + "logps/chosen": -398.04229736328125, + "logps/rejected": -385.92242431640625, + "loss": 0.1805, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8589332699775696, + "rewards/margins": 0.4059682786464691, + "rewards/rejected": -1.2649013996124268, "step": 420 }, { - "epoch": 1.21, - "learning_rate": 4.035098659980891e-07, - "logits/chosen": -0.097696952521801, - "logits/rejected": -0.047377489507198334, - "logps/chosen": -397.20550537109375, - "logps/rejected": -343.21868896484375, - "loss": 0.4612, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.9886892437934875, - "rewards/margins": 0.7855029106140137, - "rewards/rejected": -1.7741918563842773, + "epoch": 0.52, + "learning_rate": 9.259500768535226e-07, + "logits/chosen": -0.08377309143543243, + "logits/rejected": -0.07356628775596619, + "logps/chosen": -447.7481994628906, + "logps/rejected": -414.64691162109375, + "loss": 0.1551, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9274501800537109, + "rewards/margins": 0.39995989203453064, + "rewards/rejected": -1.3274099826812744, "step": 430 }, { - "epoch": 1.24, - "learning_rate": 3.795161487716928e-07, - "logits/chosen": -0.19944395124912262, - "logits/rejected": -0.0935400128364563, - "logps/chosen": -585.8798828125, - "logps/rejected": -449.3470764160156, - "loss": 0.4359, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.9106278419494629, - "rewards/margins": 1.0275523662567139, - "rewards/rejected": -1.9381802082061768, + "epoch": 0.53, + "learning_rate": 9.203690594884599e-07, + "logits/chosen": -0.12356811761856079, + "logits/rejected": -0.022806715220212936, + "logps/chosen": -456.0494689941406, + "logps/rejected": -365.2657165527344, + "loss": 0.1655, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7116681337356567, + "rewards/margins": 0.41433924436569214, + "rewards/rejected": -1.126007318496704, "step": 440 }, { - "epoch": 1.26, - "learning_rate": 3.5581359669371223e-07, - "logits/chosen": -0.1967972069978714, - "logits/rejected": -0.028771549463272095, - "logps/chosen": -476.41619873046875, - "logps/rejected": -432.7919921875, - "loss": 0.4449, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2173726558685303, - "rewards/margins": 0.7459080219268799, - "rewards/rejected": -1.963280439376831, + "epoch": 0.54, + "learning_rate": 9.146034082731666e-07, + "logits/chosen": -0.09179284423589706, + "logits/rejected": -0.0003077193978242576, + "logps/chosen": -437.27130126953125, + "logps/rejected": -412.7220764160156, + "loss": 0.1777, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8441268801689148, + "rewards/margins": 0.415422260761261, + "rewards/rejected": -1.2595491409301758, "step": 450 }, { - "epoch": 1.29, - "learning_rate": 3.324594901135326e-07, - "logits/chosen": -0.11798839271068573, - "logits/rejected": 0.010074866935610771, - "logps/chosen": -431.80615234375, - "logps/rejected": -412.49951171875, - "loss": 0.4346, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.3423187732696533, - "rewards/margins": 0.7483752965927124, - "rewards/rejected": -2.090693950653076, + "epoch": 0.55, + "learning_rate": 9.086556555880808e-07, + "logits/chosen": -0.06583790481090546, + "logits/rejected": -0.04182542487978935, + "logps/chosen": -418.4779357910156, + "logps/rejected": -353.83837890625, + "loss": 0.1726, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9192927479743958, + "rewards/margins": 0.37663576006889343, + "rewards/rejected": -1.295928716659546, "step": 460 }, { - "epoch": 1.32, - "learning_rate": 3.095102673159463e-07, - "logits/chosen": -0.016011739149689674, - "logits/rejected": 0.017299681901931763, - "logps/chosen": -446.6827087402344, - "logps/rejected": -445.84368896484375, - "loss": 0.4499, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.256035566329956, - "rewards/margins": 0.909619927406311, - "rewards/rejected": -2.1656556129455566, + "epoch": 0.56, + "learning_rate": 9.025284137959672e-07, + "logits/chosen": -0.06473040580749512, + "logits/rejected": -0.0440254732966423, + "logps/chosen": -362.92718505859375, + "logps/rejected": -377.1036071777344, + "loss": 0.1812, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7919570803642273, + "rewards/margins": 0.5987037420272827, + "rewards/rejected": -1.3906608819961548, "step": 470 }, { - "epoch": 1.35, - "learning_rate": 2.870213881305802e-07, - "logits/chosen": -0.06118525192141533, - "logits/rejected": 0.05159863829612732, - "logps/chosen": -449.0279846191406, - "logps/rejected": -437.6075744628906, - "loss": 0.4568, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.2242887020111084, - "rewards/margins": 0.7840197086334229, - "rewards/rejected": -2.0083084106445312, + "epoch": 0.58, + "learning_rate": 8.962243740945193e-07, + "logits/chosen": -0.16925401985645294, + "logits/rejected": -0.08770541846752167, + "logps/chosen": -388.9159240722656, + "logps/rejected": -359.47113037109375, + "loss": 0.1672, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6586478352546692, + "rewards/margins": 0.4439294934272766, + "rewards/rejected": -1.1025774478912354, "step": 480 }, { - "epoch": 1.38, - "learning_rate": 2.6504719990588745e-07, - "logits/chosen": 0.07276159524917603, - "logits/rejected": 0.2596586346626282, - "logps/chosen": -396.858154296875, - "logps/rejected": -399.5719299316406, - "loss": 0.4387, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.093074083328247, - "rewards/margins": 0.9474472999572754, - "rewards/rejected": -2.0405211448669434, + "epoch": 0.59, + "learning_rate": 8.897463053343362e-07, + "logits/chosen": -0.15098915994167328, + "logits/rejected": -0.09551471471786499, + "logps/chosen": -375.2362976074219, + "logps/rejected": -377.40374755859375, + "loss": 0.1654, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7528899312019348, + "rewards/margins": 0.4329261779785156, + "rewards/rejected": -1.1858160495758057, "step": 490 }, { - "epoch": 1.4, - "learning_rate": 2.436408061715988e-07, - "logits/chosen": 0.011857263743877411, - "logits/rejected": 0.129285991191864, - "logps/chosen": -420.177001953125, - "logps/rejected": -385.1749572753906, - "loss": 0.4543, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.1765092611312866, - "rewards/margins": 0.8424569368362427, - "rewards/rejected": -2.0189664363861084, + "epoch": 0.6, + "learning_rate": 8.83097052802791e-07, + "logits/chosen": -0.19105754792690277, + "logits/rejected": -0.20415177941322327, + "logps/chosen": -438.22900390625, + "logps/rejected": -413.8765563964844, + "loss": 0.1546, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1042355298995972, + "rewards/margins": 0.324155330657959, + "rewards/rejected": -1.4283908605575562, "step": 500 }, { - "epoch": 1.43, - "learning_rate": 2.22853938307025e-07, - "logits/chosen": 0.07151642441749573, - "logits/rejected": 0.17956461012363434, - "logps/chosen": -484.64996337890625, - "logps/rejected": -458.7752990722656, - "loss": 0.4237, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.254974603652954, - "rewards/margins": 0.9956918954849243, - "rewards/rejected": -2.2506661415100098, + "epoch": 0.61, + "learning_rate": 8.762795369743302e-07, + "logits/chosen": -0.1732911914587021, + "logits/rejected": -0.14861652255058289, + "logps/chosen": -430.3330078125, + "logps/rejected": -412.541748046875, + "loss": 0.1798, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.980789303779602, + "rewards/margins": 0.3685452342033386, + "rewards/rejected": -1.349334478378296, "step": 510 }, { - "epoch": 1.46, - "learning_rate": 2.0273683052534173e-07, - "logits/chosen": 0.05397455021739006, - "logits/rejected": 0.17397473752498627, - "logps/chosen": -462.7275390625, - "logps/rejected": -428.75018310546875, - "loss": 0.4436, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.3979852199554443, - "rewards/margins": 0.9231426119804382, - "rewards/rejected": -2.3211278915405273, + "epoch": 0.62, + "learning_rate": 8.692967522277452e-07, + "logits/chosen": -0.2620302140712738, + "logits/rejected": -0.23215556144714355, + "logps/chosen": -397.580810546875, + "logps/rejected": -401.218017578125, + "loss": 0.1675, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.922555148601532, + "rewards/margins": 0.5658455491065979, + "rewards/rejected": -1.4884006977081299, "step": 520 }, { - "epoch": 1.49, - "learning_rate": 1.833380984759764e-07, - "logits/chosen": 0.0829939991235733, - "logits/rejected": 0.09590896219015121, - "logps/chosen": -454.99481201171875, - "logps/rejected": -425.2177734375, - "loss": 0.4429, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -1.2986598014831543, - "rewards/margins": 0.9561601877212524, - "rewards/rejected": -2.2548201084136963, + "epoch": 0.64, + "learning_rate": 8.621517655309871e-07, + "logits/chosen": -0.22505703568458557, + "logits/rejected": -0.21353265643119812, + "logps/chosen": -351.1321105957031, + "logps/rejected": -402.13812255859375, + "loss": 0.1616, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9518992304801941, + "rewards/margins": 0.576888918876648, + "rewards/rejected": -1.5287882089614868, "step": 530 }, { - "epoch": 1.52, - "learning_rate": 1.6470462175846606e-07, - "logits/chosen": 0.1332448422908783, - "logits/rejected": 0.2453324794769287, - "logps/chosen": -438.85711669921875, - "logps/rejected": -441.5235290527344, - "loss": 0.4318, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.2478262186050415, - "rewards/margins": 0.9397850036621094, - "rewards/rejected": -2.1876111030578613, + "epoch": 0.65, + "learning_rate": 8.548477150940976e-07, + "logits/chosen": -0.2001039981842041, + "logits/rejected": -0.11327888816595078, + "logps/chosen": -493.7047424316406, + "logps/rejected": -436.589111328125, + "loss": 0.1636, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9714105725288391, + "rewards/margins": 0.33609670400619507, + "rewards/rejected": -1.3075072765350342, "step": 540 }, { - "epoch": 1.55, - "learning_rate": 1.468814306317092e-07, - "logits/chosen": 0.020557861775159836, - "logits/rejected": 0.10769232362508774, - "logps/chosen": -482.09423828125, - "logps/rejected": -429.72943115234375, - "loss": 0.4359, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.2857545614242554, - "rewards/margins": 0.9815255403518677, - "rewards/rejected": -2.267280340194702, + "epoch": 0.66, + "learning_rate": 8.473878089908488e-07, + "logits/chosen": -0.21582035720348358, + "logits/rejected": -0.1645367294549942, + "logps/chosen": -398.8905029296875, + "logps/rejected": -387.6484680175781, + "loss": 0.1616, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.990231990814209, + "rewards/margins": 0.44936639070510864, + "rewards/rejected": -1.4395983219146729, "step": 550 }, { - "epoch": 1.57, - "learning_rate": 1.299115971923958e-07, - "logits/chosen": 0.061371125280857086, - "logits/rejected": 0.17282184958457947, - "logps/chosen": -497.0215759277344, - "logps/rejected": -415.6434020996094, - "loss": 0.4316, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.3921228647232056, - "rewards/margins": 0.8907474279403687, - "rewards/rejected": -2.282870054244995, + "epoch": 0.67, + "learning_rate": 8.397753237496989e-07, + "logits/chosen": -0.14817172288894653, + "logits/rejected": -0.1120673194527626, + "logps/chosen": -411.04547119140625, + "logps/rejected": -383.22772216796875, + "loss": 0.1901, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7595073580741882, + "rewards/margins": 0.40615516901016235, + "rewards/rejected": -1.1656625270843506, "step": 560 }, { - "epoch": 1.6, - "learning_rate": 1.1383613128559305e-07, - "logits/chosen": 0.024585505947470665, - "logits/rejected": 0.15401263535022736, - "logps/chosen": -460.99432373046875, - "logps/rejected": -446.9695739746094, - "loss": 0.4234, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.4750173091888428, - "rewards/margins": 0.9499062299728394, - "rewards/rejected": -2.4249236583709717, + "epoch": 0.68, + "learning_rate": 8.320136029146792e-07, + "logits/chosen": -0.16047896444797516, + "logits/rejected": -0.07018055766820908, + "logps/chosen": -400.98187255859375, + "logps/rejected": -362.17718505859375, + "loss": 0.2001, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6892865896224976, + "rewards/margins": 0.28408753871917725, + "rewards/rejected": -0.97337406873703, "step": 570 }, { - "epoch": 1.63, - "learning_rate": 9.869388139903495e-08, - "logits/chosen": 0.036046337336301804, - "logits/rejected": 0.06279195845127106, - "logps/chosen": -442.05023193359375, - "logps/rejected": -483.8797912597656, - "loss": 0.4126, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5214459896087646, - "rewards/margins": 0.8415502309799194, - "rewards/rejected": -2.3629965782165527, + "epoch": 0.7, + "learning_rate": 8.241060555768485e-07, + "logits/chosen": -0.0704733356833458, + "logits/rejected": -0.06501320749521255, + "logps/chosen": -396.9364318847656, + "logps/rejected": -431.4285583496094, + "loss": 0.1785, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8727092742919922, + "rewards/margins": 0.596550464630127, + "rewards/rejected": -1.4692598581314087, "step": 580 }, { - "epoch": 1.66, - "learning_rate": 8.452144078061818e-08, - "logits/chosen": 0.0481281653046608, - "logits/rejected": 0.19294533133506775, - "logps/chosen": -472.75726318359375, - "logps/rejected": -434.3541564941406, - "loss": 0.4253, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.4548476934432983, - "rewards/margins": 1.0240322351455688, - "rewards/rejected": -2.478879928588867, + "epoch": 0.71, + "learning_rate": 8.160561548769579e-07, + "logits/chosen": -0.28295397758483887, + "logits/rejected": -0.14965271949768066, + "logps/chosen": -452.00927734375, + "logps/rejected": -374.82147216796875, + "loss": 0.1728, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9421501159667969, + "rewards/margins": 0.42659133672714233, + "rewards/rejected": -1.3687413930892944, "step": 590 }, { - "epoch": 1.69, - "learning_rate": 7.135305900598321e-08, - "logits/chosen": 0.07246498018503189, - "logits/rejected": 0.16774284839630127, - "logps/chosen": -489.28515625, - "logps/rejected": -435.74127197265625, - "loss": 0.4446, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.41053307056427, - "rewards/margins": 0.9924067258834839, - "rewards/rejected": -2.402939558029175, + "epoch": 0.72, + "learning_rate": 8.078674364799822e-07, + "logits/chosen": -0.14142009615898132, + "logits/rejected": -0.04986618459224701, + "logps/chosen": -406.5389709472656, + "logps/rejected": -386.47332763671875, + "loss": 0.177, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.805712878704071, + "rewards/margins": 0.3872813880443573, + "rewards/rejected": -1.1929943561553955, "step": 600 }, { - "epoch": 1.71, - "learning_rate": 5.9220559209888166e-08, - "logits/chosen": 0.2750491201877594, - "logits/rejected": 0.2468949854373932, - "logps/chosen": -428.89385986328125, - "logps/rejected": -403.0452575683594, - "loss": 0.4279, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3840343952178955, - "rewards/margins": 0.910051703453064, - "rewards/rejected": -2.29408597946167, + "epoch": 0.73, + "learning_rate": 7.995434970221915e-07, + "logits/chosen": -0.17559921741485596, + "logits/rejected": -0.11661572754383087, + "logps/chosen": -403.9606018066406, + "logps/rejected": -384.85552978515625, + "loss": 0.1623, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0488908290863037, + "rewards/margins": 0.48383307456970215, + "rewards/rejected": -1.5327237844467163, "step": 610 }, { - "epoch": 1.74, - "learning_rate": 4.815326118139812e-08, - "logits/chosen": 0.11998578161001205, - "logits/rejected": 0.1687900722026825, - "logps/chosen": -465.30218505859375, - "logps/rejected": -447.5667419433594, - "loss": 0.4263, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4353718757629395, - "rewards/margins": 0.9383231997489929, - "rewards/rejected": -2.373694896697998, + "epoch": 0.74, + "learning_rate": 7.910879925314412e-07, + "logits/chosen": -0.17368339002132416, + "logits/rejected": -0.14526596665382385, + "logps/chosen": -452.844482421875, + "logps/rejected": -468.4239196777344, + "loss": 0.1568, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.206775426864624, + "rewards/margins": 0.4726572036743164, + "rewards/rejected": -1.6794328689575195, "step": 620 }, { - "epoch": 1.77, - "learning_rate": 3.81779105087407e-08, - "logits/chosen": 0.12116841971874237, - "logits/rejected": 0.22567251324653625, - "logps/chosen": -472.7357482910156, - "logps/rejected": -437.3785095214844, - "loss": 0.4132, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.4718587398529053, - "rewards/margins": 0.9858075380325317, - "rewards/rejected": -2.4576661586761475, + "epoch": 0.76, + "learning_rate": 7.825046368213781e-07, + "logits/chosen": -0.0924375057220459, + "logits/rejected": -0.050202567130327225, + "logps/chosen": -395.1549377441406, + "logps/rejected": -428.9422302246094, + "loss": 0.1588, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0802223682403564, + "rewards/margins": 0.5186144113540649, + "rewards/rejected": -1.5988366603851318, "step": 630 }, { - "epoch": 1.8, - "learning_rate": 2.9318613945057637e-08, - "logits/chosen": 0.029863903298974037, - "logits/rejected": 0.14516909420490265, - "logps/chosen": -460.98712158203125, - "logps/rejected": -444.34478759765625, - "loss": 0.4291, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4373255968093872, - "rewards/margins": 0.9584738612174988, - "rewards/rejected": -2.3957996368408203, + "epoch": 0.77, + "learning_rate": 7.737971998602646e-07, + "logits/chosen": -0.12860114872455597, + "logits/rejected": -0.08621262013912201, + "logps/chosen": -436.1148376464844, + "logps/rejected": -423.77197265625, + "loss": 0.1665, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0132167339324951, + "rewards/margins": 0.6689234972000122, + "rewards/rejected": -1.6821401119232178, "step": 640 }, { - "epoch": 1.83, - "learning_rate": 2.1596781151249523e-08, - "logits/chosen": -0.0014356456231325865, - "logits/rejected": 0.01894865743815899, - "logps/chosen": -513.64111328125, - "logps/rejected": -465.93597412109375, - "loss": 0.4275, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.495025396347046, - "rewards/margins": 0.9518612027168274, - "rewards/rejected": -2.4468865394592285, + "epoch": 0.78, + "learning_rate": 7.649695061151383e-07, + "logits/chosen": -0.10626170784235, + "logits/rejected": 0.04015485942363739, + "logps/chosen": -422.88482666015625, + "logps/rejected": -401.40399169921875, + "loss": 0.1549, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.095649003982544, + "rewards/margins": 0.6243374943733215, + "rewards/rejected": -1.7199863195419312, "step": 650 }, { - "epoch": 1.85, - "learning_rate": 1.5031072956701695e-08, - "logits/chosen": 0.13802851736545563, - "logits/rejected": 0.28396037220954895, - "logps/chosen": -462.52191162109375, - "logps/rejected": -448.866455078125, - "loss": 0.4263, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5100092887878418, - "rewards/margins": 0.8550790548324585, - "rewards/rejected": -2.3650882244110107, + "epoch": 0.79, + "learning_rate": 7.560254328720362e-07, + "logits/chosen": -0.06364638358354568, + "logits/rejected": -0.053037650883197784, + "logps/chosen": -399.8140563964844, + "logps/rejected": -377.3336181640625, + "loss": 0.1513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9602818489074707, + "rewards/margins": 0.49895793199539185, + "rewards/rejected": -1.4592397212982178, "step": 660 }, { - "epoch": 1.88, - "learning_rate": 9.637356262923723e-09, - "logits/chosen": -0.025952596217393875, - "logits/rejected": 0.005559571087360382, - "logps/chosen": -467.1776428222656, - "logps/rejected": -446.845703125, - "loss": 0.42, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.3850352764129639, - "rewards/margins": 1.1320542097091675, - "rewards/rejected": -2.517089366912842, + "epoch": 0.8, + "learning_rate": 7.469689085330195e-07, + "logits/chosen": -0.10155004262924194, + "logits/rejected": -0.053514860570430756, + "logps/chosen": -433.72393798828125, + "logps/rejected": -445.9632873535156, + "loss": 0.1623, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0109504461288452, + "rewards/margins": 0.7182562351226807, + "rewards/rejected": -1.7292066812515259, "step": 670 }, { - "epoch": 1.91, - "learning_rate": 5.428665699084789e-09, - "logits/chosen": 0.04917389899492264, - "logits/rejected": 0.2146989107131958, - "logps/chosen": -479.5437927246094, - "logps/rejected": -460.38885498046875, - "loss": 0.432, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4707105159759521, - "rewards/margins": 0.9773432612419128, - "rewards/rejected": -2.448054075241089, + "epoch": 0.82, + "learning_rate": 7.37803910890746e-07, + "logits/chosen": 0.04309063404798508, + "logits/rejected": 0.05086972564458847, + "logps/chosen": -364.6871643066406, + "logps/rejected": -444.87615966796875, + "loss": 0.1664, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9387286305427551, + "rewards/margins": 0.5190192461013794, + "rewards/rejected": -1.4577480554580688, "step": 680 }, { - "epoch": 1.94, - "learning_rate": 2.415172122110343e-09, - "logits/chosen": 0.10749175399541855, - "logits/rejected": 0.13517335057258606, - "logps/chosen": -468.06414794921875, - "logps/rejected": -501.6357421875, - "loss": 0.4395, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5244255065917969, - "rewards/margins": 0.9970847964286804, - "rewards/rejected": -2.521510124206543, + "epoch": 0.83, + "learning_rate": 7.285344653813504e-07, + "logits/chosen": 0.005590127781033516, + "logits/rejected": 0.05527013540267944, + "logps/chosen": -420.50421142578125, + "logps/rejected": -392.6619567871094, + "loss": 0.1652, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7855395078659058, + "rewards/margins": 0.5350450873374939, + "rewards/rejected": -1.3205845355987549, "step": 690 }, { - "epoch": 1.97, - "learning_rate": 6.041580374618327e-10, - "logits/chosen": 0.2404203712940216, - "logits/rejected": 0.3014678359031677, - "logps/chosen": -448.21099853515625, - "logps/rejected": -439.7056579589844, - "loss": 0.4211, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.484864354133606, - "rewards/margins": 0.9606615900993347, - "rewards/rejected": -2.445525884628296, + "epoch": 0.84, + "learning_rate": 7.19164643316399e-07, + "logits/chosen": -0.14258572459220886, + "logits/rejected": -0.07622213661670685, + "logps/chosen": -450.7547302246094, + "logps/rejected": -421.08740234375, + "loss": 0.1606, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8017450571060181, + "rewards/margins": 0.7106142044067383, + "rewards/rejected": -1.5123592615127563, "step": 700 }, { - "epoch": 2.0, - "learning_rate": 0.0, - "logits/chosen": 0.10392512381076813, - "logits/rejected": 0.3015091121196747, - "logps/chosen": -476.21160888671875, - "logps/rejected": -439.8465270996094, - "loss": 0.4323, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.363506555557251, - "rewards/margins": 0.9956139326095581, - "rewards/rejected": -2.3591203689575195, + "epoch": 0.85, + "learning_rate": 7.096985600946937e-07, + "logits/chosen": -0.07260491698980331, + "logits/rejected": -0.03210210055112839, + "logps/chosen": -395.57122802734375, + "logps/rejected": -388.82244873046875, + "loss": 0.1695, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0168393850326538, + "rewards/margins": 0.4624442160129547, + "rewards/rejected": -1.4792835712432861, "step": 710 }, + { + "epoch": 0.86, + "learning_rate": 7.001403733947133e-07, + "logits/chosen": -0.07182411104440689, + "logits/rejected": -0.03923854976892471, + "logps/chosen": -382.0772399902344, + "logps/rejected": -392.26947021484375, + "loss": 0.1581, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0459500551223755, + "rewards/margins": 0.514189600944519, + "rewards/rejected": -1.560139536857605, + "step": 720 + }, + { + "epoch": 0.88, + "learning_rate": 6.904942813484846e-07, + "logits/chosen": -0.011922065168619156, + "logits/rejected": -0.011497074738144875, + "logps/chosen": -387.7651672363281, + "logps/rejected": -401.37579345703125, + "loss": 0.1647, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9897109270095825, + "rewards/margins": 0.43441686034202576, + "rewards/rejected": -1.4241278171539307, + "step": 730 + }, + { + "epoch": 0.89, + "learning_rate": 6.807645206976847e-07, + "logits/chosen": -0.10563385486602783, + "logits/rejected": -0.026922887191176414, + "logps/chosen": -463.51904296875, + "logps/rejected": -396.99237060546875, + "loss": 0.1522, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9849216341972351, + "rewards/margins": 0.29621127247810364, + "rewards/rejected": -1.2811329364776611, + "step": 740 + }, + { + "epoch": 0.9, + "learning_rate": 6.709553649327864e-07, + "logits/chosen": -0.12097591161727905, + "logits/rejected": -0.08931994438171387, + "logps/chosen": -408.4173889160156, + "logps/rejected": -437.5682067871094, + "loss": 0.1462, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0879547595977783, + "rewards/margins": 0.5936441421508789, + "rewards/rejected": -1.6815989017486572, + "step": 750 + }, + { + "epoch": 0.91, + "learning_rate": 6.610711224160624e-07, + "logits/chosen": -0.04705732315778732, + "logits/rejected": -0.07320089638233185, + "logps/chosen": -412.73016357421875, + "logps/rejected": -459.07379150390625, + "loss": 0.1555, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0964686870574951, + "rewards/margins": 0.4577638506889343, + "rewards/rejected": -1.5542323589324951, + "step": 760 + }, + { + "epoch": 0.92, + "learning_rate": 6.51116134489272e-07, + "logits/chosen": -0.1511673927307129, + "logits/rejected": -0.09142941236495972, + "logps/chosen": -437.65899658203125, + "logps/rejected": -391.7604675292969, + "loss": 0.1633, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8741817474365234, + "rewards/margins": 0.5760532021522522, + "rewards/rejected": -1.4502347707748413, + "step": 770 + }, + { + "epoch": 0.94, + "learning_rate": 6.410947735668653e-07, + "logits/chosen": -0.0881911963224411, + "logits/rejected": -0.03626961261034012, + "logps/chosen": -463.324951171875, + "logps/rejected": -437.611328125, + "loss": 0.1497, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.101100206375122, + "rewards/margins": 0.6774808168411255, + "rewards/rejected": -1.778580904006958, + "step": 780 + }, + { + "epoch": 0.95, + "learning_rate": 6.310114412155368e-07, + "logits/chosen": -0.12566931545734406, + "logits/rejected": -0.05288320779800415, + "logps/chosen": -433.12139892578125, + "logps/rejected": -450.35491943359375, + "loss": 0.1387, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1941142082214355, + "rewards/margins": 0.65628582239151, + "rewards/rejected": -1.8503999710083008, + "step": 790 + }, + { + "epoch": 0.96, + "learning_rate": 6.208705662209762e-07, + "logits/chosen": -0.15951304137706757, + "logits/rejected": -0.08058343082666397, + "logps/chosen": -434.4078063964844, + "logps/rejected": -420.0638122558594, + "loss": 0.1476, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1902692317962646, + "rewards/margins": 0.41559162735939026, + "rewards/rejected": -1.605860948562622, + "step": 800 + }, + { + "epoch": 0.97, + "learning_rate": 6.106766026426648e-07, + "logits/chosen": -0.17886283993721008, + "logits/rejected": -0.07015601545572281, + "logps/chosen": -441.0155334472656, + "logps/rejected": -408.666748046875, + "loss": 0.1398, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0972322225570679, + "rewards/margins": 0.6500229239463806, + "rewards/rejected": -1.7472550868988037, + "step": 810 + }, + { + "epoch": 0.98, + "learning_rate": 6.004340278575695e-07, + "logits/chosen": -0.11742790788412094, + "logits/rejected": -0.08097358047962189, + "logps/chosen": -481.71258544921875, + "logps/rejected": -490.98876953125, + "loss": 0.13, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1779887676239014, + "rewards/margins": 0.5510334372520447, + "rewards/rejected": -1.7290220260620117, + "step": 820 + }, + { + "epoch": 1.0, + "learning_rate": 5.901473405935966e-07, + "logits/chosen": -0.07999231666326523, + "logits/rejected": -0.0074430713430047035, + "logps/chosen": -432.86981201171875, + "logps/rejected": -416.0740661621094, + "loss": 0.1415, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1524440050125122, + "rewards/margins": 0.4403459429740906, + "rewards/rejected": -1.592789888381958, + "step": 830 + }, + { + "epoch": 1.01, + "learning_rate": 5.798210589536672e-07, + "logits/chosen": -0.06455966830253601, + "logits/rejected": -0.09206490218639374, + "logps/chosen": -372.489501953125, + "logps/rejected": -415.30987548828125, + "loss": 0.1047, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.02300226688385, + "rewards/margins": 0.6669188737869263, + "rewards/rejected": -1.6899211406707764, + "step": 840 + }, + { + "epoch": 1.02, + "learning_rate": 5.694597184312832e-07, + "logits/chosen": -0.1633467972278595, + "logits/rejected": -0.11246392875909805, + "logps/chosen": -488.162353515625, + "logps/rejected": -509.6139221191406, + "loss": 0.0701, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3693019151687622, + "rewards/margins": 1.0748904943466187, + "rewards/rejected": -2.444192409515381, + "step": 850 + }, + { + "epoch": 1.03, + "learning_rate": 5.590678699184552e-07, + "logits/chosen": -0.16048532724380493, + "logits/rejected": -0.1280759871006012, + "logps/chosen": -512.06103515625, + "logps/rejected": -648.7377319335938, + "loss": 0.0457, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.176934242248535, + "rewards/margins": 1.2973973751068115, + "rewards/rejected": -3.4743316173553467, + "step": 860 + }, + { + "epoch": 1.04, + "learning_rate": 5.486500777068659e-07, + "logits/chosen": -0.09887855499982834, + "logits/rejected": -0.06003720685839653, + "logps/chosen": -567.089111328125, + "logps/rejected": -624.2326049804688, + "loss": 0.0408, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2174699306488037, + "rewards/margins": 1.4321911334991455, + "rewards/rejected": -3.6496613025665283, + "step": 870 + }, + { + "epoch": 1.06, + "learning_rate": 5.382109174831493e-07, + "logits/chosen": -0.08137073367834091, + "logits/rejected": -0.01235194131731987, + "logps/chosen": -535.8220825195312, + "logps/rejected": -577.6448974609375, + "loss": 0.0384, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.318791389465332, + "rewards/margins": 0.9558296203613281, + "rewards/rejected": -3.2746212482452393, + "step": 880 + }, + { + "epoch": 1.07, + "learning_rate": 5.277549743191652e-07, + "logits/chosen": -0.05193132162094116, + "logits/rejected": -0.004759219475090504, + "logps/chosen": -552.9955444335938, + "logps/rejected": -598.2587890625, + "loss": 0.0363, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1983516216278076, + "rewards/margins": 1.213283896446228, + "rewards/rejected": -3.411635637283325, + "step": 890 + }, + { + "epoch": 1.08, + "learning_rate": 5.172868406581501e-07, + "logits/chosen": -0.03779071569442749, + "logits/rejected": 0.056744299829006195, + "logps/chosen": -592.2105712890625, + "logps/rejected": -628.3460693359375, + "loss": 0.0302, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.635134220123291, + "rewards/margins": 1.1256375312805176, + "rewards/rejected": -3.760772228240967, + "step": 900 + }, + { + "epoch": 1.09, + "learning_rate": 5.068111142976319e-07, + "logits/chosen": -0.12420773506164551, + "logits/rejected": -0.029382145032286644, + "logps/chosen": -597.504638671875, + "logps/rejected": -654.6985473632812, + "loss": 0.0294, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.694098949432373, + "rewards/margins": 1.2260013818740845, + "rewards/rejected": -3.920100450515747, + "step": 910 + }, + { + "epoch": 1.1, + "learning_rate": 4.963323963699926e-07, + "logits/chosen": -0.023588549345731735, + "logits/rejected": 0.0034168108832091093, + "logps/chosen": -603.0902099609375, + "logps/rejected": -727.82861328125, + "loss": 0.0216, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0236687660217285, + "rewards/margins": 1.304715633392334, + "rewards/rejected": -4.3283843994140625, + "step": 920 + }, + { + "epoch": 1.12, + "learning_rate": 4.858552893215655e-07, + "logits/chosen": -0.007192631717771292, + "logits/rejected": 0.07475622743368149, + "logps/chosen": -670.5064697265625, + "logps/rejected": -776.722900390625, + "loss": 0.0176, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2482478618621826, + "rewards/margins": 1.7695789337158203, + "rewards/rejected": -5.017827033996582, + "step": 930 + }, + { + "epoch": 1.13, + "learning_rate": 4.753843948911556e-07, + "logits/chosen": 0.12182103097438812, + "logits/rejected": 0.03238976001739502, + "logps/chosen": -625.9619750976562, + "logps/rejected": -750.8270263671875, + "loss": 0.0191, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.034609317779541, + "rewards/margins": 1.7803478240966797, + "rewards/rejected": -4.814957141876221, + "step": 940 + }, + { + "epoch": 1.14, + "learning_rate": 4.649243120888722e-07, + "logits/chosen": 0.011549100279808044, + "logits/rejected": 0.03488563746213913, + "logps/chosen": -563.0631103515625, + "logps/rejected": -704.4915161132812, + "loss": 0.0259, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4957776069641113, + "rewards/margins": 1.694275140762329, + "rewards/rejected": -4.190052509307861, + "step": 950 + }, + { + "epoch": 1.15, + "learning_rate": 4.544796351761574e-07, + "logits/chosen": -0.055170875042676926, + "logits/rejected": 0.028366830199956894, + "logps/chosen": -607.5286865234375, + "logps/rejected": -650.486572265625, + "loss": 0.0332, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.490236282348633, + "rewards/margins": 1.102222204208374, + "rewards/rejected": -3.592458724975586, + "step": 960 + }, + { + "epoch": 1.16, + "learning_rate": 4.440549516479022e-07, + "logits/chosen": -0.015740562230348587, + "logits/rejected": 0.13129273056983948, + "logps/chosen": -615.1241455078125, + "logps/rejected": -619.5276489257812, + "loss": 0.0316, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.452130079269409, + "rewards/margins": 1.2859001159667969, + "rewards/rejected": -3.738029956817627, + "step": 970 + }, + { + "epoch": 1.18, + "learning_rate": 4.336548402175345e-07, + "logits/chosen": 0.025289198383688927, + "logits/rejected": 0.15218539535999298, + "logps/chosen": -558.3017578125, + "logps/rejected": -641.06494140625, + "loss": 0.0211, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.584613800048828, + "rewards/margins": 1.562873125076294, + "rewards/rejected": -4.147486686706543, + "step": 980 + }, + { + "epoch": 1.19, + "learning_rate": 4.232838688059627e-07, + "logits/chosen": 0.1965368539094925, + "logits/rejected": 0.2399691641330719, + "logps/chosen": -523.7330322265625, + "logps/rejected": -633.6004028320312, + "loss": 0.0231, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6117119789123535, + "rewards/margins": 1.3616979122161865, + "rewards/rejected": -3.973410129547119, + "step": 990 + }, + { + "epoch": 1.2, + "learning_rate": 4.129465925352618e-07, + "logits/chosen": 0.19317726790905, + "logits/rejected": 0.25055578351020813, + "logps/chosen": -577.004150390625, + "logps/rejected": -663.4541015625, + "loss": 0.0246, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.840785503387451, + "rewards/margins": 1.2757389545440674, + "rewards/rejected": -4.1165242195129395, + "step": 1000 + }, + { + "epoch": 1.21, + "learning_rate": 4.0264755172797837e-07, + "logits/chosen": 0.12826010584831238, + "logits/rejected": 0.30189377069473267, + "logps/chosen": -596.4738159179688, + "logps/rejected": -651.32666015625, + "loss": 0.0217, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.827807664871216, + "rewards/margins": 1.3351318836212158, + "rewards/rejected": -4.162939071655273, + "step": 1010 + }, + { + "epoch": 1.22, + "learning_rate": 3.9239126991293775e-07, + "logits/chosen": 0.059239018708467484, + "logits/rejected": 0.23136253654956818, + "logps/chosen": -616.50341796875, + "logps/rejected": -693.9109497070312, + "loss": 0.021, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.953068494796753, + "rewards/margins": 1.526259183883667, + "rewards/rejected": -4.479328155517578, + "step": 1020 + }, + { + "epoch": 1.24, + "learning_rate": 3.82182251838427e-07, + "logits/chosen": 0.25944507122039795, + "logits/rejected": 0.26485711336135864, + "logps/chosen": -576.2420043945312, + "logps/rejected": -677.7175903320312, + "loss": 0.025, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.817415237426758, + "rewards/margins": 1.3848437070846558, + "rewards/rejected": -4.202259063720703, + "step": 1030 + }, + { + "epoch": 1.25, + "learning_rate": 3.720249814936255e-07, + "logits/chosen": 0.2337515652179718, + "logits/rejected": 0.26738518476486206, + "logps/chosen": -571.0965576171875, + "logps/rejected": -680.7943725585938, + "loss": 0.0222, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.5748653411865234, + "rewards/margins": 1.5103648900985718, + "rewards/rejected": -4.085230350494385, + "step": 1040 + }, + { + "epoch": 1.26, + "learning_rate": 3.6192392013915473e-07, + "logits/chosen": 0.19163131713867188, + "logits/rejected": 0.31179821491241455, + "logps/chosen": -614.9716796875, + "logps/rejected": -675.076904296875, + "loss": 0.0192, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.0146121978759766, + "rewards/margins": 1.1077396869659424, + "rewards/rejected": -4.12235164642334, + "step": 1050 + }, + { + "epoch": 1.27, + "learning_rate": 3.5188350434761025e-07, + "logits/chosen": 0.28871843218803406, + "logits/rejected": 0.36171257495880127, + "logps/chosen": -610.6722412109375, + "logps/rejected": -736.9325561523438, + "loss": 0.0176, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.029247999191284, + "rewards/margins": 1.671494483947754, + "rewards/rejected": -4.700742244720459, + "step": 1060 + }, + { + "epoch": 1.28, + "learning_rate": 3.419081440549368e-07, + "logits/chosen": 0.3570582866668701, + "logits/rejected": 0.33581867814064026, + "logps/chosen": -561.0586547851562, + "logps/rejected": -679.4564819335938, + "loss": 0.0187, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.8346028327941895, + "rewards/margins": 1.4775307178497314, + "rewards/rejected": -4.3121337890625, + "step": 1070 + }, + { + "epoch": 1.3, + "learning_rate": 3.3200222062350324e-07, + "logits/chosen": 0.3482256233692169, + "logits/rejected": 0.41126948595046997, + "logps/chosen": -585.4675903320312, + "logps/rejected": -710.6563720703125, + "loss": 0.0179, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.067091703414917, + "rewards/margins": 1.3588765859603882, + "rewards/rejected": -4.425968647003174, + "step": 1080 + }, + { + "epoch": 1.31, + "learning_rate": 3.2217008491772724e-07, + "logits/chosen": 0.18900027871131897, + "logits/rejected": 0.39844125509262085, + "logps/chosen": -633.6024780273438, + "logps/rejected": -654.0338745117188, + "loss": 0.0193, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8086068630218506, + "rewards/margins": 1.3810145854949951, + "rewards/rejected": -4.189621925354004, + "step": 1090 + }, + { + "epoch": 1.32, + "learning_rate": 3.124160553930953e-07, + "logits/chosen": 0.3631289005279541, + "logits/rejected": 0.4466603398323059, + "logps/chosen": -614.293701171875, + "logps/rejected": -700.2027587890625, + "loss": 0.0195, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.86740779876709, + "rewards/margins": 1.5678730010986328, + "rewards/rejected": -4.435280799865723, + "step": 1100 + }, + { + "epoch": 1.33, + "learning_rate": 3.027444161994178e-07, + "logits/chosen": 0.28234532475471497, + "logits/rejected": 0.46483272314071655, + "logps/chosen": -621.5269775390625, + "logps/rejected": -659.30078125, + "loss": 0.0199, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8128793239593506, + "rewards/margins": 1.1673749685287476, + "rewards/rejected": -3.9802539348602295, + "step": 1110 + }, + { + "epoch": 1.34, + "learning_rate": 2.9315941529915055e-07, + "logits/chosen": 0.3489723205566406, + "logits/rejected": 0.4230921268463135, + "logps/chosen": -590.6229248046875, + "logps/rejected": -712.2610473632812, + "loss": 0.0191, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7181546688079834, + "rewards/margins": 1.7558250427246094, + "rewards/rejected": -4.4739789962768555, + "step": 1120 + }, + { + "epoch": 1.36, + "learning_rate": 2.8366526260161205e-07, + "logits/chosen": 0.33894267678260803, + "logits/rejected": 0.39979317784309387, + "logps/chosen": -658.9381103515625, + "logps/rejected": -717.6463623046875, + "loss": 0.0177, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9368855953216553, + "rewards/margins": 1.5531091690063477, + "rewards/rejected": -4.489995002746582, + "step": 1130 + }, + { + "epoch": 1.37, + "learning_rate": 2.742661281139129e-07, + "logits/chosen": 0.2938409745693207, + "logits/rejected": 0.5151162147521973, + "logps/chosen": -633.9880981445312, + "logps/rejected": -735.9833984375, + "loss": 0.0179, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.052492618560791, + "rewards/margins": 1.437638759613037, + "rewards/rejected": -4.490131378173828, + "step": 1140 + }, + { + "epoch": 1.38, + "learning_rate": 2.6496614010941214e-07, + "logits/chosen": 0.4524363577365875, + "logits/rejected": 0.5436291098594666, + "logps/chosen": -619.8587646484375, + "logps/rejected": -703.4625244140625, + "loss": 0.0157, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.3662285804748535, + "rewards/margins": 1.1978504657745361, + "rewards/rejected": -4.5640788078308105, + "step": 1150 + }, + { + "epoch": 1.39, + "learning_rate": 2.557693833145038e-07, + "logits/chosen": 0.3934541940689087, + "logits/rejected": 0.5870059132575989, + "logps/chosen": -661.9193115234375, + "logps/rejected": -741.275634765625, + "loss": 0.0142, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4275155067443848, + "rewards/margins": 1.533154845237732, + "rewards/rejected": -4.9606709480285645, + "step": 1160 + }, + { + "epoch": 1.4, + "learning_rate": 2.4667989711452873e-07, + "logits/chosen": 0.34201499819755554, + "logits/rejected": 0.42435866594314575, + "logps/chosen": -641.9842529296875, + "logps/rejected": -678.3228759765625, + "loss": 0.0154, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.073949098587036, + "rewards/margins": 1.3109055757522583, + "rewards/rejected": -4.384854316711426, + "step": 1170 + }, + { + "epoch": 1.42, + "learning_rate": 2.3770167377960237e-07, + "logits/chosen": 0.48571348190307617, + "logits/rejected": 0.6182373762130737, + "logps/chosen": -640.8084716796875, + "logps/rejected": -753.9522705078125, + "loss": 0.0167, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2477283477783203, + "rewards/margins": 1.6778684854507446, + "rewards/rejected": -4.925596714019775, + "step": 1180 + }, + { + "epoch": 1.43, + "learning_rate": 2.2883865671113633e-07, + "logits/chosen": 0.4899370074272156, + "logits/rejected": 0.5625158548355103, + "logps/chosen": -604.4632568359375, + "logps/rejected": -694.26708984375, + "loss": 0.0158, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.1623740196228027, + "rewards/margins": 1.3024260997772217, + "rewards/rejected": -4.464799880981445, + "step": 1190 + }, + { + "epoch": 1.44, + "learning_rate": 2.200947387098232e-07, + "logits/chosen": 0.4543294310569763, + "logits/rejected": 0.4793587625026703, + "logps/chosen": -605.4785766601562, + "logps/rejected": -694.0240478515625, + "loss": 0.0134, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.003962993621826, + "rewards/margins": 1.3972245454788208, + "rewards/rejected": -4.401187419891357, + "step": 1200 + }, + { + "epoch": 1.45, + "learning_rate": 2.1147376026584757e-07, + "logits/chosen": 0.4088858664035797, + "logits/rejected": 0.4953531324863434, + "logps/chosen": -677.3485717773438, + "logps/rejected": -794.863525390625, + "loss": 0.0143, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.3876795768737793, + "rewards/margins": 1.775921106338501, + "rewards/rejected": -5.163600921630859, + "step": 1210 + }, + { + "epoch": 1.46, + "learning_rate": 2.0297950787207047e-07, + "logits/chosen": 0.36632710695266724, + "logits/rejected": 0.48743313550949097, + "logps/chosen": -643.0319213867188, + "logps/rejected": -722.9007568359375, + "loss": 0.0147, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.1978135108947754, + "rewards/margins": 1.263356328010559, + "rewards/rejected": -4.461170196533203, + "step": 1220 + }, + { + "epoch": 1.48, + "learning_rate": 1.9461571236093288e-07, + "logits/chosen": 0.4280903935432434, + "logits/rejected": 0.6009167432785034, + "logps/chosen": -613.9682006835938, + "logps/rejected": -689.1741333007812, + "loss": 0.0176, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1748690605163574, + "rewards/margins": 1.0791981220245361, + "rewards/rejected": -4.2540669441223145, + "step": 1230 + }, + { + "epoch": 1.49, + "learning_rate": 1.8638604726580476e-07, + "logits/chosen": 0.417907178401947, + "logits/rejected": 0.6247476935386658, + "logps/chosen": -657.6587524414062, + "logps/rejected": -703.07470703125, + "loss": 0.0182, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0835118293762207, + "rewards/margins": 1.2861461639404297, + "rewards/rejected": -4.369658470153809, + "step": 1240 + }, + { + "epoch": 1.5, + "learning_rate": 1.782941272075017e-07, + "logits/chosen": 0.4170072674751282, + "logits/rejected": 0.6365597248077393, + "logps/chosen": -678.8447265625, + "logps/rejected": -731.3723754882812, + "loss": 0.0154, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4483695030212402, + "rewards/margins": 1.1840156316757202, + "rewards/rejected": -4.63238525390625, + "step": 1250 + }, + { + "epoch": 1.51, + "learning_rate": 1.7034350630667626e-07, + "logits/chosen": 0.3995654582977295, + "logits/rejected": 0.5743976831436157, + "logps/chosen": -584.9893798828125, + "logps/rejected": -697.0750732421875, + "loss": 0.0175, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.018463611602783, + "rewards/margins": 1.5104460716247559, + "rewards/rejected": -4.528909683227539, + "step": 1260 + }, + { + "epoch": 1.52, + "learning_rate": 1.6253767662278345e-07, + "logits/chosen": 0.4882670044898987, + "logits/rejected": 0.5395201444625854, + "logps/chosen": -603.7424926757812, + "logps/rejected": -686.8632202148438, + "loss": 0.0182, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8431849479675293, + "rewards/margins": 1.2739489078521729, + "rewards/rejected": -4.117133617401123, + "step": 1270 + }, + { + "epoch": 1.54, + "learning_rate": 1.548800666203028e-07, + "logits/chosen": 0.5481593608856201, + "logits/rejected": 0.5967272520065308, + "logps/chosen": -568.0536499023438, + "logps/rejected": -666.0924072265625, + "loss": 0.0186, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.577122926712036, + "rewards/margins": 1.5425646305084229, + "rewards/rejected": -4.119688034057617, + "step": 1280 + }, + { + "epoch": 1.55, + "learning_rate": 1.4737403966289385e-07, + "logits/chosen": 0.4185335636138916, + "logits/rejected": 0.558224618434906, + "logps/chosen": -628.1019897460938, + "logps/rejected": -670.2452392578125, + "loss": 0.0206, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.917240619659424, + "rewards/margins": 1.3834682703018188, + "rewards/rejected": -4.300709247589111, + "step": 1290 + }, + { + "epoch": 1.56, + "learning_rate": 1.400228925361449e-07, + "logits/chosen": 0.456881046295166, + "logits/rejected": 0.577447772026062, + "logps/chosen": -643.3248901367188, + "logps/rejected": -714.2346801757812, + "loss": 0.0143, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2684836387634277, + "rewards/margins": 1.3436448574066162, + "rewards/rejected": -4.612128257751465, + "step": 1300 + }, + { + "epoch": 1.57, + "learning_rate": 1.328298539995637e-07, + "logits/chosen": 0.5743575096130371, + "logits/rejected": 0.5451524257659912, + "logps/chosen": -630.0186767578125, + "logps/rejected": -750.6807861328125, + "loss": 0.0148, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1200015544891357, + "rewards/margins": 1.702667236328125, + "rewards/rejected": -4.82266902923584, + "step": 1310 + }, + { + "epoch": 1.58, + "learning_rate": 1.257980833684471e-07, + "logits/chosen": 0.606614351272583, + "logits/rejected": 0.6577982902526855, + "logps/chosen": -708.2349853515625, + "logps/rejected": -840.3240966796875, + "loss": 0.0138, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.5392088890075684, + "rewards/margins": 1.9698632955551147, + "rewards/rejected": -5.509071350097656, + "step": 1320 + }, + { + "epoch": 1.6, + "learning_rate": 1.1893066912625078e-07, + "logits/chosen": 0.4186869263648987, + "logits/rejected": 0.5709268450737, + "logps/chosen": -712.13427734375, + "logps/rejected": -783.5457763671875, + "loss": 0.0152, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.4307665824890137, + "rewards/margins": 1.4622032642364502, + "rewards/rejected": -4.892970085144043, + "step": 1330 + }, + { + "epoch": 1.61, + "learning_rate": 1.1223062756807078e-07, + "logits/chosen": 0.5084559917449951, + "logits/rejected": 0.6343249678611755, + "logps/chosen": -659.8488159179688, + "logps/rejected": -722.3294067382812, + "loss": 0.0153, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1651368141174316, + "rewards/margins": 1.3635410070419312, + "rewards/rejected": -4.528677940368652, + "step": 1340 + }, + { + "epoch": 1.62, + "learning_rate": 1.0570090147583088e-07, + "logits/chosen": 0.4348738193511963, + "logits/rejected": 0.6137397289276123, + "logps/chosen": -639.1541748046875, + "logps/rejected": -713.1842651367188, + "loss": 0.0168, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.3276889324188232, + "rewards/margins": 1.2510149478912354, + "rewards/rejected": -4.578703880310059, + "step": 1350 + }, + { + "epoch": 1.63, + "learning_rate": 9.934435882575848e-08, + "logits/chosen": 0.4675541818141937, + "logits/rejected": 0.7556678056716919, + "logps/chosen": -631.5349731445312, + "logps/rejected": -652.1285400390625, + "loss": 0.0157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.182419538497925, + "rewards/margins": 0.9836376309394836, + "rewards/rejected": -4.166057586669922, + "step": 1360 + }, + { + "epoch": 1.64, + "learning_rate": 9.316379152871668e-08, + "logits/chosen": 0.7302010655403137, + "logits/rejected": 0.8742607831954956, + "logps/chosen": -614.2385864257812, + "logps/rejected": -697.8746948242188, + "loss": 0.0143, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9696247577667236, + "rewards/margins": 1.4906537532806396, + "rewards/rejected": -4.460278511047363, + "step": 1370 + }, + { + "epoch": 1.66, + "learning_rate": 8.716191420394509e-08, + "logits/chosen": 0.5582844018936157, + "logits/rejected": 0.7243419289588928, + "logps/chosen": -625.010009765625, + "logps/rejected": -712.3367919921875, + "loss": 0.0134, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.13081431388855, + "rewards/margins": 1.5529491901397705, + "rewards/rejected": -4.6837639808654785, + "step": 1380 + }, + { + "epoch": 1.67, + "learning_rate": 8.134136298674931e-08, + "logits/chosen": 0.48102036118507385, + "logits/rejected": 0.6001688838005066, + "logps/chosen": -685.7195434570312, + "logps/rejected": -809.7654418945312, + "loss": 0.014, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.4015579223632812, + "rewards/margins": 1.7538063526153564, + "rewards/rejected": -5.155364036560059, + "step": 1390 + }, + { + "epoch": 1.68, + "learning_rate": 7.570469437066146e-08, + "logits/chosen": 0.5783728957176208, + "logits/rejected": 0.5673348307609558, + "logps/chosen": -617.6305541992188, + "logps/rejected": -682.7364501953125, + "loss": 0.016, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.1451942920684814, + "rewards/margins": 1.0919114351272583, + "rewards/rejected": -4.237105369567871, + "step": 1400 + }, + { + "epoch": 1.69, + "learning_rate": 7.025438408458106e-08, + "logits/chosen": 0.5475814938545227, + "logits/rejected": 0.6235382556915283, + "logps/chosen": -555.6197509765625, + "logps/rejected": -664.3532104492188, + "loss": 0.0146, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.915714740753174, + "rewards/margins": 1.3652766942977905, + "rewards/rejected": -4.280991554260254, + "step": 1410 + }, + { + "epoch": 1.7, + "learning_rate": 6.49928260053893e-08, + "logits/chosen": 0.5400280952453613, + "logits/rejected": 0.5789826512336731, + "logps/chosen": -614.4274291992188, + "logps/rejected": -710.9558715820312, + "loss": 0.0147, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1186587810516357, + "rewards/margins": 1.3737398386001587, + "rewards/rejected": -4.492398262023926, + "step": 1420 + }, + { + "epoch": 1.72, + "learning_rate": 5.992233110651412e-08, + "logits/chosen": 0.5550512075424194, + "logits/rejected": 0.7411925196647644, + "logps/chosen": -650.88330078125, + "logps/rejected": -781.0955200195312, + "loss": 0.015, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1585283279418945, + "rewards/margins": 1.8404815196990967, + "rewards/rejected": -4.999009609222412, + "step": 1430 + }, + { + "epoch": 1.73, + "learning_rate": 5.504512644290787e-08, + "logits/chosen": 0.46736687421798706, + "logits/rejected": 0.5480989217758179, + "logps/chosen": -680.0697021484375, + "logps/rejected": -811.5567626953125, + "loss": 0.0137, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.3890538215637207, + "rewards/margins": 1.7540241479873657, + "rewards/rejected": -5.143077373504639, + "step": 1440 + }, + { + "epoch": 1.74, + "learning_rate": 5.036335417288373e-08, + "logits/chosen": 0.6140528917312622, + "logits/rejected": 0.7194653749465942, + "logps/chosen": -698.3580322265625, + "logps/rejected": -771.7311401367188, + "loss": 0.0156, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.3486709594726562, + "rewards/margins": 1.5369219779968262, + "rewards/rejected": -4.885592460632324, + "step": 1450 + }, + { + "epoch": 1.75, + "learning_rate": 4.587907061724033e-08, + "logits/chosen": 0.5339714288711548, + "logits/rejected": 0.7165490984916687, + "logps/chosen": -711.3111572265625, + "logps/rejected": -760.9708251953125, + "loss": 0.0133, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3206558227539062, + "rewards/margins": 1.4652854204177856, + "rewards/rejected": -4.785941123962402, + "step": 1460 + }, + { + "epoch": 1.76, + "learning_rate": 4.1594245356087467e-08, + "logits/chosen": 0.699637234210968, + "logits/rejected": 0.6114022135734558, + "logps/chosen": -597.2353515625, + "logps/rejected": -700.9934692382812, + "loss": 0.0135, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9018421173095703, + "rewards/margins": 1.5124543905258179, + "rewards/rejected": -4.4142961502075195, + "step": 1470 + }, + { + "epoch": 1.78, + "learning_rate": 3.751076036377071e-08, + "logits/chosen": 0.5522537231445312, + "logits/rejected": 0.6186084151268005, + "logps/chosen": -587.6602783203125, + "logps/rejected": -697.8819580078125, + "loss": 0.0129, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0739331245422363, + "rewards/margins": 1.5802268981933594, + "rewards/rejected": -4.654160022735596, + "step": 1480 + }, + { + "epoch": 1.79, + "learning_rate": 3.363040918227289e-08, + "logits/chosen": 0.6091222763061523, + "logits/rejected": 0.6912073493003845, + "logps/chosen": -643.3858032226562, + "logps/rejected": -762.7769775390625, + "loss": 0.0144, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.5035293102264404, + "rewards/margins": 1.3396353721618652, + "rewards/rejected": -4.843164920806885, + "step": 1490 + }, + { + "epoch": 1.8, + "learning_rate": 2.995489613345753e-08, + "logits/chosen": 0.5477187037467957, + "logits/rejected": 0.7094759345054626, + "logps/chosen": -591.9749755859375, + "logps/rejected": -670.8548583984375, + "loss": 0.0136, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.1552910804748535, + "rewards/margins": 1.3263123035430908, + "rewards/rejected": -4.481603622436523, + "step": 1500 + }, + { + "epoch": 1.81, + "learning_rate": 2.6485835570499494e-08, + "logits/chosen": 0.5902107954025269, + "logits/rejected": 0.5623574256896973, + "logps/chosen": -645.7799072265625, + "logps/rejected": -759.1571044921875, + "loss": 0.0126, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.3200345039367676, + "rewards/margins": 1.6903254985809326, + "rewards/rejected": -5.010359764099121, + "step": 1510 + }, + { + "epoch": 1.82, + "learning_rate": 2.3224751168831048e-08, + "logits/chosen": 0.5822694301605225, + "logits/rejected": 0.6582841873168945, + "logps/chosen": -657.9237670898438, + "logps/rejected": -722.8333740234375, + "loss": 0.0138, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3156898021698, + "rewards/margins": 1.7110404968261719, + "rewards/rejected": -5.026730537414551, + "step": 1520 + }, + { + "epoch": 1.84, + "learning_rate": 2.0173075256915418e-08, + "logits/chosen": 0.571466326713562, + "logits/rejected": 0.6854621767997742, + "logps/chosen": -628.5067138671875, + "logps/rejected": -758.9906005859375, + "loss": 0.014, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.305952548980713, + "rewards/margins": 1.6585476398468018, + "rewards/rejected": -4.964500427246094, + "step": 1530 + }, + { + "epoch": 1.85, + "learning_rate": 1.7332148187142126e-08, + "logits/chosen": 0.42277950048446655, + "logits/rejected": 0.6087282299995422, + "logps/chosen": -669.0001220703125, + "logps/rejected": -746.3592529296875, + "loss": 0.0136, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.435054063796997, + "rewards/margins": 1.4669480323791504, + "rewards/rejected": -4.902002334594727, + "step": 1540 + }, + { + "epoch": 1.86, + "learning_rate": 1.4703217747118746e-08, + "logits/chosen": 0.5524539947509766, + "logits/rejected": 0.6938909292221069, + "logps/chosen": -664.91796875, + "logps/rejected": -753.2789306640625, + "loss": 0.0145, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3968822956085205, + "rewards/margins": 1.6205644607543945, + "rewards/rejected": -5.017446517944336, + "step": 1550 + }, + { + "epoch": 1.87, + "learning_rate": 1.2287438611620182e-08, + "logits/chosen": 0.5591127872467041, + "logits/rejected": 0.598191499710083, + "logps/chosen": -639.44921875, + "logps/rejected": -717.9773559570312, + "loss": 0.0126, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.239208698272705, + "rewards/margins": 1.4245617389678955, + "rewards/rejected": -4.6637701988220215, + "step": 1560 + }, + { + "epoch": 1.88, + "learning_rate": 1.0085871835434023e-08, + "logits/chosen": 0.5225650072097778, + "logits/rejected": 0.7359067797660828, + "logps/chosen": -657.2638549804688, + "logps/rejected": -744.9007568359375, + "loss": 0.0137, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3361191749572754, + "rewards/margins": 1.434823751449585, + "rewards/rejected": -4.7709431648254395, + "step": 1570 + }, + { + "epoch": 1.9, + "learning_rate": 8.099484387325494e-09, + "logits/chosen": 0.4896848797798157, + "logits/rejected": 0.623939037322998, + "logps/chosen": -687.8892822265625, + "logps/rejected": -697.3175048828125, + "loss": 0.0135, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2713406085968018, + "rewards/margins": 1.3368250131607056, + "rewards/rejected": -4.608165740966797, + "step": 1580 + }, + { + "epoch": 1.91, + "learning_rate": 6.3291487253271936e-09, + "logits/chosen": 0.5212961435317993, + "logits/rejected": 0.6600114107131958, + "logps/chosen": -685.01806640625, + "logps/rejected": -809.1822509765625, + "loss": 0.0146, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.535987377166748, + "rewards/margins": 1.684024453163147, + "rewards/rejected": -5.2200117111206055, + "step": 1590 + }, + { + "epoch": 1.92, + "learning_rate": 4.775642413539338e-09, + "logits/chosen": 0.5968783497810364, + "logits/rejected": 0.6918438076972961, + "logps/chosen": -659.1408081054688, + "logps/rejected": -735.6487426757812, + "loss": 0.0143, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4331729412078857, + "rewards/margins": 1.3792767524719238, + "rewards/rejected": -4.8124494552612305, + "step": 1600 + }, + { + "epoch": 1.93, + "learning_rate": 3.4396477806090674e-09, + "logits/chosen": 0.4734949469566345, + "logits/rejected": 0.6298776268959045, + "logps/chosen": -700.484375, + "logps/rejected": -780.0468139648438, + "loss": 0.0149, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.4217820167541504, + "rewards/margins": 1.7249329090118408, + "rewards/rejected": -5.146714687347412, + "step": 1610 + }, + { + "epoch": 1.94, + "learning_rate": 2.321751620039447e-09, + "logits/chosen": 0.5043723583221436, + "logits/rejected": 0.6294914484024048, + "logps/chosen": -652.2603149414062, + "logps/rejected": -759.38818359375, + "loss": 0.0138, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.322589874267578, + "rewards/margins": 1.9566528797149658, + "rewards/rejected": -5.279242992401123, + "step": 1620 + }, + { + "epoch": 1.96, + "learning_rate": 1.422444932458633e-09, + "logits/chosen": 0.5357510447502136, + "logits/rejected": 0.6437471508979797, + "logps/chosen": -695.9456176757812, + "logps/rejected": -788.6571044921875, + "loss": 0.0133, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4196677207946777, + "rewards/margins": 1.7799441814422607, + "rewards/rejected": -5.199612617492676, + "step": 1630 + }, + { + "epoch": 1.97, + "learning_rate": 7.421227099634886e-10, + "logits/chosen": 0.5378649234771729, + "logits/rejected": 0.6602402925491333, + "logps/chosen": -664.1305541992188, + "logps/rejected": -798.1599731445312, + "loss": 0.0119, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.2800285816192627, + "rewards/margins": 1.8218231201171875, + "rewards/rejected": -5.1018524169921875, + "step": 1640 + }, + { + "epoch": 1.98, + "learning_rate": 2.8108376263175083e-10, + "logits/chosen": 0.6657751202583313, + "logits/rejected": 0.7019415497779846, + "logps/chosen": -635.3778076171875, + "logps/rejected": -713.1087036132812, + "loss": 0.0138, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.1780152320861816, + "rewards/margins": 1.308586835861206, + "rewards/rejected": -4.486601829528809, + "step": 1650 + }, + { + "epoch": 1.99, + "learning_rate": 3.953058727912406e-11, + "logits/chosen": 0.5478759407997131, + "logits/rejected": 0.7440884113311768, + "logps/chosen": -651.7216796875, + "logps/rejected": -715.89599609375, + "loss": 0.0144, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.3715739250183105, + "rewards/margins": 1.3552334308624268, + "rewards/rejected": -4.726807594299316, + "step": 1660 + }, { "epoch": 2.0, - "step": 710, + "step": 1666, "total_flos": 0.0, - "train_loss": 0.5392914261616452, - "train_runtime": 10464.9738, - "train_samples_per_second": 8.705, - "train_steps_per_second": 0.068 + "train_loss": 0.11590367368682951, + "train_runtime": 24766.7088, + "train_samples_per_second": 8.615, + "train_steps_per_second": 0.067 } ], "logging_steps": 10, - "max_steps": 710, + "max_steps": 1666, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 0.0,