{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 6.6666666666666675e-06, "logits/chosen": -3.1526219844818115, "logits/rejected": -3.3119924068450928, "logps/chosen": -18.28135108947754, "logps/rejected": -33.52398681640625, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": -0.006796836853027344, "rewards/margins": -0.012901116162538528, "rewards/rejected": 0.006104278843849897, "step": 1 }, { "epoch": 0.04, "learning_rate": 6.666666666666667e-05, "logits/chosen": -3.1073851585388184, "logits/rejected": -3.090308666229248, "logps/chosen": -20.141502380371094, "logps/rejected": -18.037580490112305, "loss": 0.694, "rewards/accuracies": 0.0833333358168602, "rewards/chosen": 0.001263597165234387, "rewards/margins": -0.001611270010471344, "rewards/rejected": 0.002874867059290409, "step": 10 }, { "epoch": 0.08, "learning_rate": 0.00013333333333333334, "logits/chosen": -3.0630269050598145, "logits/rejected": -3.1416983604431152, "logps/chosen": -31.93638038635254, "logps/rejected": -42.507789611816406, "loss": 0.6916, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.010843334719538689, "rewards/margins": 0.003288193140178919, "rewards/rejected": 0.0075551411136984825, "step": 20 }, { "epoch": 0.12, "learning_rate": 0.0002, "logits/chosen": -3.148637294769287, "logits/rejected": -3.150296211242676, "logps/chosen": -22.95195770263672, "logps/rejected": -23.612133026123047, "loss": 0.6974, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.002520971465855837, "rewards/margins": -0.0078018950298428535, "rewards/rejected": 0.005280924029648304, "step": 30 }, { "epoch": 0.16, "learning_rate": 0.0002666666666666667, "logits/chosen": -3.1271812915802, "logits/rejected": -3.0903429985046387, "logps/chosen": -34.57979965209961, "logps/rejected": -27.37040138244629, "loss": 0.6925, "rewards/accuracies": 0.125, "rewards/chosen": 0.03781440109014511, "rewards/margins": 0.006826506461948156, "rewards/rejected": 0.030987894162535667, "step": 40 }, { "epoch": 0.2, "learning_rate": 0.0003333333333333333, "logits/chosen": -3.0824079513549805, "logits/rejected": -3.0999526977539062, "logps/chosen": -29.16314697265625, "logps/rejected": -30.843231201171875, "loss": 0.7057, "rewards/accuracies": 0.25, "rewards/chosen": 0.08534860610961914, "rewards/margins": -0.013902002945542336, "rewards/rejected": 0.09925060719251633, "step": 50 }, { "epoch": 0.24, "learning_rate": 0.0004, "logits/chosen": -3.1495282649993896, "logits/rejected": -3.184638500213623, "logps/chosen": -19.72355079650879, "logps/rejected": -28.213886260986328, "loss": 0.7213, "rewards/accuracies": 0.07500000298023224, "rewards/chosen": -0.05634387582540512, "rewards/margins": -0.043291497975587845, "rewards/rejected": -0.01305237878113985, "step": 60 }, { "epoch": 0.28, "learning_rate": 0.00046666666666666666, "logits/chosen": -3.04826021194458, "logits/rejected": -3.005667209625244, "logps/chosen": -25.625635147094727, "logps/rejected": -24.768199920654297, "loss": 0.7002, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.06301303952932358, "rewards/margins": -0.00712633365765214, "rewards/rejected": -0.055886708199977875, "step": 70 }, { "epoch": 0.32, "learning_rate": 0.0005333333333333334, "logits/chosen": -3.048518419265747, "logits/rejected": -3.061340808868408, "logps/chosen": -32.414894104003906, "logps/rejected": -34.773399353027344, "loss": 0.7835, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.23218408226966858, "rewards/margins": -0.03052915260195732, "rewards/rejected": -0.20165491104125977, "step": 80 }, { "epoch": 0.36, "learning_rate": 0.0006, "logits/chosen": -3.035808563232422, "logits/rejected": -3.1053929328918457, "logps/chosen": -31.509021759033203, "logps/rejected": -45.353553771972656, "loss": 0.7051, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.3204951286315918, "rewards/margins": 0.283873975276947, "rewards/rejected": -0.604369044303894, "step": 90 }, { "epoch": 0.4, "learning_rate": 0.0006666666666666666, "logits/chosen": -2.959228992462158, "logits/rejected": -2.98266339302063, "logps/chosen": -51.024803161621094, "logps/rejected": -54.91625213623047, "loss": 1.0868, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -1.835883378982544, "rewards/margins": 0.3176426589488983, "rewards/rejected": -2.1535260677337646, "step": 100 }, { "epoch": 0.44, "learning_rate": 0.0007333333333333333, "logits/chosen": -3.0634117126464844, "logits/rejected": -3.0850846767425537, "logps/chosen": -23.597423553466797, "logps/rejected": -26.58676528930664, "loss": 0.7629, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.2058214694261551, "rewards/margins": -0.05166854336857796, "rewards/rejected": -0.15415294468402863, "step": 110 }, { "epoch": 0.48, "learning_rate": 0.0008, "logits/chosen": -3.1232190132141113, "logits/rejected": -3.1285691261291504, "logps/chosen": -20.47592544555664, "logps/rejected": -22.593311309814453, "loss": 0.7341, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.46454209089279175, "rewards/margins": -0.023245975375175476, "rewards/rejected": -0.44129619002342224, "step": 120 }, { "epoch": 0.52, "learning_rate": 0.0008666666666666667, "logits/chosen": -2.975956678390503, "logits/rejected": -3.027247190475464, "logps/chosen": -34.07396697998047, "logps/rejected": -42.46125793457031, "loss": 0.723, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.4881093502044678, "rewards/margins": 0.3038038909435272, "rewards/rejected": -0.7919132113456726, "step": 130 }, { "epoch": 0.56, "learning_rate": 0.0009333333333333333, "logits/chosen": -2.981985569000244, "logits/rejected": -2.9559457302093506, "logps/chosen": -33.598899841308594, "logps/rejected": -40.866451263427734, "loss": 0.7877, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.35516494512557983, "rewards/margins": 0.4116950035095215, "rewards/rejected": -0.7668598890304565, "step": 140 }, { "epoch": 0.6, "learning_rate": 0.001, "logits/chosen": -3.05903959274292, "logits/rejected": -3.0611279010772705, "logps/chosen": -20.234691619873047, "logps/rejected": -19.169904708862305, "loss": 0.9163, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -0.36899399757385254, "rewards/margins": -0.08086968958377838, "rewards/rejected": -0.28812432289123535, "step": 150 }, { "epoch": 0.64, "learning_rate": 0.0009723756906077348, "logits/chosen": -2.9612436294555664, "logits/rejected": -2.9223554134368896, "logps/chosen": -32.5883674621582, "logps/rejected": -25.370834350585938, "loss": 0.7856, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.18723489344120026, "rewards/margins": -0.10649768263101578, "rewards/rejected": -0.08073721826076508, "step": 160 }, { "epoch": 0.68, "learning_rate": 0.0009447513812154696, "logits/chosen": -2.9623398780822754, "logits/rejected": -2.914522171020508, "logps/chosen": -39.70682907104492, "logps/rejected": -33.20659637451172, "loss": 0.898, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.4028944969177246, "rewards/margins": -0.04350559413433075, "rewards/rejected": -0.35938888788223267, "step": 170 }, { "epoch": 0.72, "learning_rate": 0.0009171270718232044, "logits/chosen": -3.0877394676208496, "logits/rejected": -3.092603921890259, "logps/chosen": -43.16739273071289, "logps/rejected": -47.593746185302734, "loss": 1.2201, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -2.5319814682006836, "rewards/margins": -0.10082467645406723, "rewards/rejected": -2.431157112121582, "step": 180 }, { "epoch": 0.76, "learning_rate": 0.0008895027624309392, "logits/chosen": -2.09025239944458, "logits/rejected": -2.0903568267822266, "logps/chosen": -112.7010269165039, "logps/rejected": -105.31596374511719, "loss": 2.9619, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -7.883659362792969, "rewards/margins": -0.7001466751098633, "rewards/rejected": -7.1835126876831055, "step": 190 }, { "epoch": 0.8, "learning_rate": 0.0008618784530386741, "logits/chosen": -1.8135408163070679, "logits/rejected": -1.8160464763641357, "logps/chosen": -75.82559967041016, "logps/rejected": -65.90226745605469, "loss": 2.555, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -5.0599846839904785, "rewards/margins": -0.6209059953689575, "rewards/rejected": -4.439078330993652, "step": 200 }, { "epoch": 0.84, "learning_rate": 0.0008342541436464089, "logits/chosen": -2.6365649700164795, "logits/rejected": -2.633017063140869, "logps/chosen": -73.8572998046875, "logps/rejected": -115.65068054199219, "loss": 1.3204, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -4.614927291870117, "rewards/margins": 3.1844677925109863, "rewards/rejected": -7.799394130706787, "step": 210 }, { "epoch": 0.88, "learning_rate": 0.0008066298342541437, "logits/chosen": -2.338550329208374, "logits/rejected": -2.337129831314087, "logps/chosen": -75.24410247802734, "logps/rejected": -89.40665435791016, "loss": 1.6189, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -5.084308624267578, "rewards/margins": 0.9201302528381348, "rewards/rejected": -6.004438877105713, "step": 220 }, { "epoch": 0.92, "learning_rate": 0.0007790055248618785, "logits/chosen": -2.328997850418091, "logits/rejected": -2.326946496963501, "logps/chosen": -62.543540954589844, "logps/rejected": -84.85903930664062, "loss": 1.3971, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -3.8174233436584473, "rewards/margins": 1.793341875076294, "rewards/rejected": -5.610764503479004, "step": 230 }, { "epoch": 0.96, "learning_rate": 0.0007513812154696133, "logits/chosen": -2.794644832611084, "logits/rejected": -2.7910995483398438, "logps/chosen": -58.40827560424805, "logps/rejected": -79.7098617553711, "loss": 1.2398, "rewards/accuracies": 0.25, "rewards/chosen": -3.556912660598755, "rewards/margins": 1.693927526473999, "rewards/rejected": -5.250839710235596, "step": 240 }, { "epoch": 1.0, "learning_rate": 0.0007237569060773481, "logits/chosen": -2.703537702560425, "logits/rejected": -2.7038962841033936, "logps/chosen": -40.56044006347656, "logps/rejected": -43.81157684326172, "loss": 1.3514, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -2.4380924701690674, "rewards/margins": 0.2092890739440918, "rewards/rejected": -2.647381544113159, "step": 250 }, { "epoch": 1.04, "learning_rate": 0.0006961325966850829, "logits/chosen": -2.870227098464966, "logits/rejected": -2.891268253326416, "logps/chosen": -67.85215759277344, "logps/rejected": -91.36506652832031, "loss": 1.2686, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -3.6993489265441895, "rewards/margins": 1.7562892436981201, "rewards/rejected": -5.455638408660889, "step": 260 }, { "epoch": 1.08, "learning_rate": 0.0006685082872928176, "logits/chosen": -3.1669762134552, "logits/rejected": -3.1949028968811035, "logps/chosen": -30.239299774169922, "logps/rejected": -42.69363021850586, "loss": 0.7342, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.33911675214767456, "rewards/margins": 0.5873344540596008, "rewards/rejected": -0.9264512062072754, "step": 270 }, { "epoch": 1.12, "learning_rate": 0.0006408839779005525, "logits/chosen": -3.065441131591797, "logits/rejected": -3.0655035972595215, "logps/chosen": -39.723567962646484, "logps/rejected": -53.760826110839844, "loss": 1.3162, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -2.2118256092071533, "rewards/margins": 0.6879772543907166, "rewards/rejected": -2.8998026847839355, "step": 280 }, { "epoch": 1.16, "learning_rate": 0.0006132596685082873, "logits/chosen": -2.6939263343811035, "logits/rejected": -2.6940252780914307, "logps/chosen": -60.013755798339844, "logps/rejected": -58.038536071777344, "loss": 2.1225, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -3.602609634399414, "rewards/margins": 0.10532107204198837, "rewards/rejected": -3.70793080329895, "step": 290 }, { "epoch": 1.2, "learning_rate": 0.000585635359116022, "logits/chosen": -2.695570468902588, "logits/rejected": -2.6920197010040283, "logps/chosen": -46.42293930053711, "logps/rejected": -44.63296127319336, "loss": 0.9542, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -2.1268885135650635, "rewards/margins": 0.09368989616632462, "rewards/rejected": -2.22057843208313, "step": 300 }, { "epoch": 1.24, "learning_rate": 0.000558011049723757, "logits/chosen": -3.1045467853546143, "logits/rejected": -3.1222219467163086, "logps/chosen": -22.447757720947266, "logps/rejected": -27.05214500427246, "loss": 0.7383, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.1806814968585968, "rewards/margins": 0.14409010112285614, "rewards/rejected": -0.32477161288261414, "step": 310 }, { "epoch": 1.28, "learning_rate": 0.0005303867403314917, "logits/chosen": -3.1156842708587646, "logits/rejected": -3.130932569503784, "logps/chosen": -21.35702133178711, "logps/rejected": -20.4589900970459, "loss": 0.7337, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.29580003023147583, "rewards/margins": 0.08602263033390045, "rewards/rejected": -0.3818226456642151, "step": 320 }, { "epoch": 1.32, "learning_rate": 0.0005027624309392266, "logits/chosen": -3.050220012664795, "logits/rejected": -3.046596050262451, "logps/chosen": -24.79575538635254, "logps/rejected": -33.02082443237305, "loss": 0.6601, "rewards/accuracies": 0.25, "rewards/chosen": -0.31610769033432007, "rewards/margins": 0.4491572976112366, "rewards/rejected": -0.7652650475502014, "step": 330 }, { "epoch": 1.36, "learning_rate": 0.00047513812154696136, "logits/chosen": -3.1225409507751465, "logits/rejected": -3.142671823501587, "logps/chosen": -34.958404541015625, "logps/rejected": -37.874855041503906, "loss": 0.6053, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.23902547359466553, "rewards/margins": 0.42017507553100586, "rewards/rejected": -0.6592004895210266, "step": 340 }, { "epoch": 1.4, "learning_rate": 0.00044751381215469617, "logits/chosen": -3.3166356086730957, "logits/rejected": -3.271238327026367, "logps/chosen": -41.72985076904297, "logps/rejected": -43.41400909423828, "loss": 0.7479, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.3358752131462097, "rewards/margins": 0.2891044020652771, "rewards/rejected": -0.6249796152114868, "step": 350 }, { "epoch": 1.44, "learning_rate": 0.0004198895027624309, "logits/chosen": -3.2012417316436768, "logits/rejected": -3.2214763164520264, "logps/chosen": -29.948162078857422, "logps/rejected": -34.02383041381836, "loss": 0.7083, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.20552043616771698, "rewards/margins": -0.0057243406772613525, "rewards/rejected": -0.19979611039161682, "step": 360 }, { "epoch": 1.48, "learning_rate": 0.00039226519337016573, "logits/chosen": -3.217780590057373, "logits/rejected": -3.2134361267089844, "logps/chosen": -16.843505859375, "logps/rejected": -17.496295928955078, "loss": 0.763, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -0.05251544713973999, "rewards/margins": -0.043300654739141464, "rewards/rejected": -0.009214771911501884, "step": 370 }, { "epoch": 1.52, "learning_rate": 0.0003646408839779006, "logits/chosen": -3.168470859527588, "logits/rejected": -3.2095096111297607, "logps/chosen": -31.317157745361328, "logps/rejected": -40.99824523925781, "loss": 0.6099, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.06307787448167801, "rewards/margins": 0.3778603971004486, "rewards/rejected": -0.4409382939338684, "step": 380 }, { "epoch": 1.56, "learning_rate": 0.0003370165745856354, "logits/chosen": -3.1807992458343506, "logits/rejected": -3.1729633808135986, "logps/chosen": -25.59493064880371, "logps/rejected": -24.889175415039062, "loss": 0.6768, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.03841208666563034, "rewards/margins": 0.2406325787305832, "rewards/rejected": -0.2790446877479553, "step": 390 }, { "epoch": 1.6, "learning_rate": 0.00030939226519337016, "logits/chosen": -3.1939032077789307, "logits/rejected": -3.1782355308532715, "logps/chosen": -14.640347480773926, "logps/rejected": -13.197868347167969, "loss": 0.6587, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.013051311485469341, "rewards/margins": 0.11761553585529327, "rewards/rejected": -0.1306668370962143, "step": 400 }, { "epoch": 1.64, "learning_rate": 0.00028176795580110497, "logits/chosen": -3.2121384143829346, "logits/rejected": -3.242738723754883, "logps/chosen": -26.492828369140625, "logps/rejected": -33.63092041015625, "loss": 0.6547, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.15410800278186798, "rewards/margins": 0.2064083367586136, "rewards/rejected": -0.36051633954048157, "step": 410 }, { "epoch": 1.68, "learning_rate": 0.0002541436464088398, "logits/chosen": -3.217862367630005, "logits/rejected": -3.2220897674560547, "logps/chosen": -45.029319763183594, "logps/rejected": -50.58649444580078, "loss": 0.7074, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.21466748416423798, "rewards/margins": 0.43003931641578674, "rewards/rejected": -0.644706666469574, "step": 420 }, { "epoch": 1.72, "learning_rate": 0.0002265193370165746, "logits/chosen": -3.2857704162597656, "logits/rejected": -3.285273313522339, "logps/chosen": -24.04534912109375, "logps/rejected": -25.24020767211914, "loss": 0.7055, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.05270111560821533, "rewards/margins": 0.19839158654212952, "rewards/rejected": -0.25109270215034485, "step": 430 }, { "epoch": 1.76, "learning_rate": 0.0001988950276243094, "logits/chosen": -3.1307530403137207, "logits/rejected": -3.097612142562866, "logps/chosen": -25.230710983276367, "logps/rejected": -27.142175674438477, "loss": 0.6265, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.06528893858194351, "rewards/margins": 0.3360690474510193, "rewards/rejected": -0.27078011631965637, "step": 440 }, { "epoch": 1.8, "learning_rate": 0.0001712707182320442, "logits/chosen": -3.210901975631714, "logits/rejected": -3.250978469848633, "logps/chosen": -16.752582550048828, "logps/rejected": -30.28323745727539, "loss": 0.5541, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.05518122762441635, "rewards/margins": 0.479708731174469, "rewards/rejected": -0.42452749609947205, "step": 450 }, { "epoch": 1.84, "learning_rate": 0.000143646408839779, "logits/chosen": -3.048260450363159, "logits/rejected": -3.1042888164520264, "logps/chosen": -33.8494758605957, "logps/rejected": -49.50830841064453, "loss": 0.5998, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0009274661424569786, "rewards/margins": 0.6367529630661011, "rewards/rejected": -0.635825514793396, "step": 460 }, { "epoch": 1.88, "learning_rate": 0.0001160220994475138, "logits/chosen": -3.272062301635742, "logits/rejected": -3.2841033935546875, "logps/chosen": -16.145034790039062, "logps/rejected": -19.171789169311523, "loss": 0.6134, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.10014114528894424, "rewards/margins": 0.250404953956604, "rewards/rejected": -0.15026383101940155, "step": 470 }, { "epoch": 1.92, "learning_rate": 8.839779005524861e-05, "logits/chosen": -3.1270499229431152, "logits/rejected": -3.1859707832336426, "logps/chosen": -28.680988311767578, "logps/rejected": -39.859764099121094, "loss": 0.6342, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.053285278379917145, "rewards/margins": 0.39891186356544495, "rewards/rejected": -0.4521971344947815, "step": 480 }, { "epoch": 1.96, "learning_rate": 6.0773480662983424e-05, "logits/chosen": -3.140641212463379, "logits/rejected": -3.090985059738159, "logps/chosen": -32.661651611328125, "logps/rejected": -32.10502624511719, "loss": 0.7772, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.3610732853412628, "rewards/margins": 0.07261800020933151, "rewards/rejected": -0.43369120359420776, "step": 490 }, { "epoch": 2.0, "learning_rate": 3.3149171270718233e-05, "logits/chosen": -3.1711134910583496, "logits/rejected": -3.190059185028076, "logps/chosen": -36.2880973815918, "logps/rejected": -48.17897415161133, "loss": 0.613, "rewards/accuracies": 0.375, "rewards/chosen": -0.25595012307167053, "rewards/margins": 0.6426823139190674, "rewards/rejected": -0.8986324071884155, "step": 500 }, { "epoch": 2.0, "eval_logits/chosen": -3.225074052810669, "eval_logits/rejected": -3.2351696491241455, "eval_logps/chosen": -28.331165313720703, "eval_logps/rejected": -31.33060073852539, "eval_loss": 0.7142000794410706, "eval_rewards/accuracies": 0.22200000286102295, "eval_rewards/chosen": -0.15271247923374176, "eval_rewards/margins": 0.16096290946006775, "eval_rewards/rejected": -0.3136754035949707, "eval_runtime": 411.5707, "eval_samples_per_second": 2.43, "eval_steps_per_second": 0.304, "step": 500 } ], "logging_steps": 10, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }