{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-07, "logits/chosen": -0.03749094158411026, "logits/rejected": 0.03204140067100525, "logps/chosen": -559.2154541015625, "logps/rejected": -890.8544921875, "loss": 0.1791, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": -0.15300115942955017, "logits/rejected": -0.12813442945480347, "logps/chosen": -481.31634521484375, "logps/rejected": -804.3386840820312, "loss": 0.2179, "rewards/accuracies": 0.5347222089767456, "rewards/chosen": -0.0013270878698676825, "rewards/margins": 0.0023776644375175238, "rewards/rejected": -0.0037047527730464935, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -0.17372946441173553, "logits/rejected": -0.10258068144321442, "logps/chosen": -623.6970825195312, "logps/rejected": -831.9564208984375, "loss": 0.1794, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0440058633685112, "rewards/margins": 0.03512907028198242, "rewards/rejected": -0.07913494110107422, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -0.19729416072368622, "logits/rejected": -0.2202361822128296, "logps/chosen": -677.2532958984375, "logps/rejected": -1107.8955078125, "loss": 0.1602, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1397601068019867, "rewards/margins": 0.1527976095676422, "rewards/rejected": -0.2925576865673065, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -0.19801445305347443, "logits/rejected": -0.20075838267803192, "logps/chosen": -686.9973754882812, "logps/rejected": -964.6900634765625, "loss": 0.1518, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.13784293830394745, "rewards/margins": 0.11145637929439545, "rewards/rejected": -0.2492993324995041, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-06, "logits/chosen": -0.19628724455833435, "logits/rejected": -0.19985933601856232, "logps/chosen": -606.3834228515625, "logps/rejected": -982.189453125, "loss": 0.143, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13489742577075958, "rewards/margins": 0.14312751591205597, "rewards/rejected": -0.27802491188049316, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -0.18777325749397278, "logits/rejected": -0.2514544427394867, "logps/chosen": -679.7066650390625, "logps/rejected": -1182.11083984375, "loss": 0.1376, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.14588578045368195, "rewards/margins": 0.16957412660121918, "rewards/rejected": -0.3154599070549011, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.7761938666470405e-06, "logits/chosen": -0.20718152821063995, "logits/rejected": -0.18668043613433838, "logps/chosen": -692.5406494140625, "logps/rejected": -1079.9908447265625, "loss": 0.1417, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1537138670682907, "rewards/margins": 0.16138955950737, "rewards/rejected": -0.3151034116744995, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -0.21971774101257324, "logits/rejected": -0.24234215915203094, "logps/chosen": -710.6842041015625, "logps/rejected": -1136.7525634765625, "loss": 0.1352, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19244925677776337, "rewards/margins": 0.1675754338502884, "rewards/rejected": -0.36002466082572937, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.4890613722044526e-06, "logits/chosen": -0.23076686263084412, "logits/rejected": -0.21980659663677216, "logps/chosen": -706.5121459960938, "logps/rejected": -1076.0596923828125, "loss": 0.1278, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.18025998771190643, "rewards/margins": 0.15869399905204773, "rewards/rejected": -0.33895397186279297, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684795e-06, "logits/chosen": -0.2438955307006836, "logits/rejected": -0.19910338521003723, "logps/chosen": -733.8079833984375, "logps/rejected": -1110.71923828125, "loss": 0.1256, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20329563319683075, "rewards/margins": 0.16255612671375275, "rewards/rejected": -0.3658517897129059, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.102189034962561e-06, "logits/chosen": -0.19663023948669434, "logits/rejected": -0.21354413032531738, "logps/chosen": -751.34765625, "logps/rejected": -1080.0113525390625, "loss": 0.1301, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1990022361278534, "rewards/margins": 0.15701577067375183, "rewards/rejected": -0.3560180068016052, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -0.1580425500869751, "logits/rejected": -0.20701774954795837, "logps/chosen": -649.1971435546875, "logps/rejected": -1110.748779296875, "loss": 0.1265, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.17047809064388275, "rewards/margins": 0.17829902470111847, "rewards/rejected": -0.34877708554267883, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-06, "logits/chosen": -0.1981876790523529, "logits/rejected": -0.22599034011363983, "logps/chosen": -736.1990966796875, "logps/rejected": -1140.4241943359375, "loss": 0.119, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.1852777749300003, "rewards/margins": 0.19615033268928528, "rewards/rejected": -0.3814280927181244, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.3784370602033572e-06, "logits/chosen": -0.22472605109214783, "logits/rejected": -0.20980677008628845, "logps/chosen": -727.8230590820312, "logps/rejected": -1080.5826416015625, "loss": 0.1191, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.19285574555397034, "rewards/margins": 0.16788846254348755, "rewards/rejected": -0.3607442080974579, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.1108510153447352e-06, "logits/chosen": -0.18862155079841614, "logits/rejected": -0.18806660175323486, "logps/chosen": -744.17919921875, "logps/rejected": -1178.953369140625, "loss": 0.1196, "rewards/accuracies": 0.8125, "rewards/chosen": -0.196334570646286, "rewards/margins": 0.20167379081249237, "rewards/rejected": -0.39800840616226196, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -0.20716705918312073, "logits/rejected": -0.23998220264911652, "logps/chosen": -716.6298217773438, "logps/rejected": -1128.6080322265625, "loss": 0.1196, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.20008280873298645, "rewards/margins": 0.19826427102088928, "rewards/rejected": -0.39834707975387573, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.556095160739513e-06, "logits/chosen": -0.18807800114154816, "logits/rejected": -0.24145498871803284, "logps/chosen": -771.5186767578125, "logps/rejected": -1165.31787109375, "loss": 0.1095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24131879210472107, "rewards/margins": 0.18717139959335327, "rewards/rejected": -0.42849016189575195, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414165e-06, "logits/chosen": -0.22785386443138123, "logits/rejected": -0.24600060284137726, "logps/chosen": -828.021484375, "logps/rejected": -1258.8414306640625, "loss": 0.1148, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.26432016491889954, "rewards/margins": 0.2154586762189865, "rewards/rejected": -0.47977885603904724, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.9985264605418185e-06, "logits/chosen": -0.19840948283672333, "logits/rejected": -0.22779376804828644, "logps/chosen": -748.4619140625, "logps/rejected": -1191.1099853515625, "loss": 0.1083, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22014987468719482, "rewards/margins": 0.20830897986888885, "rewards/rejected": -0.4284588396549225, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.15184776484966278, "logits/rejected": -0.2133471965789795, "logps/chosen": -777.3941650390625, "logps/rejected": -1259.5482177734375, "loss": 0.1097, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.21874341368675232, "rewards/margins": 0.20576436817646027, "rewards/rejected": -0.4245077669620514, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.466103737583699e-06, "logits/chosen": -0.21760430932044983, "logits/rejected": -0.21733775734901428, "logps/chosen": -786.8435668945312, "logps/rejected": -1184.2471923828125, "loss": 0.1045, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.22768548130989075, "rewards/margins": 0.21592466533184052, "rewards/rejected": -0.44361013174057007, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.217751806485235e-06, "logits/chosen": -0.15853038430213928, "logits/rejected": -0.2094181478023529, "logps/chosen": -753.2785034179688, "logps/rejected": -1242.8369140625, "loss": 0.1112, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2066824734210968, "rewards/margins": 0.21286919713020325, "rewards/rejected": -0.41955167055130005, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-07, "logits/chosen": -0.1381937712430954, "logits/rejected": -0.23759886622428894, "logps/chosen": -732.6824951171875, "logps/rejected": -1177.4642333984375, "loss": 0.1109, "rewards/accuracies": 0.84375, "rewards/chosen": -0.19210098683834076, "rewards/margins": 0.2129323035478592, "rewards/rejected": -0.40503329038619995, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -0.18693767488002777, "logits/rejected": -0.19473309814929962, "logps/chosen": -755.0777587890625, "logps/rejected": -1212.91064453125, "loss": 0.112, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.21374483406543732, "rewards/margins": 0.2292633354663849, "rewards/rejected": -0.4430081248283386, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049723e-07, "logits/chosen": -0.24827036261558533, "logits/rejected": -0.20749957859516144, "logps/chosen": -715.92333984375, "logps/rejected": -1144.6124267578125, "loss": 0.1215, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.20713326334953308, "rewards/margins": 0.18099360167980194, "rewards/rejected": -0.38812685012817383, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674344e-07, "logits/chosen": -0.19055083394050598, "logits/rejected": -0.2412928342819214, "logps/chosen": -693.1292724609375, "logps/rejected": -1136.007080078125, "loss": 0.1117, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.1922534853219986, "rewards/margins": 0.17840158939361572, "rewards/rejected": -0.3706550598144531, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.7248368952908055e-07, "logits/chosen": -0.15525199472904205, "logits/rejected": -0.16159489750862122, "logps/chosen": -790.9383544921875, "logps/rejected": -1267.6949462890625, "loss": 0.109, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.21919909119606018, "rewards/margins": 0.19449128210544586, "rewards/rejected": -0.41369038820266724, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -0.19185583293437958, "logits/rejected": -0.1721155345439911, "logps/chosen": -745.1690673828125, "logps/rejected": -1151.0634765625, "loss": 0.1118, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20249255001544952, "rewards/margins": 0.19131307303905487, "rewards/rejected": -0.3938056528568268, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-08, "logits/chosen": -0.18523597717285156, "logits/rejected": -0.20255737006664276, "logps/chosen": -761.7174072265625, "logps/rejected": -1124.2659912109375, "loss": 0.1165, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.20564845204353333, "rewards/margins": 0.20229394733905792, "rewards/rejected": -0.40794238448143005, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.262559558016325e-08, "logits/chosen": -0.24068090319633484, "logits/rejected": -0.1731335073709488, "logps/chosen": -718.3553466796875, "logps/rejected": -1177.1654052734375, "loss": 0.1115, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21485304832458496, "rewards/margins": 0.19611014425754547, "rewards/rejected": -0.41096314787864685, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336734e-10, "logits/chosen": -0.1374652087688446, "logits/rejected": -0.15347729623317719, "logps/chosen": -747.0050048828125, "logps/rejected": -1133.5335693359375, "loss": 0.1124, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.199618861079216, "rewards/margins": 0.20002660155296326, "rewards/rejected": -0.39964547753334045, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.004328498234733557, "train_runtime": 430.9957, "train_samples_per_second": 46.404, "train_steps_per_second": 0.724 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }