{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-07, "logits/chosen": -0.8713370561599731, "logits/rejected": -0.8000868558883667, "logps/chosen": -71.98661804199219, "logps/rejected": -66.86463928222656, "loss": 0.1482, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": -1.0018833875656128, "logits/rejected": -0.9745380878448486, "logps/chosen": -17.541515350341797, "logps/rejected": -15.201577186584473, "loss": 0.1565, "rewards/accuracies": 0.013888888992369175, "rewards/chosen": -2.488666950739571e-06, "rewards/margins": -2.0448896975722164e-05, "rewards/rejected": 1.7960232071345672e-05, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -1.0368674993515015, "logits/rejected": -1.0124459266662598, "logps/chosen": -26.138195037841797, "logps/rejected": -24.956317901611328, "loss": 0.142, "rewards/accuracies": 0.04374999925494194, "rewards/chosen": -4.275892479199683e-06, "rewards/margins": 7.450838893419132e-05, "rewards/rejected": -7.878428004914895e-05, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -1.0422089099884033, "logits/rejected": -1.0167020559310913, "logps/chosen": -16.177021026611328, "logps/rejected": -12.827156066894531, "loss": 0.1594, "rewards/accuracies": 0.0062500000931322575, "rewards/chosen": 1.4253237168304622e-05, "rewards/margins": -4.276504478184506e-05, "rewards/rejected": 5.7018281950149685e-05, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -1.0433483123779297, "logits/rejected": -1.0170705318450928, "logps/chosen": -14.03108024597168, "logps/rejected": -15.067425727844238, "loss": 0.1303, "rewards/accuracies": 0.012500000186264515, "rewards/chosen": 6.6591557697393e-05, "rewards/margins": -1.4248180377762765e-05, "rewards/rejected": 8.083973807515576e-05, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-06, "logits/chosen": -1.025377631187439, "logits/rejected": -0.9797303080558777, "logps/chosen": -8.638357162475586, "logps/rejected": -6.794472694396973, "loss": 0.147, "rewards/accuracies": 0.0062500000931322575, "rewards/chosen": 2.7260210117674433e-05, "rewards/margins": 9.803772627492435e-06, "rewards/rejected": 1.7456437490181997e-05, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -1.0003960132598877, "logits/rejected": -0.97206050157547, "logps/chosen": -9.381010055541992, "logps/rejected": -9.556936264038086, "loss": 0.152, "rewards/accuracies": 0.0, "rewards/chosen": 2.744483936112374e-06, "rewards/margins": -6.780033436371014e-05, "rewards/rejected": 7.054481829982251e-05, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.7761938666470405e-06, "logits/chosen": -1.1042237281799316, "logits/rejected": -1.0950109958648682, "logps/chosen": -18.2097225189209, "logps/rejected": -16.501184463500977, "loss": 0.1346, "rewards/accuracies": 0.012500000186264515, "rewards/chosen": 6.527634104713798e-05, "rewards/margins": -3.1286239391192794e-05, "rewards/rejected": 9.656258771428838e-05, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -1.0520336627960205, "logits/rejected": -1.0432665348052979, "logps/chosen": -1.5275797843933105, "logps/rejected": -1.3896734714508057, "loss": 0.1494, "rewards/accuracies": 0.0, "rewards/chosen": -1.1070919754274655e-05, "rewards/margins": -1.675033900028211e-06, "rewards/rejected": -9.39588608162012e-06, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.4890613722044526e-06, "logits/chosen": -0.9697812795639038, "logits/rejected": -0.9518159627914429, "logps/chosen": -14.255193710327148, "logps/rejected": -14.01024055480957, "loss": 0.1562, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": 0.00017183150339405984, "rewards/margins": 3.2527732400922105e-05, "rewards/rejected": 0.00013930378190707415, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684795e-06, "logits/chosen": -1.0500797033309937, "logits/rejected": -1.0140620470046997, "logps/chosen": -9.409942626953125, "logps/rejected": -6.755279541015625, "loss": 0.1204, "rewards/accuracies": 0.0, "rewards/chosen": 1.4868164726067334e-05, "rewards/margins": -4.348888614913449e-05, "rewards/rejected": 5.835705087520182e-05, "step": 100 }, { "epoch": 0.32, "eval_logits/chosen": -0.8371561169624329, "eval_logits/rejected": -0.7447798252105713, "eval_logps/chosen": -396.1279296875, "eval_logps/rejected": -367.0281982421875, "eval_loss": 0.05370998755097389, "eval_rewards/accuracies": 0.4494999945163727, "eval_rewards/chosen": 0.00018531581736169755, "eval_rewards/margins": -3.043685865122825e-05, "eval_rewards/rejected": 0.0002157526760129258, "eval_runtime": 546.3436, "eval_samples_per_second": 3.661, "eval_steps_per_second": 0.915, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.102189034962561e-06, "logits/chosen": -1.04524827003479, "logits/rejected": -1.0312702655792236, "logps/chosen": -21.346084594726562, "logps/rejected": -22.236892700195312, "loss": 0.1488, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00011841374362120405, "rewards/margins": -2.6545517357590143e-06, "rewards/rejected": 0.0001210682894452475, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -1.0237395763397217, "logits/rejected": -0.9514573216438293, "logps/chosen": -19.651378631591797, "logps/rejected": -17.589191436767578, "loss": 0.145, "rewards/accuracies": 0.05000000074505806, "rewards/chosen": 0.0001629686012165621, "rewards/margins": 0.00013977885828353465, "rewards/rejected": 2.318973929504864e-05, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-06, "logits/chosen": -1.105330228805542, "logits/rejected": -1.073228120803833, "logps/chosen": -11.863168716430664, "logps/rejected": -9.925447463989258, "loss": 0.1365, "rewards/accuracies": 0.012500000186264515, "rewards/chosen": 7.793636905262247e-05, "rewards/margins": -2.657127333804965e-05, "rewards/rejected": 0.00010450764966662973, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.3784370602033572e-06, "logits/chosen": -1.0086735486984253, "logits/rejected": -0.9279971122741699, "logps/chosen": -17.02583885192871, "logps/rejected": -12.851661682128906, "loss": 0.1655, "rewards/accuracies": 0.03750000149011612, "rewards/chosen": 0.000239654938923195, "rewards/margins": 7.573223410872743e-05, "rewards/rejected": 0.00016392269753850996, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.1108510153447352e-06, "logits/chosen": -1.003169059753418, "logits/rejected": -0.9840670824050903, "logps/chosen": -14.473843574523926, "logps/rejected": -11.736700057983398, "loss": 0.1524, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00021637363533955067, "rewards/margins": 0.00011020306556019932, "rewards/rejected": 0.00010617056250339374, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -1.0689656734466553, "logits/rejected": -1.0586764812469482, "logps/chosen": -14.865160942077637, "logps/rejected": -14.334310531616211, "loss": 0.1408, "rewards/accuracies": 0.03750000149011612, "rewards/chosen": 0.00026066350983455777, "rewards/margins": 8.685861394042149e-05, "rewards/rejected": 0.0001738048595143482, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.556095160739513e-06, "logits/chosen": -1.0525116920471191, "logits/rejected": -0.9792855381965637, "logps/chosen": -23.66820526123047, "logps/rejected": -22.754222869873047, "loss": 0.1427, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00037254736525937915, "rewards/margins": 8.062725828494877e-05, "rewards/rejected": 0.00029192009242251515, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414165e-06, "logits/chosen": -1.0303270816802979, "logits/rejected": -0.9566599726676941, "logps/chosen": -26.506671905517578, "logps/rejected": -17.675500869750977, "loss": 0.1365, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00029862881638109684, "rewards/margins": -5.4413605539593846e-05, "rewards/rejected": 0.00035304244374856353, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.9985264605418185e-06, "logits/chosen": -1.0593003034591675, "logits/rejected": -1.015201449394226, "logps/chosen": -13.778154373168945, "logps/rejected": -12.183830261230469, "loss": 0.1457, "rewards/accuracies": 0.012500000186264515, "rewards/chosen": 0.00020911027968395501, "rewards/margins": 3.675861080409959e-05, "rewards/rejected": 0.0001723516616038978, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -1.102203369140625, "logits/rejected": -1.0614955425262451, "logps/chosen": -22.697526931762695, "logps/rejected": -21.1995906829834, "loss": 0.1673, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00026087305741384625, "rewards/margins": 3.989339165855199e-05, "rewards/rejected": 0.00022097965120337903, "step": 200 }, { "epoch": 0.64, "eval_logits/chosen": -0.8517611026763916, "eval_logits/rejected": -0.7569481730461121, "eval_logps/chosen": -395.0410461425781, "eval_logps/rejected": -365.74951171875, "eval_loss": 0.05380060523748398, "eval_rewards/accuracies": 0.43050000071525574, "eval_rewards/chosen": 0.0012642339570447803, "eval_rewards/margins": -0.00022168662690091878, "eval_rewards/rejected": 0.001485920511186123, "eval_runtime": 546.0518, "eval_samples_per_second": 3.663, "eval_steps_per_second": 0.916, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.466103737583699e-06, "logits/chosen": -1.0760139226913452, "logits/rejected": -1.0579842329025269, "logps/chosen": -21.285213470458984, "logps/rejected": -18.403087615966797, "loss": 0.147, "rewards/accuracies": 0.01875000074505806, "rewards/chosen": 0.00021679059136658907, "rewards/margins": -9.555607539368793e-05, "rewards/rejected": 0.0003123466740362346, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.217751806485235e-06, "logits/chosen": -1.0322855710983276, "logits/rejected": -0.9539896249771118, "logps/chosen": -19.72937774658203, "logps/rejected": -17.00729751586914, "loss": 0.1377, "rewards/accuracies": 0.03750000149011612, "rewards/chosen": 0.0002937263634521514, "rewards/margins": 0.00012941876775585115, "rewards/rejected": 0.0001643076102482155, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-07, "logits/chosen": -1.0536764860153198, "logits/rejected": -1.0304574966430664, "logps/chosen": -29.4453067779541, "logps/rejected": -24.992198944091797, "loss": 0.1457, "rewards/accuracies": 0.03125, "rewards/chosen": 0.0004715063259936869, "rewards/margins": 4.0133098082151264e-05, "rewards/rejected": 0.0004313732497394085, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -1.094244122505188, "logits/rejected": -1.053062915802002, "logps/chosen": -24.80852699279785, "logps/rejected": -23.72032356262207, "loss": 0.157, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": 0.00014916087093297392, "rewards/margins": -0.0001278716663364321, "rewards/rejected": 0.0002770325227174908, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049723e-07, "logits/chosen": -1.1208736896514893, "logits/rejected": -1.0476964712142944, "logps/chosen": -38.830894470214844, "logps/rejected": -27.4512939453125, "loss": 0.1685, "rewards/accuracies": 0.03750000149011612, "rewards/chosen": 0.0004542851238511503, "rewards/margins": -1.4414394172490574e-05, "rewards/rejected": 0.0004686995525844395, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674344e-07, "logits/chosen": -1.0925004482269287, "logits/rejected": -1.0564720630645752, "logps/chosen": -27.70499610900879, "logps/rejected": -23.542407989501953, "loss": 0.144, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": 0.00039118691347539425, "rewards/margins": -7.202206325018778e-05, "rewards/rejected": 0.0004632089694496244, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.7248368952908055e-07, "logits/chosen": -1.074318528175354, "logits/rejected": -1.048244595527649, "logps/chosen": -32.280303955078125, "logps/rejected": -28.2834415435791, "loss": 0.1723, "rewards/accuracies": 0.03125, "rewards/chosen": 0.0003155902377329767, "rewards/margins": 1.3381155440583825e-05, "rewards/rejected": 0.0003022090531885624, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -1.0563385486602783, "logits/rejected": -1.0031250715255737, "logps/chosen": -25.797882080078125, "logps/rejected": -24.61809539794922, "loss": 0.1346, "rewards/accuracies": 0.03750000149011612, "rewards/chosen": 0.00044602793059311807, "rewards/margins": 0.00014313617430161685, "rewards/rejected": 0.00030289177084341645, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-08, "logits/chosen": -1.0759773254394531, "logits/rejected": -1.001573085784912, "logps/chosen": -24.67647361755371, "logps/rejected": -17.433469772338867, "loss": 0.1347, "rewards/accuracies": 0.01875000074505806, "rewards/chosen": 8.942681597545743e-05, "rewards/margins": -0.00017002139065880328, "rewards/rejected": 0.0002594481920823455, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.262559558016325e-08, "logits/chosen": -1.0650088787078857, "logits/rejected": -1.0040552616119385, "logps/chosen": -17.18710708618164, "logps/rejected": -16.234764099121094, "loss": 0.1395, "rewards/accuracies": 0.03125, "rewards/chosen": 0.00038518753717653453, "rewards/margins": 4.9668800784274936e-05, "rewards/rejected": 0.00033551876549609005, "step": 300 }, { "epoch": 0.96, "eval_logits/chosen": -0.8541178107261658, "eval_logits/rejected": -0.7586507797241211, "eval_logps/chosen": -395.30059814453125, "eval_logps/rejected": -365.9885559082031, "eval_loss": 0.05379989370703697, "eval_rewards/accuracies": 0.43950000405311584, "eval_rewards/chosen": 0.00100463698618114, "eval_rewards/margins": -0.00024221515923272818, "eval_rewards/rejected": 0.0012468521017581224, "eval_runtime": 545.846, "eval_samples_per_second": 3.664, "eval_steps_per_second": 0.916, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336734e-10, "logits/chosen": -1.0554258823394775, "logits/rejected": -0.9918642044067383, "logps/chosen": -20.51413345336914, "logps/rejected": -15.299043655395508, "loss": 0.1462, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": 0.0004165357968304306, "rewards/margins": 0.00014305347576737404, "rewards/rejected": 0.0002734823210630566, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.10059727164797294, "train_runtime": 2585.2833, "train_samples_per_second": 1.934, "train_steps_per_second": 0.121 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }