{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992383853769993, "eval_steps": 500, "global_step": 328, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5151515151515152e-07, "logits/chosen": 0.17451781034469604, "logits/rejected": 0.3383212387561798, "logps/chosen": -333.87017822265625, "logps/rejected": -329.5252380371094, "loss": 0.3667, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.1506168246269226, "logits/rejected": 0.24525302648544312, "logps/chosen": -346.35150146484375, "logps/rejected": -342.8780517578125, "loss": 0.3709, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": -9.601834608474746e-05, "rewards/margins": -0.00013167243741918355, "rewards/rejected": 3.5654094972414896e-05, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.15077748894691467, "logits/rejected": 0.20032937824726105, "logps/chosen": -319.31719970703125, "logps/rejected": -321.6078186035156, "loss": 0.3724, "rewards/accuracies": 0.46875, "rewards/chosen": 3.2127718441188335e-05, "rewards/margins": 6.670473521808162e-05, "rewards/rejected": -3.457700586295687e-05, "step": 20 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.11577685922384262, "logits/rejected": 0.2106866091489792, "logps/chosen": -372.7516174316406, "logps/rejected": -331.03900146484375, "loss": 0.3667, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00032594671938568354, "rewards/margins": 0.0004646074085030705, "rewards/rejected": -0.0001386606745654717, "step": 30 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.1186821237206459, "logits/rejected": 0.15511082112789154, "logps/chosen": -348.07867431640625, "logps/rejected": -310.28216552734375, "loss": 0.38, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0011213215766474605, "rewards/margins": 0.0014382528606802225, "rewards/rejected": -0.00031693134224042296, "step": 40 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.15616841614246368, "logits/rejected": 0.25814804434776306, "logps/chosen": -356.5604553222656, "logps/rejected": -330.3963623046875, "loss": 0.3654, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.0028078085742890835, "rewards/margins": 0.004432971589267254, "rewards/rejected": -0.0016251627821475267, "step": 50 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.10601208359003067, "logits/rejected": 0.14923986792564392, "logps/chosen": -339.3543395996094, "logps/rejected": -339.65673828125, "loss": 0.362, "rewards/accuracies": 0.6875, "rewards/chosen": 0.002892224583774805, "rewards/margins": 0.008964595384895802, "rewards/rejected": -0.0060723708011209965, "step": 60 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.11075379699468613, "logits/rejected": 0.23828363418579102, "logps/chosen": -347.7062072753906, "logps/rejected": -341.0438537597656, "loss": 0.3603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.002659523393958807, "rewards/margins": 0.02260093204677105, "rewards/rejected": -0.025260454043745995, "step": 70 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.1428188979625702, "logits/rejected": 0.16575627028942108, "logps/chosen": -378.0843505859375, "logps/rejected": -402.1046447753906, "loss": 0.3431, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02090095356106758, "rewards/margins": 0.04105537384748459, "rewards/rejected": -0.06195632368326187, "step": 80 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.12297731637954712, "logits/rejected": 0.1944548785686493, "logps/chosen": -432.58233642578125, "logps/rejected": -455.89849853515625, "loss": 0.3162, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09480606764554977, "rewards/margins": 0.05626205727458, "rewards/rejected": -0.15106813609600067, "step": 90 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.1246790885925293, "logits/rejected": 0.1642937809228897, "logps/chosen": -521.7025756835938, "logps/rejected": -577.9944458007812, "loss": 0.3234, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17070159316062927, "rewards/margins": 0.0818825513124466, "rewards/rejected": -0.25258415937423706, "step": 100 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.12067300081253052, "logits/rejected": 0.15556882321834564, "logps/chosen": -559.78466796875, "logps/rejected": -652.8531494140625, "loss": 0.3134, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2003019154071808, "rewards/margins": 0.11271178722381592, "rewards/rejected": -0.3130136728286743, "step": 110 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.10444744676351547, "logits/rejected": 0.17774121463298798, "logps/chosen": -521.9983520507812, "logps/rejected": -591.5448608398438, "loss": 0.3171, "rewards/accuracies": 0.625, "rewards/chosen": -0.17163847386837006, "rewards/margins": 0.09191058576107025, "rewards/rejected": -0.2635490298271179, "step": 120 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.10605984926223755, "logits/rejected": 0.21231666207313538, "logps/chosen": -524.1884155273438, "logps/rejected": -579.4110717773438, "loss": 0.3028, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1743537038564682, "rewards/margins": 0.0857800766825676, "rewards/rejected": -0.2601337730884552, "step": 130 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.06628812849521637, "logits/rejected": 0.16180165112018585, "logps/chosen": -472.8372497558594, "logps/rejected": -526.1737060546875, "loss": 0.317, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16417238116264343, "rewards/margins": 0.0910586565732956, "rewards/rejected": -0.2552310526371002, "step": 140 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.1335766613483429, "logits/rejected": 0.14664001762866974, "logps/chosen": -483.44171142578125, "logps/rejected": -523.9747314453125, "loss": 0.3184, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.17518754303455353, "rewards/margins": 0.040780000388622284, "rewards/rejected": -0.21596750617027283, "step": 150 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.07944774627685547, "logits/rejected": 0.2122001200914383, "logps/chosen": -538.4556884765625, "logps/rejected": -592.0149536132812, "loss": 0.2983, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1783067286014557, "rewards/margins": 0.09875308722257614, "rewards/rejected": -0.27705979347229004, "step": 160 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.1213790625333786, "logits/rejected": 0.19217556715011597, "logps/chosen": -557.9765625, "logps/rejected": -667.1573486328125, "loss": 0.2923, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19924937188625336, "rewards/margins": 0.11881480365991592, "rewards/rejected": -0.3180641531944275, "step": 170 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.16672459244728088, "logits/rejected": 0.25797826051712036, "logps/chosen": -593.9732666015625, "logps/rejected": -623.8164672851562, "loss": 0.2986, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19991374015808105, "rewards/margins": 0.09594612568616867, "rewards/rejected": -0.2958598732948303, "step": 180 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.12442562729120255, "logits/rejected": 0.24189035594463348, "logps/chosen": -549.0313110351562, "logps/rejected": -624.5281982421875, "loss": 0.3063, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19940030574798584, "rewards/margins": 0.11483033001422882, "rewards/rejected": -0.31423062086105347, "step": 190 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.12666499614715576, "logits/rejected": 0.17624810338020325, "logps/chosen": -509.42083740234375, "logps/rejected": -640.7295532226562, "loss": 0.2951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18602913618087769, "rewards/margins": 0.1454179733991623, "rewards/rejected": -0.33144712448120117, "step": 200 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.117406465113163, "logits/rejected": 0.22609476745128632, "logps/chosen": -545.6236572265625, "logps/rejected": -737.5299072265625, "loss": 0.2762, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.20832762122154236, "rewards/margins": 0.18485741317272186, "rewards/rejected": -0.3931850492954254, "step": 210 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.17229710519313812, "logits/rejected": 0.27004474401474, "logps/chosen": -575.3265991210938, "logps/rejected": -680.2108764648438, "loss": 0.2883, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2224891185760498, "rewards/margins": 0.13917511701583862, "rewards/rejected": -0.3616642355918884, "step": 220 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.10027740895748138, "logits/rejected": 0.18238651752471924, "logps/chosen": -547.7401123046875, "logps/rejected": -610.2566528320312, "loss": 0.311, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21439309418201447, "rewards/margins": 0.11130455881357193, "rewards/rejected": -0.325697660446167, "step": 230 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.12439197301864624, "logits/rejected": 0.2294197529554367, "logps/chosen": -583.9920043945312, "logps/rejected": -710.6575317382812, "loss": 0.29, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.22053727507591248, "rewards/margins": 0.14553256332874298, "rewards/rejected": -0.36606985330581665, "step": 240 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.14149069786071777, "logits/rejected": 0.18100661039352417, "logps/chosen": -556.0686645507812, "logps/rejected": -666.5164794921875, "loss": 0.3205, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2281671017408371, "rewards/margins": 0.11391140520572662, "rewards/rejected": -0.3420785069465637, "step": 250 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.14345864951610565, "logits/rejected": 0.23669950664043427, "logps/chosen": -549.0147705078125, "logps/rejected": -657.0481567382812, "loss": 0.2972, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19012194871902466, "rewards/margins": 0.12817278504371643, "rewards/rejected": -0.3182947039604187, "step": 260 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.15616485476493835, "logits/rejected": 0.20667704939842224, "logps/chosen": -523.2460327148438, "logps/rejected": -626.14697265625, "loss": 0.291, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18850921094417572, "rewards/margins": 0.13663442432880402, "rewards/rejected": -0.32514363527297974, "step": 270 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.04023386538028717, "logits/rejected": 0.2141951620578766, "logps/chosen": -506.42816162109375, "logps/rejected": -639.3445434570312, "loss": 0.2909, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18361696600914001, "rewards/margins": 0.1365058869123459, "rewards/rejected": -0.3201228678226471, "step": 280 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.1362815946340561, "logits/rejected": 0.2391398847103119, "logps/chosen": -536.0242919921875, "logps/rejected": -593.8597412109375, "loss": 0.2988, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20827972888946533, "rewards/margins": 0.08858077228069305, "rewards/rejected": -0.2968604862689972, "step": 290 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.1295126974582672, "logits/rejected": 0.19113779067993164, "logps/chosen": -561.409912109375, "logps/rejected": -623.5128173828125, "loss": 0.2962, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.21096615493297577, "rewards/margins": 0.0925087183713913, "rewards/rejected": -0.30347487330436707, "step": 300 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.1159089058637619, "logits/rejected": 0.1896822154521942, "logps/chosen": -593.0438232421875, "logps/rejected": -640.1717529296875, "loss": 0.2828, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2263847142457962, "rewards/margins": 0.09501803666353226, "rewards/rejected": -0.32140272855758667, "step": 310 }, { "epoch": 0.97, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.1434229016304016, "logits/rejected": 0.21832433342933655, "logps/chosen": -563.3028564453125, "logps/rejected": -658.5872192382812, "loss": 0.2885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20054848492145538, "rewards/margins": 0.13671842217445374, "rewards/rejected": -0.3372668921947479, "step": 320 }, { "epoch": 1.0, "step": 328, "total_flos": 0.0, "train_loss": 0.31671198575598436, "train_runtime": 11065.3868, "train_samples_per_second": 1.898, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 328, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }