{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9975412715138743, "eval_steps": 10000, "global_step": 355, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777774e-08, "logits/chosen": -0.13174405694007874, "logits/rejected": -0.027169257402420044, "logps/chosen": -477.4691162109375, "logps/rejected": -277.6482238769531, "loss": 0.4106, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -0.0896572694182396, "logits/rejected": -0.04708625003695488, "logps/chosen": -334.1234130859375, "logps/rejected": -264.19927978515625, "loss": 0.4187, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.0011318529723212123, "rewards/margins": -0.0006679879734292626, "rewards/rejected": -0.0004638649697881192, "step": 10 }, { "epoch": 0.06, "learning_rate": 5.555555555555555e-07, "logits/chosen": -0.06541652977466583, "logits/rejected": -0.027149802073836327, "logps/chosen": -312.1936950683594, "logps/rejected": -212.1822967529297, "loss": 0.422, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0018345726421102881, "rewards/margins": 0.008850323967635632, "rewards/rejected": -0.007015751209110022, "step": 20 }, { "epoch": 0.08, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.033993594348430634, "logits/rejected": 0.014452556148171425, "logps/chosen": -369.52886962890625, "logps/rejected": -227.0442657470703, "loss": 0.4287, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.012805985286831856, "rewards/margins": 0.05532551556825638, "rewards/rejected": -0.04251953214406967, "step": 30 }, { "epoch": 0.11, "learning_rate": 9.99612097830993e-07, "logits/chosen": -0.049494121223688126, "logits/rejected": -0.007341804448515177, "logps/chosen": -328.2823791503906, "logps/rejected": -251.8525848388672, "loss": 0.4553, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03969588130712509, "rewards/margins": 0.06737084686756134, "rewards/rejected": -0.10706672817468643, "step": 40 }, { "epoch": 0.14, "learning_rate": 9.952551076085863e-07, "logits/chosen": -0.054784227162599564, "logits/rejected": -0.018202614039182663, "logps/chosen": -343.4543762207031, "logps/rejected": -278.887451171875, "loss": 0.478, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.029784226790070534, "rewards/margins": 0.14676395058631897, "rewards/rejected": -0.17654818296432495, "step": 50 }, { "epoch": 0.17, "learning_rate": 9.860986139994238e-07, "logits/chosen": -0.17503580451011658, "logits/rejected": -0.10935833305120468, "logps/chosen": -399.97161865234375, "logps/rejected": -245.5420684814453, "loss": 0.4856, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0788765698671341, "rewards/margins": 0.30753207206726074, "rewards/rejected": -0.22865548729896545, "step": 60 }, { "epoch": 0.2, "learning_rate": 9.722313523268027e-07, "logits/chosen": -0.13078172504901886, "logits/rejected": -0.018874743953347206, "logps/chosen": -382.87396240234375, "logps/rejected": -252.6072540283203, "loss": 0.4667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.09564249962568283, "rewards/margins": 0.21217863261699677, "rewards/rejected": -0.11653614044189453, "step": 70 }, { "epoch": 0.22, "learning_rate": 9.537877098354784e-07, "logits/chosen": 0.019111448898911476, "logits/rejected": 0.04028189927339554, "logps/chosen": -277.33154296875, "logps/rejected": -215.5694122314453, "loss": 0.4657, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.01853206194937229, "rewards/margins": 0.18713198602199554, "rewards/rejected": -0.168599933385849, "step": 80 }, { "epoch": 0.25, "learning_rate": 9.309464233486386e-07, "logits/chosen": -0.184749573469162, "logits/rejected": -0.12197474390268326, "logps/chosen": -374.56268310546875, "logps/rejected": -224.47860717773438, "loss": 0.4724, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19164375960826874, "rewards/margins": 0.33850011229515076, "rewards/rejected": -0.1468563675880432, "step": 90 }, { "epoch": 0.28, "learning_rate": 9.039288471343504e-07, "logits/chosen": -0.06358620524406433, "logits/rejected": -0.022323714569211006, "logps/chosen": -352.3625183105469, "logps/rejected": -265.12457275390625, "loss": 0.4579, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.11637835204601288, "rewards/margins": 0.15370506048202515, "rewards/rejected": -0.03732669726014137, "step": 100 }, { "epoch": 0.31, "learning_rate": 8.729968077675454e-07, "logits/chosen": -0.16022691130638123, "logits/rejected": -0.06551636755466461, "logps/chosen": -304.0919189453125, "logps/rejected": -257.5033874511719, "loss": 0.4444, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.09323982149362564, "rewards/margins": 0.10607878863811493, "rewards/rejected": -0.012838983908295631, "step": 110 }, { "epoch": 0.34, "learning_rate": 8.384500667760089e-07, "logits/chosen": -0.18294575810432434, "logits/rejected": -0.1334661990404129, "logps/chosen": -323.1263427734375, "logps/rejected": -206.44387817382812, "loss": 0.4535, "rewards/accuracies": 0.65625, "rewards/chosen": 0.17489886283874512, "rewards/margins": 0.1883935183286667, "rewards/rejected": -0.013494668528437614, "step": 120 }, { "epoch": 0.37, "learning_rate": 8.006234156598042e-07, "logits/chosen": -0.09687581658363342, "logits/rejected": -0.0031311712227761745, "logps/chosen": -361.1056823730469, "logps/rejected": -219.41552734375, "loss": 0.4484, "rewards/accuracies": 0.625, "rewards/chosen": 0.12578140199184418, "rewards/margins": 0.2744066119194031, "rewards/rejected": -0.1486252248287201, "step": 130 }, { "epoch": 0.39, "learning_rate": 7.59883431436215e-07, "logits/chosen": -0.03516136482357979, "logits/rejected": -0.005446717143058777, "logps/chosen": -316.314208984375, "logps/rejected": -241.97024536132812, "loss": 0.4383, "rewards/accuracies": 0.625, "rewards/chosen": 0.025916021317243576, "rewards/margins": 0.16546496748924255, "rewards/rejected": -0.13954894244670868, "step": 140 }, { "epoch": 0.42, "learning_rate": 7.166249241521318e-07, "logits/chosen": 0.0030886970926076174, "logits/rejected": 0.06723493337631226, "logps/chosen": -293.86627197265625, "logps/rejected": -255.26492309570312, "loss": 0.4382, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05645722150802612, "rewards/margins": 0.17538480460643768, "rewards/rejected": -0.2318420112133026, "step": 150 }, { "epoch": 0.45, "learning_rate": 6.712671107909358e-07, "logits/chosen": -0.03268152475357056, "logits/rejected": 0.12709534168243408, "logps/chosen": -369.74859619140625, "logps/rejected": -260.13128662109375, "loss": 0.4255, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.003464625682681799, "rewards/margins": 0.2506219744682312, "rewards/rejected": -0.24715733528137207, "step": 160 }, { "epoch": 0.48, "learning_rate": 6.24249552652447e-07, "logits/chosen": 0.04178273305296898, "logits/rejected": 0.12335582822561264, "logps/chosen": -316.4767150878906, "logps/rejected": -268.18829345703125, "loss": 0.4169, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006209957879036665, "rewards/margins": 0.20326288044452667, "rewards/rejected": -0.20947282016277313, "step": 170 }, { "epoch": 0.51, "learning_rate": 5.760278955766694e-07, "logits/chosen": -0.12427058070898056, "logits/rejected": 0.009830540046095848, "logps/chosen": -327.13958740234375, "logps/rejected": -258.3717041015625, "loss": 0.4267, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.042253121733665466, "rewards/margins": 0.19665148854255676, "rewards/rejected": -0.23890459537506104, "step": 180 }, { "epoch": 0.53, "learning_rate": 5.270694542927088e-07, "logits/chosen": -0.16560761630535126, "logits/rejected": -0.04073227569460869, "logps/chosen": -341.1844482421875, "logps/rejected": -227.2850341796875, "loss": 0.4261, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0071119763888418674, "rewards/margins": 0.2952454090118408, "rewards/rejected": -0.288133442401886, "step": 190 }, { "epoch": 0.56, "learning_rate": 4.778486836848107e-07, "logits/chosen": -0.007979141548275948, "logits/rejected": 0.1243690699338913, "logps/chosen": -329.4273986816406, "logps/rejected": -260.57806396484375, "loss": 0.4096, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19496320188045502, "rewards/margins": 0.16824397444725037, "rewards/rejected": -0.3632071614265442, "step": 200 }, { "epoch": 0.59, "learning_rate": 4.2884258086335745e-07, "logits/chosen": 0.09336410462856293, "logits/rejected": 0.19506987929344177, "logps/chosen": -391.4615173339844, "logps/rejected": -279.2521057128906, "loss": 0.4003, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11562051624059677, "rewards/margins": 0.245724156498909, "rewards/rejected": -0.36134466528892517, "step": 210 }, { "epoch": 0.62, "learning_rate": 3.8052606259922095e-07, "logits/chosen": -0.16688141226768494, "logits/rejected": -0.08500812947750092, "logps/chosen": -362.3302917480469, "logps/rejected": -247.5942840576172, "loss": 0.4244, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07475811243057251, "rewards/margins": 0.20620782673358917, "rewards/rejected": -0.28096598386764526, "step": 220 }, { "epoch": 0.65, "learning_rate": 3.333673629186279e-07, "logits/chosen": -0.02717510424554348, "logits/rejected": 0.12363864481449127, "logps/chosen": -332.33319091796875, "logps/rejected": -247.22817993164062, "loss": 0.4115, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08057795464992523, "rewards/margins": 0.2381751984357834, "rewards/rejected": -0.3187531530857086, "step": 230 }, { "epoch": 0.67, "learning_rate": 2.878234954603167e-07, "logits/chosen": 0.033598482608795166, "logits/rejected": 0.18793973326683044, "logps/chosen": -381.00506591796875, "logps/rejected": -270.8756103515625, "loss": 0.3798, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12533587217330933, "rewards/margins": 0.24204333126544952, "rewards/rejected": -0.36737921833992004, "step": 240 }, { "epoch": 0.7, "learning_rate": 2.443358245691555e-07, "logits/chosen": 0.030673842877149582, "logits/rejected": 0.18992134928703308, "logps/chosen": -383.7073059082031, "logps/rejected": -261.9964294433594, "loss": 0.3877, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09187673032283783, "rewards/margins": 0.3298332989215851, "rewards/rejected": -0.4217100143432617, "step": 250 }, { "epoch": 0.73, "learning_rate": 2.0332578804662782e-07, "logits/chosen": 0.024305405095219612, "logits/rejected": 0.132650688290596, "logps/chosen": -368.91131591796875, "logps/rejected": -269.9962463378906, "loss": 0.4026, "rewards/accuracies": 0.625, "rewards/chosen": -0.14426225423812866, "rewards/margins": 0.2727457880973816, "rewards/rejected": -0.41700801253318787, "step": 260 }, { "epoch": 0.76, "learning_rate": 1.651908130088947e-07, "logits/chosen": 0.13495397567749023, "logits/rejected": 0.21630129218101501, "logps/chosen": -346.6638488769531, "logps/rejected": -274.6488952636719, "loss": 0.3821, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.25730255246162415, "rewards/margins": 0.2660498023033142, "rewards/rejected": -0.5233522653579712, "step": 270 }, { "epoch": 0.79, "learning_rate": 1.3030046443173442e-07, "logits/chosen": 0.12753400206565857, "logits/rejected": 0.26089444756507874, "logps/chosen": -396.9707336425781, "logps/rejected": -273.0684814453125, "loss": 0.4015, "rewards/accuracies": 0.625, "rewards/chosen": -0.19528909027576447, "rewards/margins": 0.3000025451183319, "rewards/rejected": -0.49529165029525757, "step": 280 }, { "epoch": 0.81, "learning_rate": 9.899286370670574e-08, "logits/chosen": 0.18344645202159882, "logits/rejected": 0.3353565037250519, "logps/chosen": -358.2286376953125, "logps/rejected": -288.5234680175781, "loss": 0.4025, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.28220534324645996, "rewards/margins": 0.19654087722301483, "rewards/rejected": -0.478746235370636, "step": 290 }, { "epoch": 0.84, "learning_rate": 7.157141191620548e-08, "logits/chosen": 0.0641961470246315, "logits/rejected": 0.2366667091846466, "logps/chosen": -380.06103515625, "logps/rejected": -267.75213623046875, "loss": 0.3997, "rewards/accuracies": 0.625, "rewards/chosen": -0.13748347759246826, "rewards/margins": 0.3018878996372223, "rewards/rejected": -0.43937140703201294, "step": 300 }, { "epoch": 0.87, "learning_rate": 4.830184958207006e-08, "logits/chosen": 0.03403336927294731, "logits/rejected": 0.19396355748176575, "logps/chosen": -347.7532653808594, "logps/rejected": -284.3299865722656, "loss": 0.4026, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.16916589438915253, "rewards/margins": 0.25709637999534607, "rewards/rejected": -0.4262623190879822, "step": 310 }, { "epoch": 0.9, "learning_rate": 2.940968138161731e-08, "logits/chosen": 0.11429999023675919, "logits/rejected": 0.17834721505641937, "logps/chosen": -330.48284912109375, "logps/rejected": -274.64727783203125, "loss": 0.3982, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1756475865840912, "rewards/margins": 0.1860581338405609, "rewards/rejected": -0.3617057204246521, "step": 320 }, { "epoch": 0.93, "learning_rate": 1.507799078812799e-08, "logits/chosen": -0.007492154836654663, "logits/rejected": 0.10714348405599594, "logps/chosen": -425.7674865722656, "logps/rejected": -314.00860595703125, "loss": 0.3971, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.17962414026260376, "rewards/margins": 0.292041152715683, "rewards/rejected": -0.47166532278060913, "step": 330 }, { "epoch": 0.96, "learning_rate": 5.445665814031941e-09, "logits/chosen": 0.06636445224285126, "logits/rejected": 0.1753680408000946, "logps/chosen": -369.9612731933594, "logps/rejected": -278.1523132324219, "loss": 0.4052, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18568792939186096, "rewards/margins": 0.26998209953308105, "rewards/rejected": -0.4556700587272644, "step": 340 }, { "epoch": 0.98, "learning_rate": 6.060530510659245e-10, "logits/chosen": 0.03166942670941353, "logits/rejected": 0.10389814525842667, "logps/chosen": -357.81988525390625, "logps/rejected": -275.1876525878906, "loss": 0.4038, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20625650882720947, "rewards/margins": 0.26138800382614136, "rewards/rejected": -0.46764451265335083, "step": 350 }, { "epoch": 1.0, "step": 355, "total_flos": 0.0, "train_loss": 0.4252443082735572, "train_runtime": 5295.2592, "train_samples_per_second": 8.602, "train_steps_per_second": 0.067 } ], "logging_steps": 10, "max_steps": 355, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 0.0, "trial_name": null, "trial_params": null }