{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 651, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 117760.0, "learning_rate": 5.000000000000001e-07, "log_odds_chosen": 0.36438828706741333, "log_odds_ratio": -0.6397662162780762, "logits/chosen": 3.8861491680145264, "logits/rejected": 5.231001853942871, "logps/chosen": -0.9861465692520142, "logps/rejected": -1.2529093027114868, "loss": 1.953, "nll_loss": 3.2415008544921875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04930732399225235, "rewards/margins": 0.013338141143321991, "rewards/rejected": -0.06264545768499374, "step": 10 }, { "epoch": 0.09, "grad_norm": 29184.0, "learning_rate": 1.0000000000000002e-06, "log_odds_chosen": 0.17107267677783966, "log_odds_ratio": -0.6301043033599854, "logits/chosen": 4.779696464538574, "logits/rejected": 5.251872539520264, "logps/chosen": -1.1045284271240234, "logps/rejected": -1.2445374727249146, "loss": 1.7108, "nll_loss": 1.8614288568496704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05522642284631729, "rewards/margins": 0.007000453770160675, "rewards/rejected": -0.06222687289118767, "step": 20 }, { "epoch": 0.14, "grad_norm": 3932160.0, "learning_rate": 1.5e-06, "log_odds_chosen": 0.478428453207016, "log_odds_ratio": -0.5682710409164429, "logits/chosen": 4.58956241607666, "logits/rejected": 5.215265274047852, "logps/chosen": -0.9884525537490845, "logps/rejected": -1.2604442834854126, "loss": 2.1071, "nll_loss": 1.525723934173584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04942262917757034, "rewards/margins": 0.013599586673080921, "rewards/rejected": -0.06302221864461899, "step": 30 }, { "epoch": 0.18, "grad_norm": 63232.0, "learning_rate": 2.0000000000000003e-06, "log_odds_chosen": 0.2763148248195648, "log_odds_ratio": -0.6428317427635193, "logits/chosen": 5.248695373535156, "logits/rejected": 5.335747718811035, "logps/chosen": -0.9019734263420105, "logps/rejected": -1.058569312095642, "loss": 1.6367, "nll_loss": 1.17539644241333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.045098677277565, "rewards/margins": 0.007829795591533184, "rewards/rejected": -0.052928466349840164, "step": 40 }, { "epoch": 0.23, "grad_norm": 1802240.0, "learning_rate": 2.5e-06, "log_odds_chosen": -0.07109338045120239, "log_odds_ratio": -0.9068069458007812, "logits/chosen": 4.34699821472168, "logits/rejected": 5.148941993713379, "logps/chosen": -1.0307292938232422, "logps/rejected": -1.0001896619796753, "loss": 2.0499, "nll_loss": 2.4581282138824463, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05153647065162659, "rewards/margins": -0.0015269846189767122, "rewards/rejected": -0.05000948905944824, "step": 50 }, { "epoch": 0.28, "grad_norm": 473088.0, "learning_rate": 3e-06, "log_odds_chosen": 0.7022095918655396, "log_odds_ratio": -0.47877854108810425, "logits/chosen": 5.137725830078125, "logits/rejected": 5.073107719421387, "logps/chosen": -0.7480964660644531, "logps/rejected": -1.172572374343872, "loss": 1.9116, "nll_loss": 1.2398216724395752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.037404827773571014, "rewards/margins": 0.021223794668912888, "rewards/rejected": -0.058628618717193604, "step": 60 }, { "epoch": 0.32, "grad_norm": 4161536.0, "learning_rate": 3.5e-06, "log_odds_chosen": -0.30822521448135376, "log_odds_ratio": -1.0616459846496582, "logits/chosen": 4.378929615020752, "logits/rejected": 5.239219665527344, "logps/chosen": -1.115562081336975, "logps/rejected": -0.8684147596359253, "loss": 2.0166, "nll_loss": 2.142368793487549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05577809736132622, "rewards/margins": -0.012357364408671856, "rewards/rejected": -0.043420739471912384, "step": 70 }, { "epoch": 0.37, "grad_norm": 211968.0, "learning_rate": 4.000000000000001e-06, "log_odds_chosen": 0.38707518577575684, "log_odds_ratio": -0.5776039361953735, "logits/chosen": 5.019408226013184, "logits/rejected": 5.3371453285217285, "logps/chosen": -0.9375723004341125, "logps/rejected": -1.1710981130599976, "loss": 1.8918, "nll_loss": 1.6274166107177734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.046878617256879807, "rewards/margins": 0.011676294729113579, "rewards/rejected": -0.058554910123348236, "step": 80 }, { "epoch": 0.41, "grad_norm": 1704.0, "learning_rate": 4.5e-06, "log_odds_chosen": 0.5051761865615845, "log_odds_ratio": -0.5127500295639038, "logits/chosen": 4.478859901428223, "logits/rejected": 4.748915672302246, "logps/chosen": -0.8121053576469421, "logps/rejected": -1.1007237434387207, "loss": 1.8015, "nll_loss": 1.57842218875885, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04060526192188263, "rewards/margins": 0.014430919662117958, "rewards/rejected": -0.055036187171936035, "step": 90 }, { "epoch": 0.46, "grad_norm": 8.9375, "learning_rate": 5e-06, "log_odds_chosen": 1.037414789199829, "log_odds_ratio": -0.3248421549797058, "logits/chosen": 4.653676509857178, "logits/rejected": 5.350204944610596, "logps/chosen": -0.6695261001586914, "logps/rejected": -1.300445795059204, "loss": 0.9356, "nll_loss": 0.7057152986526489, "rewards/accuracies": 1.0, "rewards/chosen": -0.03347630053758621, "rewards/margins": 0.031545985490083694, "rewards/rejected": -0.0650222972035408, "step": 100 }, { "epoch": 0.51, "grad_norm": 2.8125, "learning_rate": 4.767312946227961e-06, "log_odds_chosen": 0.6677854061126709, "log_odds_ratio": -0.5610898733139038, "logits/chosen": 4.671368598937988, "logits/rejected": 5.119524002075195, "logps/chosen": -0.8684868812561035, "logps/rejected": -1.2020485401153564, "loss": 0.8288, "nll_loss": 0.9947482347488403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.043424345552921295, "rewards/margins": 0.01667807623744011, "rewards/rejected": -0.06010241433978081, "step": 110 }, { "epoch": 0.55, "grad_norm": 1.9921875, "learning_rate": 4.564354645876385e-06, "log_odds_chosen": 0.5597886443138123, "log_odds_ratio": -0.5193617343902588, "logits/chosen": 5.5248026847839355, "logits/rejected": 6.067958354949951, "logps/chosen": -0.9015194773674011, "logps/rejected": -1.2406196594238281, "loss": 0.7451, "nll_loss": 0.8342186212539673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.045075975358486176, "rewards/margins": 0.01695500686764717, "rewards/rejected": -0.06203098222613335, "step": 120 }, { "epoch": 0.6, "grad_norm": 2.875, "learning_rate": 4.385290096535147e-06, "log_odds_chosen": 0.26957136392593384, "log_odds_ratio": -0.7732787728309631, "logits/chosen": 4.8973588943481445, "logits/rejected": 5.552582263946533, "logps/chosen": -0.877202033996582, "logps/rejected": -0.9518612623214722, "loss": 0.7319, "nll_loss": 0.6940464377403259, "rewards/accuracies": 0.5, "rewards/chosen": -0.04386010393500328, "rewards/margins": 0.0037329583428800106, "rewards/rejected": -0.04759306460618973, "step": 130 }, { "epoch": 0.65, "grad_norm": 2.5, "learning_rate": 4.2257712736425835e-06, "log_odds_chosen": 0.7680839896202087, "log_odds_ratio": -0.5321913957595825, "logits/chosen": 5.471996307373047, "logits/rejected": 5.644137382507324, "logps/chosen": -0.6714180111885071, "logps/rejected": -0.9587985277175903, "loss": 0.732, "nll_loss": 0.6440631151199341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.033570900559425354, "rewards/margins": 0.014369020238518715, "rewards/rejected": -0.047939930111169815, "step": 140 }, { "epoch": 0.69, "grad_norm": 2.375, "learning_rate": 4.082482904638631e-06, "log_odds_chosen": 0.5068908929824829, "log_odds_ratio": -0.604145884513855, "logits/chosen": 5.474297523498535, "logits/rejected": 5.376832485198975, "logps/chosen": -0.8357957005500793, "logps/rejected": -1.0136160850524902, "loss": 0.706, "nll_loss": 0.6737378835678101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04178978502750397, "rewards/margins": 0.00889101903885603, "rewards/rejected": -0.05068080872297287, "step": 150 }, { "epoch": 0.74, "grad_norm": 2.015625, "learning_rate": 3.952847075210474e-06, "log_odds_chosen": 0.615983784198761, "log_odds_ratio": -0.4876289963722229, "logits/chosen": 5.473410129547119, "logits/rejected": 6.06318998336792, "logps/chosen": -0.9676389694213867, "logps/rejected": -1.349385142326355, "loss": 0.6996, "nll_loss": 0.6852242350578308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.048381954431533813, "rewards/margins": 0.019087309017777443, "rewards/rejected": -0.06746925413608551, "step": 160 }, { "epoch": 0.78, "grad_norm": 2.171875, "learning_rate": 3.834824944236852e-06, "log_odds_chosen": 0.4551977515220642, "log_odds_ratio": -0.5428072214126587, "logits/chosen": 4.785284042358398, "logits/rejected": 6.005092620849609, "logps/chosen": -0.7350739240646362, "logps/rejected": -1.0496256351470947, "loss": 0.6959, "nll_loss": 0.5339438319206238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03675369173288345, "rewards/margins": 0.015727588906884193, "rewards/rejected": -0.052481282502412796, "step": 170 }, { "epoch": 0.83, "grad_norm": 2.015625, "learning_rate": 3.72677996249965e-06, "log_odds_chosen": 0.5587902665138245, "log_odds_ratio": -0.6063727140426636, "logits/chosen": 4.6595892906188965, "logits/rejected": 5.4700422286987305, "logps/chosen": -0.7482207417488098, "logps/rejected": -0.9887701272964478, "loss": 0.7233, "nll_loss": 0.5874465703964233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03741103783249855, "rewards/margins": 0.012027469463646412, "rewards/rejected": -0.04943850636482239, "step": 180 }, { "epoch": 0.88, "grad_norm": 2.0625, "learning_rate": 3.6273812505500587e-06, "log_odds_chosen": 0.9965683817863464, "log_odds_ratio": -0.4162277281284332, "logits/chosen": 5.304540157318115, "logits/rejected": 5.486930847167969, "logps/chosen": -0.7579169869422913, "logps/rejected": -1.1843591928482056, "loss": 0.7298, "nll_loss": 0.6787526607513428, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03789585083723068, "rewards/margins": 0.021322116255760193, "rewards/rejected": -0.059217967092990875, "step": 190 }, { "epoch": 0.92, "grad_norm": 1.9609375, "learning_rate": 3.5355339059327378e-06, "log_odds_chosen": 0.2911016047000885, "log_odds_ratio": -0.6208275556564331, "logits/chosen": 5.865508556365967, "logits/rejected": 5.9140448570251465, "logps/chosen": -1.0318800210952759, "logps/rejected": -1.2233208417892456, "loss": 0.6888, "nll_loss": 0.8277570009231567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.051594000309705734, "rewards/margins": 0.009572046808898449, "rewards/rejected": -0.061166055500507355, "step": 200 }, { "epoch": 0.97, "grad_norm": 2.546875, "learning_rate": 3.450327796711771e-06, "log_odds_chosen": 0.3929597735404968, "log_odds_ratio": -0.6252869367599487, "logits/chosen": 5.480368137359619, "logits/rejected": 5.818605899810791, "logps/chosen": -0.8382253646850586, "logps/rejected": -1.1194109916687012, "loss": 0.703, "nll_loss": 0.7914389967918396, "rewards/accuracies": 0.5, "rewards/chosen": -0.04191126674413681, "rewards/margins": 0.014059278182685375, "rewards/rejected": -0.05597054958343506, "step": 210 }, { "epoch": 1.01, "grad_norm": 2.234375, "learning_rate": 3.3709993123162106e-06, "log_odds_chosen": 1.1686198711395264, "log_odds_ratio": -0.39844751358032227, "logits/chosen": 4.818378448486328, "logits/rejected": 5.660789966583252, "logps/chosen": -0.5040851831436157, "logps/rejected": -0.9685913324356079, "loss": 0.6554, "nll_loss": 0.49605101346969604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.025204259902238846, "rewards/margins": 0.02322530373930931, "rewards/rejected": -0.04842956364154816, "step": 220 }, { "epoch": 1.06, "grad_norm": 2.046875, "learning_rate": 3.296902366978936e-06, "log_odds_chosen": 0.7159255743026733, "log_odds_ratio": -0.5276229977607727, "logits/chosen": 4.3275017738342285, "logits/rejected": 5.1829423904418945, "logps/chosen": -0.7593253254890442, "logps/rejected": -1.0148638486862183, "loss": 0.6289, "nll_loss": 0.609928548336029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03796626627445221, "rewards/margins": 0.012776928022503853, "rewards/rejected": -0.05074319988489151, "step": 230 }, { "epoch": 1.11, "grad_norm": 2.375, "learning_rate": 3.2274861218395142e-06, "log_odds_chosen": 0.7326894998550415, "log_odds_ratio": -0.5214331150054932, "logits/chosen": 4.783654689788818, "logits/rejected": 5.283537864685059, "logps/chosen": -0.7465990781784058, "logps/rejected": -0.9910147786140442, "loss": 0.6382, "nll_loss": 0.7347540855407715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03732995316386223, "rewards/margins": 0.012220785021781921, "rewards/rejected": -0.04955074191093445, "step": 240 }, { "epoch": 1.15, "grad_norm": 2.0625, "learning_rate": 3.1622776601683796e-06, "log_odds_chosen": 0.040362291038036346, "log_odds_ratio": -0.7654204964637756, "logits/chosen": 4.929324150085449, "logits/rejected": 4.940483570098877, "logps/chosen": -0.939703106880188, "logps/rejected": -0.9395262598991394, "loss": 0.6626, "nll_loss": 0.7169132232666016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04698516055941582, "rewards/margins": -8.843839168548584e-06, "rewards/rejected": -0.04697632044553757, "step": 250 }, { "epoch": 1.2, "grad_norm": 2.59375, "learning_rate": 3.1008683647302113e-06, "log_odds_chosen": 0.8304751515388489, "log_odds_ratio": -0.4627406597137451, "logits/chosen": 4.34907341003418, "logits/rejected": 4.541801929473877, "logps/chosen": -0.7797168493270874, "logps/rejected": -1.0878037214279175, "loss": 0.6408, "nll_loss": 0.6424815058708191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03898584097623825, "rewards/margins": 0.015404346399009228, "rewards/rejected": -0.05439019203186035, "step": 260 }, { "epoch": 1.24, "grad_norm": 2.328125, "learning_rate": 3.0429030972509227e-06, "log_odds_chosen": 0.2547241747379303, "log_odds_ratio": -0.7041358351707458, "logits/chosen": 4.1212077140808105, "logits/rejected": 5.139257431030273, "logps/chosen": -0.5988011360168457, "logps/rejected": -0.7647382020950317, "loss": 0.6441, "nll_loss": 0.4384763836860657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029940057545900345, "rewards/margins": 0.008296851068735123, "rewards/rejected": -0.03823690861463547, "step": 270 }, { "epoch": 1.29, "grad_norm": 1.9609375, "learning_rate": 2.988071523335984e-06, "log_odds_chosen": 0.7432643175125122, "log_odds_ratio": -0.4928904175758362, "logits/chosen": 4.240169525146484, "logits/rejected": 4.746310234069824, "logps/chosen": -0.7583116292953491, "logps/rejected": -1.0217373371124268, "loss": 0.6349, "nll_loss": 0.5912537574768066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03791557624936104, "rewards/margins": 0.013171288184821606, "rewards/rejected": -0.05108686536550522, "step": 280 }, { "epoch": 1.34, "grad_norm": 2.5, "learning_rate": 2.9361010975735177e-06, "log_odds_chosen": 0.6404408812522888, "log_odds_ratio": -0.5461726784706116, "logits/chosen": 4.347890377044678, "logits/rejected": 5.2955708503723145, "logps/chosen": -0.8145158886909485, "logps/rejected": -1.124975323677063, "loss": 0.6204, "nll_loss": 0.5651360154151917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.040725789964199066, "rewards/margins": 0.015522971749305725, "rewards/rejected": -0.05624876171350479, "step": 290 }, { "epoch": 1.38, "grad_norm": 1.96875, "learning_rate": 2.8867513459481293e-06, "log_odds_chosen": 0.4704459607601166, "log_odds_ratio": -0.6623938083648682, "logits/chosen": 4.255876064300537, "logits/rejected": 5.063040733337402, "logps/chosen": -0.7718355059623718, "logps/rejected": -1.144460916519165, "loss": 0.6404, "nll_loss": 0.6724303364753723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03859177231788635, "rewards/margins": 0.01863126829266548, "rewards/rejected": -0.05722304433584213, "step": 300 }, { "epoch": 1.43, "grad_norm": 2.453125, "learning_rate": 2.839809171235324e-06, "log_odds_chosen": 1.5952459573745728, "log_odds_ratio": -0.2707791328430176, "logits/chosen": 2.7694969177246094, "logits/rejected": 5.479510307312012, "logps/chosen": -0.4962679445743561, "logps/rejected": -1.2316776514053345, "loss": 0.6428, "nll_loss": 0.3623020648956299, "rewards/accuracies": 1.0, "rewards/chosen": -0.024813394993543625, "rewards/margins": 0.03677048534154892, "rewards/rejected": -0.06158388406038284, "step": 310 }, { "epoch": 1.47, "grad_norm": 2.078125, "learning_rate": 2.7950849718747376e-06, "log_odds_chosen": 0.4402007460594177, "log_odds_ratio": -0.5388344526290894, "logits/chosen": 4.8701372146606445, "logits/rejected": 4.049181938171387, "logps/chosen": -0.8427563905715942, "logps/rejected": -1.1280080080032349, "loss": 0.6661, "nll_loss": 0.6774541735649109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04213782027363777, "rewards/margins": 0.014262576587498188, "rewards/rejected": -0.056400395929813385, "step": 320 }, { "epoch": 1.52, "grad_norm": 1.9765625, "learning_rate": 2.752409412815902e-06, "log_odds_chosen": 1.4536019563674927, "log_odds_ratio": -0.3178521990776062, "logits/chosen": 4.046222686767578, "logits/rejected": 4.855486869812012, "logps/chosen": -0.4614998400211334, "logps/rejected": -1.0025476217269897, "loss": 0.6396, "nll_loss": 0.46759381890296936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02307499200105667, "rewards/margins": 0.027052391320466995, "rewards/rejected": -0.05012737959623337, "step": 330 }, { "epoch": 1.57, "grad_norm": 2.65625, "learning_rate": 2.711630722733202e-06, "log_odds_chosen": 0.4552677273750305, "log_odds_ratio": -0.5441101789474487, "logits/chosen": 4.233187198638916, "logits/rejected": 4.776756286621094, "logps/chosen": -0.9984881281852722, "logps/rejected": -1.3039405345916748, "loss": 0.6326, "nll_loss": 0.7266319990158081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04992440715432167, "rewards/margins": 0.01527262944728136, "rewards/rejected": -0.06519703567028046, "step": 340 }, { "epoch": 1.61, "grad_norm": 1.9609375, "learning_rate": 2.6726124191242444e-06, "log_odds_chosen": 0.3951299488544464, "log_odds_ratio": -0.6442996263504028, "logits/chosen": 4.592418193817139, "logits/rejected": 4.885247707366943, "logps/chosen": -0.9690208435058594, "logps/rejected": -1.1191128492355347, "loss": 0.6271, "nll_loss": 0.7028160095214844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04845104366540909, "rewards/margins": 0.007504602428525686, "rewards/rejected": -0.055955640971660614, "step": 350 }, { "epoch": 1.66, "grad_norm": 2.109375, "learning_rate": 2.6352313834736496e-06, "log_odds_chosen": 0.6397253274917603, "log_odds_ratio": -0.4948647916316986, "logits/chosen": 3.1035220623016357, "logits/rejected": 4.4074320793151855, "logps/chosen": -0.7063679695129395, "logps/rejected": -1.086042881011963, "loss": 0.6133, "nll_loss": 0.5209956765174866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03531839698553085, "rewards/margins": 0.0189837496727705, "rewards/rejected": -0.054302144795656204, "step": 360 }, { "epoch": 1.71, "grad_norm": 1.9296875, "learning_rate": 2.599376224550182e-06, "log_odds_chosen": 0.5072129368782043, "log_odds_ratio": -0.5375211834907532, "logits/chosen": 4.4618144035339355, "logits/rejected": 4.897726535797119, "logps/chosen": -0.8658114671707153, "logps/rejected": -1.161678433418274, "loss": 0.625, "nll_loss": 0.7147814035415649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.043290577828884125, "rewards/margins": 0.014793348498642445, "rewards/rejected": -0.058083921670913696, "step": 370 }, { "epoch": 1.75, "grad_norm": 2.28125, "learning_rate": 2.564945880212886e-06, "log_odds_chosen": 0.5736058950424194, "log_odds_ratio": -0.4948197305202484, "logits/chosen": 4.31764554977417, "logits/rejected": 4.153486251831055, "logps/chosen": -0.8540223836898804, "logps/rejected": -1.1471771001815796, "loss": 0.6393, "nll_loss": 0.6763076186180115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0427011176943779, "rewards/margins": 0.014657735824584961, "rewards/rejected": -0.05735884979367256, "step": 380 }, { "epoch": 1.8, "grad_norm": 3.640625, "learning_rate": 2.5318484177091667e-06, "log_odds_chosen": 0.8381564021110535, "log_odds_ratio": -0.5308811068534851, "logits/chosen": 4.037534236907959, "logits/rejected": 5.888669013977051, "logps/chosen": -0.700161337852478, "logps/rejected": -1.2042081356048584, "loss": 0.6318, "nll_loss": 0.5512461066246033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03500806540250778, "rewards/margins": 0.025202345103025436, "rewards/rejected": -0.060210417956113815, "step": 390 }, { "epoch": 1.84, "grad_norm": 2.171875, "learning_rate": 2.5e-06, "log_odds_chosen": 0.7038768529891968, "log_odds_ratio": -0.43052348494529724, "logits/chosen": 3.822885036468506, "logits/rejected": 4.210227012634277, "logps/chosen": -0.6150542497634888, "logps/rejected": -0.9889954328536987, "loss": 0.6218, "nll_loss": 0.5013046264648438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030752714723348618, "rewards/margins": 0.01869705691933632, "rewards/rejected": -0.04944976791739464, "step": 400 }, { "epoch": 1.89, "grad_norm": 2.03125, "learning_rate": 2.4693239916239746e-06, "log_odds_chosen": 0.49417972564697266, "log_odds_ratio": -0.5454962253570557, "logits/chosen": 3.7158710956573486, "logits/rejected": 4.625822067260742, "logps/chosen": -0.7136448621749878, "logps/rejected": -0.9806584119796753, "loss": 0.6163, "nll_loss": 0.5766875147819519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03568224236369133, "rewards/margins": 0.013350683264434338, "rewards/rejected": -0.04903292655944824, "step": 410 }, { "epoch": 1.94, "grad_norm": 1.96875, "learning_rate": 2.4397501823713327e-06, "log_odds_chosen": 1.2905668020248413, "log_odds_ratio": -0.3054632544517517, "logits/chosen": 4.375031471252441, "logits/rejected": 5.165828704833984, "logps/chosen": -0.6634560823440552, "logps/rejected": -1.2297804355621338, "loss": 0.6299, "nll_loss": 0.5654190182685852, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03317280486226082, "rewards/margins": 0.02831621840596199, "rewards/rejected": -0.06148902326822281, "step": 420 }, { "epoch": 1.98, "grad_norm": 2.234375, "learning_rate": 2.411214110852061e-06, "log_odds_chosen": 0.4614163041114807, "log_odds_ratio": -0.5477044582366943, "logits/chosen": 3.945091724395752, "logits/rejected": 4.783943176269531, "logps/chosen": -0.670985758304596, "logps/rejected": -0.8528381586074829, "loss": 0.6328, "nll_loss": 0.5353778004646301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03354928642511368, "rewards/margins": 0.009092616848647594, "rewards/rejected": -0.04264190047979355, "step": 430 }, { "epoch": 2.03, "grad_norm": 2.140625, "learning_rate": 2.3836564731139807e-06, "log_odds_chosen": 0.519318699836731, "log_odds_ratio": -0.5034213066101074, "logits/chosen": 3.990828037261963, "logits/rejected": 4.283727645874023, "logps/chosen": -0.7843809723854065, "logps/rejected": -1.1084554195404053, "loss": 0.598, "nll_loss": 0.6064985394477844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03921904414892197, "rewards/margins": 0.01620371639728546, "rewards/rejected": -0.055422764271497726, "step": 440 }, { "epoch": 2.07, "grad_norm": 2.015625, "learning_rate": 2.357022603955159e-06, "log_odds_chosen": 1.2161670923233032, "log_odds_ratio": -0.5558447241783142, "logits/chosen": 2.7631869316101074, "logits/rejected": 4.014997959136963, "logps/chosen": -0.4891352653503418, "logps/rejected": -1.057556390762329, "loss": 0.6063, "nll_loss": 0.5005042552947998, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02445676364004612, "rewards/margins": 0.028421055525541306, "rewards/rejected": -0.052877821028232574, "step": 450 }, { "epoch": 2.12, "grad_norm": 2.0625, "learning_rate": 2.3312620206007847e-06, "log_odds_chosen": 0.8278636932373047, "log_odds_ratio": -0.43884754180908203, "logits/chosen": 4.009448051452637, "logits/rejected": 4.671367645263672, "logps/chosen": -0.7134698629379272, "logps/rejected": -1.146784782409668, "loss": 0.5862, "nll_loss": 0.5619599223136902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03567349165678024, "rewards/margins": 0.021665748208761215, "rewards/rejected": -0.05733924359083176, "step": 460 }, { "epoch": 2.17, "grad_norm": 2.609375, "learning_rate": 2.3063280200722128e-06, "log_odds_chosen": 1.677671194076538, "log_odds_ratio": -0.2895694375038147, "logits/chosen": 2.985790491104126, "logits/rejected": 4.190914630889893, "logps/chosen": -0.5018793344497681, "logps/rejected": -1.0572091341018677, "loss": 0.5765, "nll_loss": 0.5001329183578491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.025093963369727135, "rewards/margins": 0.02776649035513401, "rewards/rejected": -0.05286044999957085, "step": 470 }, { "epoch": 2.21, "grad_norm": 2.0, "learning_rate": 2.2821773229381924e-06, "log_odds_chosen": 1.0791471004486084, "log_odds_ratio": -0.37350553274154663, "logits/chosen": 3.676426649093628, "logits/rejected": 3.8374907970428467, "logps/chosen": -0.7438164353370667, "logps/rejected": -1.29355788230896, "loss": 0.5652, "nll_loss": 0.6555451154708862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.037190817296504974, "rewards/margins": 0.027487074956297874, "rewards/rejected": -0.0646779015660286, "step": 480 }, { "epoch": 2.26, "grad_norm": 2.21875, "learning_rate": 2.2587697572631284e-06, "log_odds_chosen": 0.275502473115921, "log_odds_ratio": -0.7135687470436096, "logits/chosen": 4.321534156799316, "logits/rejected": 4.41732120513916, "logps/chosen": -0.9727070927619934, "logps/rejected": -1.0810346603393555, "loss": 0.5952, "nll_loss": 0.7110171914100647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04863535612821579, "rewards/margins": 0.005416377447545528, "rewards/rejected": -0.05405173450708389, "step": 490 }, { "epoch": 2.3, "grad_norm": 2.15625, "learning_rate": 2.23606797749979e-06, "log_odds_chosen": 0.34863442182540894, "log_odds_ratio": -0.6463712453842163, "logits/chosen": 4.6876606941223145, "logits/rejected": 5.054124355316162, "logps/chosen": -0.9338000416755676, "logps/rejected": -1.1037800312042236, "loss": 0.5953, "nll_loss": 0.8528131246566772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04669000208377838, "rewards/margins": 0.008499005809426308, "rewards/rejected": -0.05518900603055954, "step": 500 }, { "epoch": 2.35, "grad_norm": 2.171875, "learning_rate": 2.2140372138502386e-06, "log_odds_chosen": 0.9548345804214478, "log_odds_ratio": -0.39882007241249084, "logits/chosen": 3.5289406776428223, "logits/rejected": 3.8287463188171387, "logps/chosen": -0.6570809483528137, "logps/rejected": -1.1388274431228638, "loss": 0.609, "nll_loss": 0.5968061685562134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.032854050397872925, "rewards/margins": 0.024087321013212204, "rewards/rejected": -0.05694136768579483, "step": 510 }, { "epoch": 2.4, "grad_norm": 1.9609375, "learning_rate": 2.1926450482675734e-06, "log_odds_chosen": 0.4539831280708313, "log_odds_ratio": -0.5872747302055359, "logits/chosen": 3.2061939239501953, "logits/rejected": 4.589787006378174, "logps/chosen": -0.7979894280433655, "logps/rejected": -1.0285401344299316, "loss": 0.5827, "nll_loss": 0.6084668636322021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.039899468421936035, "rewards/margins": 0.011527536436915398, "rewards/rejected": -0.051426999270915985, "step": 520 }, { "epoch": 2.44, "grad_norm": 2.484375, "learning_rate": 2.1718612138153473e-06, "log_odds_chosen": 0.8493059277534485, "log_odds_ratio": -0.6372500658035278, "logits/chosen": 3.078615665435791, "logits/rejected": 4.099945068359375, "logps/chosen": -0.6704202890396118, "logps/rejected": -0.7899671792984009, "loss": 0.5788, "nll_loss": 0.5733928084373474, "rewards/accuracies": 0.5, "rewards/chosen": -0.03352101519703865, "rewards/margins": 0.005977341439574957, "rewards/rejected": -0.039498358964920044, "step": 530 }, { "epoch": 2.49, "grad_norm": 1.859375, "learning_rate": 2.151657414559676e-06, "log_odds_chosen": 0.6374627351760864, "log_odds_ratio": -0.5592355728149414, "logits/chosen": 3.680483341217041, "logits/rejected": 3.9816291332244873, "logps/chosen": -0.8559755086898804, "logps/rejected": -1.1612054109573364, "loss": 0.6003, "nll_loss": 0.6403124928474426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04279877990484238, "rewards/margins": 0.015261486172676086, "rewards/rejected": -0.05806026607751846, "step": 540 }, { "epoch": 2.53, "grad_norm": 1.8984375, "learning_rate": 2.132007163556104e-06, "log_odds_chosen": 1.399209976196289, "log_odds_ratio": -0.5735031366348267, "logits/chosen": 3.132289171218872, "logits/rejected": 3.5427193641662598, "logps/chosen": -0.5963010191917419, "logps/rejected": -0.9639393091201782, "loss": 0.5984, "nll_loss": 0.5058175325393677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029815051704645157, "rewards/margins": 0.018381912261247635, "rewards/rejected": -0.04819696769118309, "step": 550 }, { "epoch": 2.58, "grad_norm": 1.859375, "learning_rate": 2.1128856368212917e-06, "log_odds_chosen": 0.688880443572998, "log_odds_ratio": -0.4902462959289551, "logits/chosen": 2.6950721740722656, "logits/rejected": 3.1528286933898926, "logps/chosen": -0.6383022665977478, "logps/rejected": -0.9691828489303589, "loss": 0.5718, "nll_loss": 0.4289799630641937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03191510960459709, "rewards/margins": 0.016544032841920853, "rewards/rejected": -0.048459142446517944, "step": 560 }, { "epoch": 2.63, "grad_norm": 2.421875, "learning_rate": 2.0942695414584777e-06, "log_odds_chosen": 1.3283271789550781, "log_odds_ratio": -0.3012233078479767, "logits/chosen": 3.4564871788024902, "logits/rejected": 4.7043867111206055, "logps/chosen": -0.6779360771179199, "logps/rejected": -1.523970365524292, "loss": 0.6138, "nll_loss": 0.5768535137176514, "rewards/accuracies": 1.0, "rewards/chosen": -0.033896803855895996, "rewards/margins": 0.042301714420318604, "rewards/rejected": -0.0761985182762146, "step": 570 }, { "epoch": 2.67, "grad_norm": 1.953125, "learning_rate": 2.0761369963434992e-06, "log_odds_chosen": 1.4566174745559692, "log_odds_ratio": -0.32581037282943726, "logits/chosen": 2.691676616668701, "logits/rejected": 4.661564826965332, "logps/chosen": -0.4493564963340759, "logps/rejected": -1.0139671564102173, "loss": 0.5782, "nll_loss": 0.37120580673217773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.022467825561761856, "rewards/margins": 0.028230536729097366, "rewards/rejected": -0.05069836229085922, "step": 580 }, { "epoch": 2.72, "grad_norm": 2.0625, "learning_rate": 2.058467423981546e-06, "log_odds_chosen": 1.0190517902374268, "log_odds_ratio": -0.5730624198913574, "logits/chosen": 3.407086133956909, "logits/rejected": 4.482596397399902, "logps/chosen": -0.7345553040504456, "logps/rejected": -0.9309635162353516, "loss": 0.5723, "nll_loss": 0.5519307851791382, "rewards/accuracies": 0.5, "rewards/chosen": -0.03672776371240616, "rewards/margins": 0.009820410050451756, "rewards/rejected": -0.04654817283153534, "step": 590 }, { "epoch": 2.76, "grad_norm": 2.375, "learning_rate": 2.0412414523193154e-06, "log_odds_chosen": 1.107779860496521, "log_odds_ratio": -0.40593117475509644, "logits/chosen": 3.215078830718994, "logits/rejected": 4.503358840942383, "logps/chosen": -0.663019597530365, "logps/rejected": -1.2786920070648193, "loss": 0.5815, "nll_loss": 0.5633824467658997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03315097838640213, "rewards/margins": 0.030783619731664658, "rewards/rejected": -0.06393460184335709, "step": 600 }, { "epoch": 2.81, "grad_norm": 2.09375, "learning_rate": 2.0244408254472904e-06, "log_odds_chosen": 0.7602224349975586, "log_odds_ratio": -0.5018362998962402, "logits/chosen": 3.604353666305542, "logits/rejected": 4.481316089630127, "logps/chosen": -0.7105517387390137, "logps/rejected": -1.0740478038787842, "loss": 0.5873, "nll_loss": 0.5312780737876892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.035527586936950684, "rewards/margins": 0.018174810335040092, "rewards/rejected": -0.05370239168405533, "step": 610 }, { "epoch": 2.86, "grad_norm": 1.90625, "learning_rate": 2.0080483222562476e-06, "log_odds_chosen": 1.3286904096603394, "log_odds_ratio": -0.36574870347976685, "logits/chosen": 3.620469331741333, "logits/rejected": 4.373411655426025, "logps/chosen": -0.4990506172180176, "logps/rejected": -0.953050971031189, "loss": 0.5716, "nll_loss": 0.5527733564376831, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.024952532723546028, "rewards/margins": 0.022700021043419838, "rewards/rejected": -0.047652553766965866, "step": 620 }, { "epoch": 2.9, "grad_norm": 2.359375, "learning_rate": 1.9920476822239895e-06, "log_odds_chosen": 0.4847317636013031, "log_odds_ratio": -0.5640643835067749, "logits/chosen": 3.125113010406494, "logits/rejected": 3.340205669403076, "logps/chosen": -0.8360971212387085, "logps/rejected": -1.0480194091796875, "loss": 0.5738, "nll_loss": 0.6136351823806763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.041804857552051544, "rewards/margins": 0.010596117004752159, "rewards/rejected": -0.05240097641944885, "step": 630 }, { "epoch": 2.95, "grad_norm": 2.09375, "learning_rate": 1.976423537605237e-06, "log_odds_chosen": 0.8931509256362915, "log_odds_ratio": -0.40087467432022095, "logits/chosen": 3.574153423309326, "logits/rejected": 4.537802219390869, "logps/chosen": -0.6440940499305725, "logps/rejected": -1.088226556777954, "loss": 0.5846, "nll_loss": 0.5598152875900269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.032204702496528625, "rewards/margins": 0.0222066268324852, "rewards/rejected": -0.054411329329013824, "step": 640 }, { "epoch": 3.0, "grad_norm": 2.5625, "learning_rate": 1.961161351381841e-06, "log_odds_chosen": 1.2053475379943848, "log_odds_ratio": -0.430248886346817, "logits/chosen": 2.245370388031006, "logits/rejected": 3.5309462547302246, "logps/chosen": -0.5642444491386414, "logps/rejected": -0.9910544157028198, "loss": 0.5605, "nll_loss": 0.45886915922164917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.028212225064635277, "rewards/margins": 0.021340493112802505, "rewards/rejected": -0.04955272004008293, "step": 650 }, { "epoch": 3.0, "step": 651, "total_flos": 0.0, "train_loss": 0.812556631554107, "train_runtime": 4771.9621, "train_samples_per_second": 4.358, "train_steps_per_second": 0.136 } ], "logging_steps": 10, "max_steps": 651, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }