{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988623435722411, "eval_steps": 10000000, "global_step": 439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.98402260612519, "learning_rate": 2.2727272727272727e-09, "logits/chosen": -1.6768856048583984, "logits/rejected": -1.7259055376052856, "logps/chosen": -394.9654541015625, "logps/rejected": -320.0859069824219, "loss": 0.693, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 31.95891283237601, "learning_rate": 2.2727272727272725e-08, "logits/chosen": -1.7029528617858887, "logits/rejected": -1.6683764457702637, "logps/chosen": -429.5246887207031, "logps/rejected": -403.747314453125, "loss": 0.6933, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 2.8328322514425963e-05, "rewards/margins": -0.0006085141212679446, "rewards/rejected": 0.0006368425092659891, "step": 10 }, { "epoch": 0.05, "grad_norm": 49.89152060792663, "learning_rate": 4.545454545454545e-08, "logits/chosen": -1.7806730270385742, "logits/rejected": -1.7358741760253906, "logps/chosen": -442.21636962890625, "logps/rejected": -401.44000244140625, "loss": 0.6922, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0007078830385580659, "rewards/margins": 0.0019328873604536057, "rewards/rejected": -0.0026407705154269934, "step": 20 }, { "epoch": 0.07, "grad_norm": 35.37044354910807, "learning_rate": 6.818181818181817e-08, "logits/chosen": -1.751228928565979, "logits/rejected": -1.6827186346054077, "logps/chosen": -440.0089416503906, "logps/rejected": -401.7633361816406, "loss": 0.6874, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.002162130083888769, "rewards/margins": 0.012089937925338745, "rewards/rejected": -0.014252068474888802, "step": 30 }, { "epoch": 0.09, "grad_norm": 24.433299552566275, "learning_rate": 9.09090909090909e-08, "logits/chosen": -1.7572921514511108, "logits/rejected": -1.6904194355010986, "logps/chosen": -431.66802978515625, "logps/rejected": -390.01898193359375, "loss": 0.6773, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.00594430323690176, "rewards/margins": 0.07497542351484299, "rewards/rejected": -0.06903111934661865, "step": 40 }, { "epoch": 0.11, "grad_norm": 31.92529179185726, "learning_rate": 9.994307990108962e-08, "logits/chosen": -1.7621084451675415, "logits/rejected": -1.6976591348648071, "logps/chosen": -444.54913330078125, "logps/rejected": -393.5503234863281, "loss": 0.6599, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.04167298600077629, "rewards/margins": 0.09635747224092484, "rewards/rejected": -0.05468447878956795, "step": 50 }, { "epoch": 0.14, "grad_norm": 28.704756119671675, "learning_rate": 9.959570405988094e-08, "logits/chosen": -1.8457626104354858, "logits/rejected": -1.7673368453979492, "logps/chosen": -385.0513916015625, "logps/rejected": -358.7567443847656, "loss": 0.6566, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10071470588445663, "rewards/margins": 0.06823419034481049, "rewards/rejected": 0.032480526715517044, "step": 60 }, { "epoch": 0.16, "grad_norm": 24.531377945654775, "learning_rate": 9.893476820924666e-08, "logits/chosen": -1.9651501178741455, "logits/rejected": -1.8813607692718506, "logps/chosen": -403.64105224609375, "logps/rejected": -371.46234130859375, "loss": 0.6395, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.2845568060874939, "rewards/margins": 0.13415710628032684, "rewards/rejected": 0.15039967000484467, "step": 70 }, { "epoch": 0.18, "grad_norm": 27.08910191775396, "learning_rate": 9.796445099843647e-08, "logits/chosen": -2.0368916988372803, "logits/rejected": -1.9531713724136353, "logps/chosen": -407.11553955078125, "logps/rejected": -381.14794921875, "loss": 0.6473, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3643694519996643, "rewards/margins": 0.1756153404712677, "rewards/rejected": 0.18875406682491302, "step": 80 }, { "epoch": 0.2, "grad_norm": 38.20992212526602, "learning_rate": 9.669088708527066e-08, "logits/chosen": -2.025817394256592, "logits/rejected": -1.9624574184417725, "logps/chosen": -415.40020751953125, "logps/rejected": -390.00140380859375, "loss": 0.6291, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.36643102765083313, "rewards/margins": 0.1800619214773178, "rewards/rejected": 0.1863691359758377, "step": 90 }, { "epoch": 0.23, "grad_norm": 27.439269349288335, "learning_rate": 9.512212835085849e-08, "logits/chosen": -2.1013572216033936, "logits/rejected": -2.0152974128723145, "logps/chosen": -388.7185974121094, "logps/rejected": -383.03546142578125, "loss": 0.6197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.37033963203430176, "rewards/margins": 0.19511529803276062, "rewards/rejected": 0.17522430419921875, "step": 100 }, { "epoch": 0.25, "grad_norm": 26.198395783666243, "learning_rate": 9.326809299301306e-08, "logits/chosen": -2.1095404624938965, "logits/rejected": -2.0083327293395996, "logps/chosen": -427.6131896972656, "logps/rejected": -393.1694030761719, "loss": 0.6112, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.37072473764419556, "rewards/margins": 0.26910915970802307, "rewards/rejected": 0.10161559283733368, "step": 110 }, { "epoch": 0.27, "grad_norm": 26.3105476660504, "learning_rate": 9.114050282021158e-08, "logits/chosen": -2.1161999702453613, "logits/rejected": -2.05537748336792, "logps/chosen": -435.20086669921875, "logps/rejected": -412.62078857421875, "loss": 0.607, "rewards/accuracies": 0.6875, "rewards/chosen": 0.268494188785553, "rewards/margins": 0.23757004737854004, "rewards/rejected": 0.030924171209335327, "step": 120 }, { "epoch": 0.3, "grad_norm": 27.752798140691322, "learning_rate": 8.875280914254802e-08, "logits/chosen": -2.1366939544677734, "logits/rejected": -2.0489516258239746, "logps/chosen": -392.9619140625, "logps/rejected": -363.98553466796875, "loss": 0.602, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2517772316932678, "rewards/margins": 0.2715442478656769, "rewards/rejected": -0.01976701058447361, "step": 130 }, { "epoch": 0.32, "grad_norm": 24.888665971199188, "learning_rate": 8.612010772821971e-08, "logits/chosen": -2.180723190307617, "logits/rejected": -2.1342811584472656, "logps/chosen": -445.15472412109375, "logps/rejected": -407.91998291015625, "loss": 0.6048, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.32857105135917664, "rewards/margins": 0.3196411728858948, "rewards/rejected": 0.008929857984185219, "step": 140 }, { "epoch": 0.34, "grad_norm": 27.895144442719968, "learning_rate": 8.325904336322055e-08, "logits/chosen": -2.172715663909912, "logits/rejected": -2.112837314605713, "logps/chosen": -402.29217529296875, "logps/rejected": -380.25335693359375, "loss": 0.6071, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1980087012052536, "rewards/margins": 0.2475043088197708, "rewards/rejected": -0.049495600163936615, "step": 150 }, { "epoch": 0.36, "grad_norm": 24.760628369907216, "learning_rate": 8.01877046176447e-08, "logits/chosen": -2.131298303604126, "logits/rejected": -2.059814453125, "logps/chosen": -396.01788330078125, "logps/rejected": -379.63665771484375, "loss": 0.5888, "rewards/accuracies": 0.65625, "rewards/chosen": 0.12456746399402618, "rewards/margins": 0.23458366096019745, "rewards/rejected": -0.11001620441675186, "step": 160 }, { "epoch": 0.39, "grad_norm": 22.13569267518615, "learning_rate": 7.692550948392249e-08, "logits/chosen": -2.1969549655914307, "logits/rejected": -2.1351304054260254, "logps/chosen": -421.291015625, "logps/rejected": -387.77484130859375, "loss": 0.589, "rewards/accuracies": 0.65625, "rewards/chosen": 0.19595439732074738, "rewards/margins": 0.26243916153907776, "rewards/rejected": -0.06648479402065277, "step": 170 }, { "epoch": 0.41, "grad_norm": 29.37383088378205, "learning_rate": 7.349308261002021e-08, "logits/chosen": -2.1651830673217773, "logits/rejected": -2.1070454120635986, "logps/chosen": -426.6806640625, "logps/rejected": -408.07781982421875, "loss": 0.5855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2470887452363968, "rewards/margins": 0.2750667631626129, "rewards/rejected": -0.02797803282737732, "step": 180 }, { "epoch": 0.43, "grad_norm": 29.642029984354075, "learning_rate": 6.991212490377531e-08, "logits/chosen": -2.239077091217041, "logits/rejected": -2.188472270965576, "logps/chosen": -460.720947265625, "logps/rejected": -433.29608154296875, "loss": 0.58, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17696049809455872, "rewards/margins": 0.3639344274997711, "rewards/rejected": -0.1869739592075348, "step": 190 }, { "epoch": 0.46, "grad_norm": 40.17804362794063, "learning_rate": 6.620527633276978e-08, "logits/chosen": -2.1767477989196777, "logits/rejected": -2.1026599407196045, "logps/chosen": -425.94677734375, "logps/rejected": -428.23846435546875, "loss": 0.5857, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10997031629085541, "rewards/margins": 0.38212892413139343, "rewards/rejected": -0.2721586227416992, "step": 200 }, { "epoch": 0.48, "grad_norm": 23.44190889972074, "learning_rate": 6.239597278716581e-08, "logits/chosen": -2.2649803161621094, "logits/rejected": -2.202864646911621, "logps/chosen": -410.3318786621094, "logps/rejected": -393.8056640625, "loss": 0.5721, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.14795458316802979, "rewards/margins": 0.41721048951148987, "rewards/rejected": -0.26925593614578247, "step": 210 }, { "epoch": 0.5, "grad_norm": 25.725130617472978, "learning_rate": 5.8508297910462456e-08, "logits/chosen": -2.2174572944641113, "logits/rejected": -2.1320137977600098, "logps/chosen": -415.51153564453125, "logps/rejected": -422.1241149902344, "loss": 0.5729, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.06063612177968025, "rewards/margins": 0.38312196731567383, "rewards/rejected": -0.32248586416244507, "step": 220 }, { "epoch": 0.52, "grad_norm": 33.748772620696556, "learning_rate": 5.456683083494731e-08, "logits/chosen": -2.204589366912842, "logits/rejected": -2.164806842803955, "logps/chosen": -464.0941467285156, "logps/rejected": -462.9925231933594, "loss": 0.5856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1456037312746048, "rewards/margins": 0.29207009077072144, "rewards/rejected": -0.14646635949611664, "step": 230 }, { "epoch": 0.55, "grad_norm": 35.95467752105537, "learning_rate": 5.059649078450834e-08, "logits/chosen": -2.2099993228912354, "logits/rejected": -2.1632981300354004, "logps/chosen": -434.889404296875, "logps/rejected": -441.2001037597656, "loss": 0.5675, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.03340945765376091, "rewards/margins": 0.3251820206642151, "rewards/rejected": -0.2917725443840027, "step": 240 }, { "epoch": 0.57, "grad_norm": 21.692896426665428, "learning_rate": 4.6622379527277186e-08, "logits/chosen": -2.2246696949005127, "logits/rejected": -2.1740729808807373, "logps/chosen": -406.245361328125, "logps/rejected": -402.9896545410156, "loss": 0.5644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07031740248203278, "rewards/margins": 0.33437561988830566, "rewards/rejected": -0.40469303727149963, "step": 250 }, { "epoch": 0.59, "grad_norm": 39.74076278883575, "learning_rate": 4.26696226741691e-08, "logits/chosen": -2.2478280067443848, "logits/rejected": -2.1786162853240967, "logps/chosen": -446.00897216796875, "logps/rejected": -439.8905334472656, "loss": 0.5747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13826611638069153, "rewards/margins": 0.3671457767486572, "rewards/rejected": -0.5054119229316711, "step": 260 }, { "epoch": 0.61, "grad_norm": 24.611550372970484, "learning_rate": 3.876321082668098e-08, "logits/chosen": -2.306821346282959, "logits/rejected": -2.2316880226135254, "logps/chosen": -465.56439208984375, "logps/rejected": -459.61700439453125, "loss": 0.564, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.019088072702288628, "rewards/margins": 0.4447278082370758, "rewards/rejected": -0.463815838098526, "step": 270 }, { "epoch": 0.64, "grad_norm": 39.92086883833645, "learning_rate": 3.492784157826244e-08, "logits/chosen": -2.236232280731201, "logits/rejected": -2.1331005096435547, "logps/chosen": -452.8758850097656, "logps/rejected": -426.164306640625, "loss": 0.5642, "rewards/accuracies": 0.75, "rewards/chosen": -0.054852552711963654, "rewards/margins": 0.43687576055526733, "rewards/rejected": -0.49172839522361755, "step": 280 }, { "epoch": 0.66, "grad_norm": 30.441413413514407, "learning_rate": 3.118776336817812e-08, "logits/chosen": -2.286245346069336, "logits/rejected": -2.217912435531616, "logps/chosen": -441.1709899902344, "logps/rejected": -427.1979064941406, "loss": 0.5531, "rewards/accuracies": 0.75, "rewards/chosen": -0.020920906215906143, "rewards/margins": 0.44389209151268005, "rewards/rejected": -0.4648129940032959, "step": 290 }, { "epoch": 0.68, "grad_norm": 27.497209882056495, "learning_rate": 2.7566622175067443e-08, "logits/chosen": -2.2853000164031982, "logits/rejected": -2.2172305583953857, "logps/chosen": -439.1316833496094, "logps/rejected": -443.3128967285156, "loss": 0.5682, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03673394396901131, "rewards/margins": 0.42858797311782837, "rewards/rejected": -0.46532192826271057, "step": 300 }, { "epoch": 0.71, "grad_norm": 35.65048747036057, "learning_rate": 2.408731201945432e-08, "logits/chosen": -2.270371437072754, "logits/rejected": -2.2171905040740967, "logps/chosen": -437.3782653808594, "logps/rejected": -453.547607421875, "loss": 0.5546, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.06338675320148468, "rewards/margins": 0.3352622985839844, "rewards/rejected": -0.39864906668663025, "step": 310 }, { "epoch": 0.73, "grad_norm": 34.348920131629896, "learning_rate": 2.0771830220378112e-08, "logits/chosen": -2.223020076751709, "logits/rejected": -2.158247709274292, "logps/chosen": -462.366455078125, "logps/rejected": -461.9310607910156, "loss": 0.5476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.025463122874498367, "rewards/margins": 0.39216503500938416, "rewards/rejected": -0.41762813925743103, "step": 320 }, { "epoch": 0.75, "grad_norm": 27.509152301102358, "learning_rate": 1.7641138321260257e-08, "logits/chosen": -2.2643046379089355, "logits/rejected": -2.188499689102173, "logps/chosen": -440.750732421875, "logps/rejected": -428.7913513183594, "loss": 0.5541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06256476789712906, "rewards/margins": 0.46622103452682495, "rewards/rejected": -0.5287858247756958, "step": 330 }, { "epoch": 0.77, "grad_norm": 27.197721402060907, "learning_rate": 1.4715029564277793e-08, "logits/chosen": -2.340247631072998, "logits/rejected": -2.2862460613250732, "logps/chosen": -450.05523681640625, "logps/rejected": -448.6039123535156, "loss": 0.5553, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009611198678612709, "rewards/margins": 0.4653921127319336, "rewards/rejected": -0.45578089356422424, "step": 340 }, { "epoch": 0.8, "grad_norm": 23.708931464533432, "learning_rate": 1.2012003751113343e-08, "logits/chosen": -2.325669288635254, "logits/rejected": -2.259887933731079, "logps/chosen": -452.64862060546875, "logps/rejected": -454.2327575683594, "loss": 0.538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17242571711540222, "rewards/margins": 0.4136194586753845, "rewards/rejected": -0.5860452055931091, "step": 350 }, { "epoch": 0.82, "grad_norm": 32.57218388588953, "learning_rate": 9.549150281252633e-09, "logits/chosen": -2.284872531890869, "logits/rejected": -2.2292895317077637, "logps/chosen": -473.66009521484375, "logps/rejected": -472.35919189453125, "loss": 0.547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11653508991003036, "rewards/margins": 0.4531070590019226, "rewards/rejected": -0.5696421265602112, "step": 360 }, { "epoch": 0.84, "grad_norm": 36.6888447736333, "learning_rate": 7.3420401072985306e-09, "logits/chosen": -2.322558641433716, "logits/rejected": -2.267219066619873, "logps/chosen": -451.59295654296875, "logps/rejected": -459.50665283203125, "loss": 0.5582, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14963454008102417, "rewards/margins": 0.4179585874080658, "rewards/rejected": -0.5675932168960571, "step": 370 }, { "epoch": 0.86, "grad_norm": 25.485415219281943, "learning_rate": 5.404627290395369e-09, "logits/chosen": -2.2841172218322754, "logits/rejected": -2.213308334350586, "logps/chosen": -445.1299743652344, "logps/rejected": -450.1717224121094, "loss": 0.5491, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.027432629838585854, "rewards/margins": 0.4644508957862854, "rewards/rejected": -0.4918835163116455, "step": 380 }, { "epoch": 0.89, "grad_norm": 37.42685396157706, "learning_rate": 3.74916077816162e-09, "logits/chosen": -2.292269706726074, "logits/rejected": -2.226890802383423, "logps/chosen": -436.8121643066406, "logps/rejected": -433.2718200683594, "loss": 0.5552, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1562749743461609, "rewards/margins": 0.388277530670166, "rewards/rejected": -0.5445524454116821, "step": 390 }, { "epoch": 0.91, "grad_norm": 24.281602089300215, "learning_rate": 2.386106962899165e-09, "logits/chosen": -2.218071460723877, "logits/rejected": -2.1384034156799316, "logps/chosen": -447.8563537597656, "logps/rejected": -437.1399841308594, "loss": 0.5478, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.219069242477417, "rewards/margins": 0.37984699010849, "rewards/rejected": -0.598916232585907, "step": 400 }, { "epoch": 0.93, "grad_norm": 28.633671701703214, "learning_rate": 1.3240835096913706e-09, "logits/chosen": -2.2583167552948, "logits/rejected": -2.1519083976745605, "logps/chosen": -440.5953063964844, "logps/rejected": -431.39501953125, "loss": 0.5514, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.04258907586336136, "rewards/margins": 0.5055002570152283, "rewards/rejected": -0.5480893850326538, "step": 410 }, { "epoch": 0.96, "grad_norm": 25.241002875120977, "learning_rate": 5.698048727497462e-10, "logits/chosen": -2.2798821926116943, "logits/rejected": -2.2044219970703125, "logps/chosen": -424.9578552246094, "logps/rejected": -435.0152893066406, "loss": 0.5514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11878044903278351, "rewards/margins": 0.5139662623405457, "rewards/rejected": -0.632746696472168, "step": 420 }, { "epoch": 0.98, "grad_norm": 23.361501927411613, "learning_rate": 1.2803984447259387e-10, "logits/chosen": -2.295804977416992, "logits/rejected": -2.235670328140259, "logps/chosen": -475.6675720214844, "logps/rejected": -457.9853515625, "loss": 0.5428, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.12296229600906372, "rewards/margins": 0.4703094959259033, "rewards/rejected": -0.593271791934967, "step": 430 }, { "epoch": 1.0, "step": 439, "total_flos": 0.0, "train_loss": 0.048529281703106095, "train_runtime": 584.5423, "train_samples_per_second": 96.205, "train_steps_per_second": 0.751 } ], "logging_steps": 10, "max_steps": 439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }