{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988623435722411, "eval_steps": 10000000, "global_step": 439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 22.78893022336454, "learning_rate": 2.2727272727272727e-09, "logits/chosen": -1.6768856048583984, "logits/rejected": -1.7259055376052856, "logps/chosen": -1.2793102264404297, "logps/rejected": -1.2162058353424072, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 25.567365971420195, "learning_rate": 2.2727272727272725e-08, "logits/chosen": -1.7031302452087402, "logits/rejected": -1.6688512563705444, "logps/chosen": -1.213205337524414, "logps/rejected": -1.220165729522705, "loss": 0.693, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.0002006387512665242, "rewards/margins": -0.0009609226835891604, "rewards/rejected": 0.001161561463959515, "step": 10 }, { "epoch": 0.05, "grad_norm": 26.2968319036787, "learning_rate": 4.545454545454545e-08, "logits/chosen": -1.7800958156585693, "logits/rejected": -1.7349421977996826, "logps/chosen": -1.1448661088943481, "logps/rejected": -1.185571312904358, "loss": 0.6924, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0032871514558792114, "rewards/margins": 0.0009361729025840759, "rewards/rejected": -0.004223324358463287, "step": 20 }, { "epoch": 0.07, "grad_norm": 28.64228481569523, "learning_rate": 6.818181818181817e-08, "logits/chosen": -1.744109869003296, "logits/rejected": -1.6754045486450195, "logps/chosen": -1.195277452468872, "logps/rejected": -1.2481118440628052, "loss": 0.6883, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02475227788090706, "rewards/margins": 0.011208651587367058, "rewards/rejected": -0.035960931330919266, "step": 30 }, { "epoch": 0.09, "grad_norm": 19.497764912088467, "learning_rate": 9.09090909090909e-08, "logits/chosen": -1.7310603857040405, "logits/rejected": -1.6648972034454346, "logps/chosen": -1.2513717412948608, "logps/rejected": -1.3350750207901, "loss": 0.6798, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.08884630352258682, "rewards/margins": 0.06275957077741623, "rewards/rejected": -0.15160587430000305, "step": 40 }, { "epoch": 0.11, "grad_norm": 24.1777149048992, "learning_rate": 9.994307990108962e-08, "logits/chosen": -1.6923043727874756, "logits/rejected": -1.6273339986801147, "logps/chosen": -1.2986948490142822, "logps/rejected": -1.356567144393921, "loss": 0.6636, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18121571838855743, "rewards/margins": 0.0755590870976448, "rewards/rejected": -0.25677481293678284, "step": 50 }, { "epoch": 0.14, "grad_norm": 28.137865307641825, "learning_rate": 9.959570405988094e-08, "logits/chosen": -1.7212276458740234, "logits/rejected": -1.6404285430908203, "logps/chosen": -1.2889435291290283, "logps/rejected": -1.3794549703598022, "loss": 0.6598, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.35969024896621704, "rewards/margins": 0.07716657221317291, "rewards/rejected": -0.4368568956851959, "step": 60 }, { "epoch": 0.16, "grad_norm": 21.253003888204287, "learning_rate": 9.893476820924666e-08, "logits/chosen": -1.8007967472076416, "logits/rejected": -1.7120532989501953, "logps/chosen": -1.476588487625122, "logps/rejected": -1.5963420867919922, "loss": 0.6499, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5274931192398071, "rewards/margins": 0.14617758989334106, "rewards/rejected": -0.6736707091331482, "step": 70 }, { "epoch": 0.18, "grad_norm": 26.777703545743453, "learning_rate": 9.796445099843647e-08, "logits/chosen": -1.7857911586761475, "logits/rejected": -1.6999902725219727, "logps/chosen": -1.5350762605667114, "logps/rejected": -1.6896966695785522, "loss": 0.654, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6562157869338989, "rewards/margins": 0.18818075954914093, "rewards/rejected": -0.8443965911865234, "step": 80 }, { "epoch": 0.2, "grad_norm": 25.00309177460346, "learning_rate": 9.669088708527066e-08, "logits/chosen": -1.7340404987335205, "logits/rejected": -1.6651910543441772, "logps/chosen": -1.6042931079864502, "logps/rejected": -1.6743539571762085, "loss": 0.6399, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.737191379070282, "rewards/margins": 0.12603236734867096, "rewards/rejected": -0.8632237315177917, "step": 90 }, { "epoch": 0.23, "grad_norm": 27.969594098965942, "learning_rate": 9.512212835085849e-08, "logits/chosen": -1.772202730178833, "logits/rejected": -1.6827232837677002, "logps/chosen": -1.5563361644744873, "logps/rejected": -1.69924795627594, "loss": 0.6231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7757617831230164, "rewards/margins": 0.20040392875671387, "rewards/rejected": -0.9761656522750854, "step": 100 }, { "epoch": 0.25, "grad_norm": 25.83218161515289, "learning_rate": 9.326809299301306e-08, "logits/chosen": -1.771267294883728, "logits/rejected": -1.6669820547103882, "logps/chosen": -1.600318193435669, "logps/rejected": -1.7721306085586548, "loss": 0.6211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7993522882461548, "rewards/margins": 0.24041876196861267, "rewards/rejected": -1.0397710800170898, "step": 110 }, { "epoch": 0.27, "grad_norm": 26.667996957961257, "learning_rate": 9.114050282021158e-08, "logits/chosen": -1.767559289932251, "logits/rejected": -1.7058799266815186, "logps/chosen": -1.565065622329712, "logps/rejected": -1.7299690246582031, "loss": 0.6144, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8350059390068054, "rewards/margins": 0.2350219041109085, "rewards/rejected": -1.0700278282165527, "step": 120 }, { "epoch": 0.3, "grad_norm": 25.330536994335382, "learning_rate": 8.875280914254802e-08, "logits/chosen": -1.765298843383789, "logits/rejected": -1.6749998331069946, "logps/chosen": -1.7646329402923584, "logps/rejected": -1.9669040441513062, "loss": 0.5993, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1436058282852173, "rewards/margins": 0.34769219160079956, "rewards/rejected": -1.491297960281372, "step": 130 }, { "epoch": 0.32, "grad_norm": 24.002049854639584, "learning_rate": 8.612010772821971e-08, "logits/chosen": -1.7989845275878906, "logits/rejected": -1.7554515600204468, "logps/chosen": -1.8291784524917603, "logps/rejected": -1.9761606454849243, "loss": 0.5991, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2305986881256104, "rewards/margins": 0.3437841534614563, "rewards/rejected": -1.5743829011917114, "step": 140 }, { "epoch": 0.34, "grad_norm": 28.7685379938573, "learning_rate": 8.325904336322055e-08, "logits/chosen": -1.777856469154358, "logits/rejected": -1.7238337993621826, "logps/chosen": -1.9271425008773804, "logps/rejected": -2.141960859298706, "loss": 0.6082, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5195552110671997, "rewards/margins": 0.3352898359298706, "rewards/rejected": -1.8548450469970703, "step": 150 }, { "epoch": 0.36, "grad_norm": 22.36425985091183, "learning_rate": 8.01877046176447e-08, "logits/chosen": -1.72470223903656, "logits/rejected": -1.6554687023162842, "logps/chosen": -1.9798545837402344, "logps/rejected": -2.1781439781188965, "loss": 0.5901, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6031091213226318, "rewards/margins": 0.29380664229393005, "rewards/rejected": -1.8969157934188843, "step": 160 }, { "epoch": 0.39, "grad_norm": 19.908864367961982, "learning_rate": 7.692550948392249e-08, "logits/chosen": -1.7723356485366821, "logits/rejected": -1.7198549509048462, "logps/chosen": -1.9558120965957642, "logps/rejected": -2.159667491912842, "loss": 0.5921, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4911130666732788, "rewards/margins": 0.3868214190006256, "rewards/rejected": -1.877934217453003, "step": 170 }, { "epoch": 0.41, "grad_norm": 31.864529340507623, "learning_rate": 7.349308261002021e-08, "logits/chosen": -1.7342097759246826, "logits/rejected": -1.6827507019042969, "logps/chosen": -1.9941844940185547, "logps/rejected": -2.22194504737854, "loss": 0.5855, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5941965579986572, "rewards/margins": 0.35622045397758484, "rewards/rejected": -1.950416922569275, "step": 180 }, { "epoch": 0.43, "grad_norm": 24.691918932601595, "learning_rate": 6.991212490377531e-08, "logits/chosen": -1.7905107736587524, "logits/rejected": -1.7474826574325562, "logps/chosen": -2.10019850730896, "logps/rejected": -2.3696181774139404, "loss": 0.5604, "rewards/accuracies": 0.75, "rewards/chosen": -1.740191102027893, "rewards/margins": 0.4957551956176758, "rewards/rejected": -2.2359461784362793, "step": 190 }, { "epoch": 0.46, "grad_norm": 30.11763021258028, "learning_rate": 6.620527633276978e-08, "logits/chosen": -1.7320966720581055, "logits/rejected": -1.6665375232696533, "logps/chosen": -2.161935329437256, "logps/rejected": -2.522731304168701, "loss": 0.5808, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9349641799926758, "rewards/margins": 0.5480148792266846, "rewards/rejected": -2.4829792976379395, "step": 200 }, { "epoch": 0.48, "grad_norm": 27.459036499855436, "learning_rate": 6.239597278716581e-08, "logits/chosen": -1.7859094142913818, "logits/rejected": -1.7306747436523438, "logps/chosen": -2.2954204082489014, "logps/rejected": -2.540391445159912, "loss": 0.5619, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1001808643341064, "rewards/margins": 0.5312689542770386, "rewards/rejected": -2.6314499378204346, "step": 210 }, { "epoch": 0.5, "grad_norm": 26.562029580561404, "learning_rate": 5.8508297910462456e-08, "logits/chosen": -1.7313369512557983, "logits/rejected": -1.6572465896606445, "logps/chosen": -2.212656021118164, "logps/rejected": -2.5422561168670654, "loss": 0.5673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0982813835144043, "rewards/margins": 0.5436533689498901, "rewards/rejected": -2.641934871673584, "step": 220 }, { "epoch": 0.52, "grad_norm": 26.043734197401424, "learning_rate": 5.456683083494731e-08, "logits/chosen": -1.7218725681304932, "logits/rejected": -1.6862335205078125, "logps/chosen": -2.1797163486480713, "logps/rejected": -2.389193058013916, "loss": 0.5843, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9718148708343506, "rewards/margins": 0.37147068977355957, "rewards/rejected": -2.34328556060791, "step": 230 }, { "epoch": 0.55, "grad_norm": 33.60349201324705, "learning_rate": 5.059649078450834e-08, "logits/chosen": -1.7049477100372314, "logits/rejected": -1.6663001775741577, "logps/chosen": -2.2287259101867676, "logps/rejected": -2.5061872005462646, "loss": 0.5527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.120744466781616, "rewards/margins": 0.4308454990386963, "rewards/rejected": -2.5515899658203125, "step": 240 }, { "epoch": 0.57, "grad_norm": 24.721682812448297, "learning_rate": 4.6622379527277186e-08, "logits/chosen": -1.716957688331604, "logits/rejected": -1.6692520380020142, "logps/chosen": -2.323068857192993, "logps/rejected": -2.551687240600586, "loss": 0.5645, "rewards/accuracies": 0.625, "rewards/chosen": -2.3683600425720215, "rewards/margins": 0.3910773694515228, "rewards/rejected": -2.759437322616577, "step": 250 }, { "epoch": 0.59, "grad_norm": 28.965518895994943, "learning_rate": 4.26696226741691e-08, "logits/chosen": -1.731431007385254, "logits/rejected": -1.6687753200531006, "logps/chosen": -2.4228413105010986, "logps/rejected": -2.701592206954956, "loss": 0.5647, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.380035877227783, "rewards/margins": 0.5175679922103882, "rewards/rejected": -2.897603750228882, "step": 260 }, { "epoch": 0.61, "grad_norm": 24.276560436452733, "learning_rate": 3.876321082668098e-08, "logits/chosen": -1.7877088785171509, "logits/rejected": -1.7258117198944092, "logps/chosen": -2.3104138374328613, "logps/rejected": -2.597568988800049, "loss": 0.5577, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.143846035003662, "rewards/margins": 0.5556932091712952, "rewards/rejected": -2.6995394229888916, "step": 270 }, { "epoch": 0.64, "grad_norm": 26.01398581499373, "learning_rate": 3.492784157826244e-08, "logits/chosen": -1.7255363464355469, "logits/rejected": -1.6368858814239502, "logps/chosen": -2.307375431060791, "logps/rejected": -2.6467177867889404, "loss": 0.5569, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1689984798431396, "rewards/margins": 0.6079045534133911, "rewards/rejected": -2.7769031524658203, "step": 280 }, { "epoch": 0.66, "grad_norm": 22.449000475495634, "learning_rate": 3.118776336817812e-08, "logits/chosen": -1.7589473724365234, "logits/rejected": -1.7079540491104126, "logps/chosen": -2.2426087856292725, "logps/rejected": -2.575594425201416, "loss": 0.5461, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0787200927734375, "rewards/margins": 0.6515394449234009, "rewards/rejected": -2.730259656906128, "step": 290 }, { "epoch": 0.68, "grad_norm": 23.916342153040954, "learning_rate": 2.7566622175067443e-08, "logits/chosen": -1.7405074834823608, "logits/rejected": -1.6836473941802979, "logps/chosen": -2.341914176940918, "logps/rejected": -2.724348545074463, "loss": 0.5507, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2666609287261963, "rewards/margins": 0.6088961958885193, "rewards/rejected": -2.875556707382202, "step": 300 }, { "epoch": 0.71, "grad_norm": 28.87016818276154, "learning_rate": 2.408731201945432e-08, "logits/chosen": -1.738867998123169, "logits/rejected": -1.6946017742156982, "logps/chosen": -2.2607665061950684, "logps/rejected": -2.508131742477417, "loss": 0.5549, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.133742332458496, "rewards/margins": 0.45442089438438416, "rewards/rejected": -2.5881636142730713, "step": 310 }, { "epoch": 0.73, "grad_norm": 26.474891102018546, "learning_rate": 2.0771830220378112e-08, "logits/chosen": -1.6925548315048218, "logits/rejected": -1.6349446773529053, "logps/chosen": -2.292504072189331, "logps/rejected": -2.5281872749328613, "loss": 0.5545, "rewards/accuracies": 0.625, "rewards/chosen": -2.139239549636841, "rewards/margins": 0.48706698417663574, "rewards/rejected": -2.6263065338134766, "step": 320 }, { "epoch": 0.75, "grad_norm": 26.153240463475917, "learning_rate": 1.7641138321260257e-08, "logits/chosen": -1.7273342609405518, "logits/rejected": -1.6638519763946533, "logps/chosen": -2.218357563018799, "logps/rejected": -2.6145200729370117, "loss": 0.5485, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.069005250930786, "rewards/margins": 0.7372555136680603, "rewards/rejected": -2.806260585784912, "step": 330 }, { "epoch": 0.77, "grad_norm": 26.27728864294511, "learning_rate": 1.4715029564277793e-08, "logits/chosen": -1.7901878356933594, "logits/rejected": -1.7485193014144897, "logps/chosen": -2.2015347480773926, "logps/rejected": -2.5572023391723633, "loss": 0.5577, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9937489032745361, "rewards/margins": 0.637208104133606, "rewards/rejected": -2.6309566497802734, "step": 340 }, { "epoch": 0.8, "grad_norm": 26.035613152091976, "learning_rate": 1.2012003751113343e-08, "logits/chosen": -1.7805286645889282, "logits/rejected": -1.7256838083267212, "logps/chosen": -2.3138480186462402, "logps/rejected": -2.6945948600769043, "loss": 0.536, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3033559322357178, "rewards/margins": 0.6611131429672241, "rewards/rejected": -2.9644687175750732, "step": 350 }, { "epoch": 0.82, "grad_norm": 31.4301067051425, "learning_rate": 9.549150281252633e-09, "logits/chosen": -1.7304834127426147, "logits/rejected": -1.6880794763565063, "logps/chosen": -2.2502734661102295, "logps/rejected": -2.593324661254883, "loss": 0.54, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1562764644622803, "rewards/margins": 0.5826417207717896, "rewards/rejected": -2.7389183044433594, "step": 360 }, { "epoch": 0.84, "grad_norm": 25.13509986712804, "learning_rate": 7.3420401072985306e-09, "logits/chosen": -1.7723455429077148, "logits/rejected": -1.721980094909668, "logps/chosen": -2.3107597827911377, "logps/rejected": -2.679028272628784, "loss": 0.5374, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1922922134399414, "rewards/margins": 0.6216103434562683, "rewards/rejected": -2.8139023780822754, "step": 370 }, { "epoch": 0.86, "grad_norm": 28.235691242335875, "learning_rate": 5.404627290395369e-09, "logits/chosen": -1.733109712600708, "logits/rejected": -1.6734564304351807, "logps/chosen": -2.241391897201538, "logps/rejected": -2.590752363204956, "loss": 0.5365, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.103571653366089, "rewards/margins": 0.6187294721603394, "rewards/rejected": -2.7223010063171387, "step": 380 }, { "epoch": 0.89, "grad_norm": 33.12956908569685, "learning_rate": 3.74916077816162e-09, "logits/chosen": -1.7393004894256592, "logits/rejected": -1.6843922138214111, "logps/chosen": -2.267721652984619, "logps/rejected": -2.5625317096710205, "loss": 0.5513, "rewards/accuracies": 0.6875, "rewards/chosen": -2.235853672027588, "rewards/margins": 0.5573619604110718, "rewards/rejected": -2.7932159900665283, "step": 390 }, { "epoch": 0.91, "grad_norm": 28.33250387056753, "learning_rate": 2.386106962899165e-09, "logits/chosen": -1.666548490524292, "logits/rejected": -1.6048628091812134, "logps/chosen": -2.401040554046631, "logps/rejected": -2.733664035797119, "loss": 0.5422, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.349238872528076, "rewards/margins": 0.5525677800178528, "rewards/rejected": -2.901806592941284, "step": 400 }, { "epoch": 0.93, "grad_norm": 31.32317842504756, "learning_rate": 1.3240835096913706e-09, "logits/chosen": -1.6938998699188232, "logits/rejected": -1.6031659841537476, "logps/chosen": -2.251462459564209, "logps/rejected": -2.6835711002349854, "loss": 0.5597, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1598236560821533, "rewards/margins": 0.7253878116607666, "rewards/rejected": -2.885211706161499, "step": 410 }, { "epoch": 0.96, "grad_norm": 23.813879321456167, "learning_rate": 5.698048727497462e-10, "logits/chosen": -1.7287553548812866, "logits/rejected": -1.6634016036987305, "logps/chosen": -2.358034610748291, "logps/rejected": -2.7596287727355957, "loss": 0.5431, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.3334739208221436, "rewards/margins": 0.669273853302002, "rewards/rejected": -3.0027480125427246, "step": 420 }, { "epoch": 0.98, "grad_norm": 24.13495374841747, "learning_rate": 1.2803984447259387e-10, "logits/chosen": -1.7348169088363647, "logits/rejected": -1.6923096179962158, "logps/chosen": -2.339822769165039, "logps/rejected": -2.7242085933685303, "loss": 0.5338, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.314966917037964, "rewards/margins": 0.6734664440155029, "rewards/rejected": -2.9884331226348877, "step": 430 }, { "epoch": 1.0, "step": 439, "total_flos": 0.0, "train_loss": 0.5863300847029632, "train_runtime": 6843.0852, "train_samples_per_second": 8.218, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }