{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2089.5337638761494, "learning_rate": 2.5e-09, "logits/chosen": -4.623842239379883, "logits/rejected": -4.85917854309082, "logps/chosen": -239.31422424316406, "logps/rejected": -207.56365966796875, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 2112.4857671472687, "learning_rate": 2.5e-08, "logits/chosen": -4.333562850952148, "logits/rejected": -4.643319129943848, "logps/chosen": -265.2981262207031, "logps/rejected": -215.68804931640625, "loss": 0.7355, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.09561138600111008, "rewards/margins": -0.10567205399274826, "rewards/rejected": 0.010060659609735012, "step": 10 }, { "epoch": 0.05, "grad_norm": 1939.2525079641944, "learning_rate": 5e-08, "logits/chosen": -4.508406162261963, "logits/rejected": -4.7436203956604, "logps/chosen": -267.76934814453125, "logps/rejected": -216.88119506835938, "loss": 0.6656, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08529385179281235, "rewards/margins": 0.22122922539710999, "rewards/rejected": -0.13593538105487823, "step": 20 }, { "epoch": 0.08, "grad_norm": 1485.5526937989268, "learning_rate": 7.5e-08, "logits/chosen": -4.591097354888916, "logits/rejected": -4.771042823791504, "logps/chosen": -257.5138244628906, "logps/rejected": -215.06607055664062, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": 0.5094950795173645, "rewards/margins": 0.7761520147323608, "rewards/rejected": -0.2666569650173187, "step": 30 }, { "epoch": 0.1, "grad_norm": 1059.7800988486467, "learning_rate": 1e-07, "logits/chosen": -4.61653995513916, "logits/rejected": -4.705571174621582, "logps/chosen": -250.05783081054688, "logps/rejected": -220.47665405273438, "loss": 0.3139, "rewards/accuracies": 0.875, "rewards/chosen": 1.7706722021102905, "rewards/margins": 2.1734442710876465, "rewards/rejected": -0.4027720093727112, "step": 40 }, { "epoch": 0.13, "grad_norm": 837.9194721075112, "learning_rate": 9.979985922607475e-08, "logits/chosen": -4.497745513916016, "logits/rejected": -4.6963934898376465, "logps/chosen": -266.4471740722656, "logps/rejected": -227.05908203125, "loss": 0.2475, "rewards/accuracies": 0.875, "rewards/chosen": 2.7611026763916016, "rewards/margins": 3.3548762798309326, "rewards/rejected": -0.5937734246253967, "step": 50 }, { "epoch": 0.15, "grad_norm": 912.9246800740217, "learning_rate": 9.92010391574745e-08, "logits/chosen": -4.585003852844238, "logits/rejected": -4.705927848815918, "logps/chosen": -235.20071411132812, "logps/rejected": -217.2942352294922, "loss": 0.2013, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.9608712196350098, "rewards/margins": 4.097281455993652, "rewards/rejected": -1.1364095211029053, "step": 60 }, { "epoch": 0.18, "grad_norm": 634.4685088072516, "learning_rate": 9.820833372667812e-08, "logits/chosen": -4.462503910064697, "logits/rejected": -4.6857805252075195, "logps/chosen": -246.69186401367188, "logps/rejected": -220.57937622070312, "loss": 0.1884, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 3.2250447273254395, "rewards/margins": 4.633510112762451, "rewards/rejected": -1.4084659814834595, "step": 70 }, { "epoch": 0.2, "grad_norm": 468.8604524803785, "learning_rate": 9.682969016701356e-08, "logits/chosen": -4.449667453765869, "logits/rejected": -4.664923667907715, "logps/chosen": -253.8452606201172, "logps/rejected": -233.0582733154297, "loss": 0.1796, "rewards/accuracies": 0.875, "rewards/chosen": 3.6873557567596436, "rewards/margins": 5.057134628295898, "rewards/rejected": -1.3697788715362549, "step": 80 }, { "epoch": 0.23, "grad_norm": 958.5162002808887, "learning_rate": 9.507614539004081e-08, "logits/chosen": -4.535862445831299, "logits/rejected": -4.733909606933594, "logps/chosen": -243.66317749023438, "logps/rejected": -206.82388305664062, "loss": 0.1733, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.7747840881347656, "rewards/margins": 5.771730899810791, "rewards/rejected": -1.9969465732574463, "step": 90 }, { "epoch": 0.26, "grad_norm": 928.8107393024507, "learning_rate": 9.296173762811083e-08, "logits/chosen": -4.406120777130127, "logits/rejected": -4.672289848327637, "logps/chosen": -248.62539672851562, "logps/rejected": -231.67758178710938, "loss": 0.1833, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 4.046411037445068, "rewards/margins": 6.330681324005127, "rewards/rejected": -2.2842705249786377, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -4.5091094970703125, "eval_logits/rejected": -4.724847316741943, "eval_logps/chosen": -389.6205749511719, "eval_logps/rejected": -515.4835205078125, "eval_loss": 1.8368816375732422, "eval_rewards/accuracies": 0.375, "eval_rewards/chosen": 0.4269474744796753, "eval_rewards/margins": -1.0251328945159912, "eval_rewards/rejected": 1.452080249786377, "eval_runtime": 97.8781, "eval_samples_per_second": 20.434, "eval_steps_per_second": 0.327, "step": 100 }, { "epoch": 0.28, "grad_norm": 759.2228167566217, "learning_rate": 9.050339404945832e-08, "logits/chosen": -4.45731258392334, "logits/rejected": -4.700920581817627, "logps/chosen": -240.77047729492188, "logps/rejected": -220.7100830078125, "loss": 0.1645, "rewards/accuracies": 0.9375, "rewards/chosen": 4.207625865936279, "rewards/margins": 6.219720363616943, "rewards/rejected": -2.012094259262085, "step": 110 }, { "epoch": 0.31, "grad_norm": 615.4147404438793, "learning_rate": 8.77207952455395e-08, "logits/chosen": -4.41110897064209, "logits/rejected": -4.632037162780762, "logps/chosen": -266.83837890625, "logps/rejected": -232.83670043945312, "loss": 0.1648, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 4.575605869293213, "rewards/margins": 6.689634799957275, "rewards/rejected": -2.1140289306640625, "step": 120 }, { "epoch": 0.33, "grad_norm": 1154.0005388666061, "learning_rate": 8.463621767547997e-08, "logits/chosen": -4.474618434906006, "logits/rejected": -4.724778652191162, "logps/chosen": -250.192626953125, "logps/rejected": -220.4983673095703, "loss": 0.1701, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.125626564025879, "rewards/margins": 6.710474967956543, "rewards/rejected": -2.5848488807678223, "step": 130 }, { "epoch": 0.36, "grad_norm": 510.3907389648352, "learning_rate": 8.127435532896387e-08, "logits/chosen": -4.497905254364014, "logits/rejected": -4.757509708404541, "logps/chosen": -276.1819763183594, "logps/rejected": -237.9337921142578, "loss": 0.169, "rewards/accuracies": 0.9375, "rewards/chosen": 4.006547451019287, "rewards/margins": 6.8867011070251465, "rewards/rejected": -2.880154848098755, "step": 140 }, { "epoch": 0.38, "grad_norm": 616.3949177365913, "learning_rate": 7.766212203526569e-08, "logits/chosen": -4.483530521392822, "logits/rejected": -4.700650691986084, "logps/chosen": -244.07785034179688, "logps/rejected": -224.0546417236328, "loss": 0.1668, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.170205116271973, "rewards/margins": 6.6378936767578125, "rewards/rejected": -2.4676883220672607, "step": 150 }, { "epoch": 0.41, "grad_norm": 759.2665515018776, "learning_rate": 7.382843600106538e-08, "logits/chosen": -4.538361072540283, "logits/rejected": -4.685894966125488, "logps/chosen": -243.0140380859375, "logps/rejected": -220.0860137939453, "loss": 0.1473, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.22122859954834, "rewards/margins": 6.459697723388672, "rewards/rejected": -2.238469362258911, "step": 160 }, { "epoch": 0.43, "grad_norm": 688.1440407430587, "learning_rate": 6.980398830195784e-08, "logits/chosen": -4.427027702331543, "logits/rejected": -4.675489902496338, "logps/chosen": -251.1200408935547, "logps/rejected": -225.5527801513672, "loss": 0.1434, "rewards/accuracies": 0.9375, "rewards/chosen": 4.977096080780029, "rewards/margins": 7.851990699768066, "rewards/rejected": -2.874894618988037, "step": 170 }, { "epoch": 0.46, "grad_norm": 572.2642343737211, "learning_rate": 6.562099718102787e-08, "logits/chosen": -4.530760765075684, "logits/rejected": -4.731973171234131, "logps/chosen": -228.52304077148438, "logps/rejected": -202.01510620117188, "loss": 0.1552, "rewards/accuracies": 0.9375, "rewards/chosen": 4.1703057289123535, "rewards/margins": 7.167737007141113, "rewards/rejected": -2.9974308013916016, "step": 180 }, { "epoch": 0.49, "grad_norm": 887.5255514170451, "learning_rate": 6.131295012148612e-08, "logits/chosen": -4.499785423278809, "logits/rejected": -4.621634006500244, "logps/chosen": -251.9990692138672, "logps/rejected": -240.3909149169922, "loss": 0.1634, "rewards/accuracies": 0.90625, "rewards/chosen": 4.446890830993652, "rewards/margins": 7.0593156814575195, "rewards/rejected": -2.6124250888824463, "step": 190 }, { "epoch": 0.51, "grad_norm": 622.6699519046258, "learning_rate": 5.691433575823665e-08, "logits/chosen": -4.48135232925415, "logits/rejected": -4.617772102355957, "logps/chosen": -243.34725952148438, "logps/rejected": -220.18392944335938, "loss": 0.1786, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.395993709564209, "rewards/margins": 6.814687252044678, "rewards/rejected": -2.4186930656433105, "step": 200 }, { "epoch": 0.51, "eval_logits/chosen": -4.4697723388671875, "eval_logits/rejected": -4.687928199768066, "eval_logps/chosen": -389.2646179199219, "eval_logps/rejected": -514.896484375, "eval_loss": 2.016343355178833, "eval_rewards/accuracies": 0.375, "eval_rewards/chosen": 0.6049206256866455, "eval_rewards/margins": -1.140692114830017, "eval_rewards/rejected": 1.745612621307373, "eval_runtime": 97.8297, "eval_samples_per_second": 20.444, "eval_steps_per_second": 0.327, "step": 200 }, { "epoch": 0.54, "grad_norm": 473.28271440178054, "learning_rate": 5.2460367774593905e-08, "logits/chosen": -4.541897773742676, "logits/rejected": -4.740262031555176, "logps/chosen": -255.6215362548828, "logps/rejected": -234.78518676757812, "loss": 0.1232, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 5.131775856018066, "rewards/margins": 8.201360702514648, "rewards/rejected": -3.069584846496582, "step": 210 }, { "epoch": 0.56, "grad_norm": 670.7484200931372, "learning_rate": 4.798670299452925e-08, "logits/chosen": -4.39837646484375, "logits/rejected": -4.688643455505371, "logps/chosen": -253.91787719726562, "logps/rejected": -231.707275390625, "loss": 0.1672, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.446724891662598, "rewards/margins": 7.963796138763428, "rewards/rejected": -3.517070770263672, "step": 220 }, { "epoch": 0.59, "grad_norm": 763.2480410824999, "learning_rate": 4.3529155927297226e-08, "logits/chosen": -4.47940731048584, "logits/rejected": -4.748034954071045, "logps/chosen": -252.20700073242188, "logps/rejected": -230.70425415039062, "loss": 0.1691, "rewards/accuracies": 0.9375, "rewards/chosen": 4.38104248046875, "rewards/margins": 7.8776116371154785, "rewards/rejected": -3.4965691566467285, "step": 230 }, { "epoch": 0.61, "grad_norm": 547.6628902362396, "learning_rate": 3.9123412049691636e-08, "logits/chosen": -4.450512886047363, "logits/rejected": -4.651386260986328, "logps/chosen": -263.7304382324219, "logps/rejected": -227.78604125976562, "loss": 0.1511, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.546363353729248, "rewards/margins": 7.972568511962891, "rewards/rejected": -3.4262046813964844, "step": 240 }, { "epoch": 0.64, "grad_norm": 706.6026662780071, "learning_rate": 3.480474212128766e-08, "logits/chosen": -4.571944236755371, "logits/rejected": -4.786678791046143, "logps/chosen": -240.4440155029297, "logps/rejected": -212.46694946289062, "loss": 0.1403, "rewards/accuracies": 0.9375, "rewards/chosen": 4.128727912902832, "rewards/margins": 6.557607173919678, "rewards/rejected": -2.428879976272583, "step": 250 }, { "epoch": 0.66, "grad_norm": 957.1848027668926, "learning_rate": 3.060771981975726e-08, "logits/chosen": -4.445496082305908, "logits/rejected": -4.674472808837891, "logps/chosen": -244.96701049804688, "logps/rejected": -227.3423614501953, "loss": 0.1506, "rewards/accuracies": 0.90625, "rewards/chosen": 4.540780067443848, "rewards/margins": 8.337722778320312, "rewards/rejected": -3.7969424724578857, "step": 260 }, { "epoch": 0.69, "grad_norm": 1053.6903730937584, "learning_rate": 2.6565944956764818e-08, "logits/chosen": -4.53262996673584, "logits/rejected": -4.71115255355835, "logps/chosen": -252.1263427734375, "logps/rejected": -221.7955322265625, "loss": 0.1551, "rewards/accuracies": 0.9375, "rewards/chosen": 4.453462600708008, "rewards/margins": 7.838715553283691, "rewards/rejected": -3.385251998901367, "step": 270 }, { "epoch": 0.72, "grad_norm": 661.3688193511013, "learning_rate": 2.2711774490274766e-08, "logits/chosen": -4.489356994628906, "logits/rejected": -4.654987812042236, "logps/chosen": -254.680908203125, "logps/rejected": -248.8947296142578, "loss": 0.1253, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.912972450256348, "rewards/margins": 7.8906402587890625, "rewards/rejected": -2.9776668548583984, "step": 280 }, { "epoch": 0.74, "grad_norm": 1908.0202284500367, "learning_rate": 1.9076063486687256e-08, "logits/chosen": -4.361441135406494, "logits/rejected": -4.647955417633057, "logps/chosen": -262.6406555175781, "logps/rejected": -221.6370086669922, "loss": 0.1481, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 4.983874320983887, "rewards/margins": 7.9876885414123535, "rewards/rejected": -3.0038130283355713, "step": 290 }, { "epoch": 0.77, "grad_norm": 734.7948711655655, "learning_rate": 1.5687918106563324e-08, "logits/chosen": -4.47251033782959, "logits/rejected": -4.634402275085449, "logps/chosen": -243.4433135986328, "logps/rejected": -222.4509735107422, "loss": 0.1648, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.982421398162842, "rewards/margins": 8.225171089172363, "rewards/rejected": -3.2427496910095215, "step": 300 }, { "epoch": 0.77, "eval_logits/chosen": -4.4838151931762695, "eval_logits/rejected": -4.69987678527832, "eval_logps/chosen": -390.0736999511719, "eval_logps/rejected": -516.0419921875, "eval_loss": 1.9448436498641968, "eval_rewards/accuracies": 0.3984375, "eval_rewards/chosen": 0.2003953605890274, "eval_rewards/margins": -0.972442626953125, "eval_rewards/rejected": 1.1728378534317017, "eval_runtime": 97.9077, "eval_samples_per_second": 20.427, "eval_steps_per_second": 0.327, "step": 300 }, { "epoch": 0.79, "grad_norm": 701.7075104371141, "learning_rate": 1.257446259144494e-08, "logits/chosen": -4.397843360900879, "logits/rejected": -4.662208557128906, "logps/chosen": -251.11611938476562, "logps/rejected": -229.4883270263672, "loss": 0.1577, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 5.011745929718018, "rewards/margins": 8.656941413879395, "rewards/rejected": -3.6451950073242188, "step": 310 }, { "epoch": 0.82, "grad_norm": 1065.7098800029996, "learning_rate": 9.760622117187234e-09, "logits/chosen": -4.4547929763793945, "logits/rejected": -4.7404327392578125, "logps/chosen": -235.986083984375, "logps/rejected": -213.1405029296875, "loss": 0.1434, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 4.589102745056152, "rewards/margins": 7.9336113929748535, "rewards/rejected": -3.344507932662964, "step": 320 }, { "epoch": 0.84, "grad_norm": 766.1086061468453, "learning_rate": 7.2689232521989885e-09, "logits/chosen": -4.407891750335693, "logits/rejected": -4.665772914886475, "logps/chosen": -258.3376159667969, "logps/rejected": -240.0522003173828, "loss": 0.1405, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.90563440322876, "rewards/margins": 8.58189868927002, "rewards/rejected": -3.6762642860412598, "step": 330 }, { "epoch": 0.87, "grad_norm": 626.4348398301977, "learning_rate": 5.119313618049309e-09, "logits/chosen": -4.429708003997803, "logits/rejected": -4.715014457702637, "logps/chosen": -263.54986572265625, "logps/rejected": -213.69723510742188, "loss": 0.1494, "rewards/accuracies": 0.9375, "rewards/chosen": 5.298083305358887, "rewards/margins": 8.755678176879883, "rewards/rejected": -3.457595109939575, "step": 340 }, { "epoch": 0.9, "grad_norm": 564.1193509438065, "learning_rate": 3.3290021961708158e-09, "logits/chosen": -4.445944309234619, "logits/rejected": -4.576190948486328, "logps/chosen": -247.17697143554688, "logps/rejected": -233.48477172851562, "loss": 0.1576, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 4.14711332321167, "rewards/margins": 7.0045037269592285, "rewards/rejected": -2.8573899269104004, "step": 350 }, { "epoch": 0.92, "grad_norm": 651.5911217726903, "learning_rate": 1.9123215591052013e-09, "logits/chosen": -4.436100482940674, "logits/rejected": -4.62412166595459, "logps/chosen": -253.4558563232422, "logps/rejected": -233.94869995117188, "loss": 0.1582, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.089536190032959, "rewards/margins": 7.332627296447754, "rewards/rejected": -3.243091583251953, "step": 360 }, { "epoch": 0.95, "grad_norm": 610.6809759122384, "learning_rate": 8.806131292167618e-10, "logits/chosen": -4.4610724449157715, "logits/rejected": -4.592678070068359, "logps/chosen": -247.8229217529297, "logps/rejected": -242.95114135742188, "loss": 0.1649, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 4.499147415161133, "rewards/margins": 7.5511980056762695, "rewards/rejected": -3.052050828933716, "step": 370 }, { "epoch": 0.97, "grad_norm": 661.3141861471707, "learning_rate": 2.4213638345040867e-10, "logits/chosen": -4.557965278625488, "logits/rejected": -4.776811122894287, "logps/chosen": -252.97561645507812, "logps/rejected": -227.4269561767578, "loss": 0.1552, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.553546905517578, "rewards/margins": 8.004728317260742, "rewards/rejected": -3.4511806964874268, "step": 380 }, { "epoch": 1.0, "grad_norm": 1178.6266155115975, "learning_rate": 2.0027310073833516e-12, "logits/chosen": -4.544768810272217, "logits/rejected": -4.75381326675415, "logps/chosen": -250.1166534423828, "logps/rejected": -226.92916870117188, "loss": 0.1545, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.777965068817139, "rewards/margins": 7.9850053787231445, "rewards/rejected": -3.2070395946502686, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 0.0, "train_loss": 0.20245660769055263, "train_runtime": 6146.5091, "train_samples_per_second": 8.135, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }