{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 10,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 298.3437587738037,
      "epoch": 0.013333333333333334,
      "grad_norm": 0.16560879349708557,
      "kl": 0.00018159300088882446,
      "learning_rate": 4e-07,
      "loss": 0.0049,
      "reward": 0.15208333674818278,
      "reward_std": 0.2552751675248146,
      "rewards/accuracy_reward": 0.06666666846722365,
      "rewards/format_reward": 0.08541666883975267,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 314.44792423248293,
      "epoch": 0.02666666666666667,
      "grad_norm": 0.2575075328350067,
      "kl": 0.0009123936295509338,
      "learning_rate": 8e-07,
      "loss": 0.0058,
      "reward": 0.20625000558793544,
      "reward_std": 0.2727560464292765,
      "rewards/accuracy_reward": 0.06458333488553762,
      "rewards/format_reward": 0.14166667070239783,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 252.6500078201294,
      "epoch": 0.04,
      "grad_norm": 0.46988776326179504,
      "kl": 0.03377872705459595,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.0205,
      "reward": 0.5270833471789956,
      "reward_std": 0.4601708807051182,
      "rewards/accuracy_reward": 0.07916666902601718,
      "rewards/format_reward": 0.44791667852550743,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 242.21250705718995,
      "epoch": 0.05333333333333334,
      "grad_norm": 0.19061416387557983,
      "kl": 0.06111354827880859,
      "learning_rate": 1.6e-06,
      "loss": 0.048,
      "reward": 0.8041666891425848,
      "reward_std": 0.4267156172543764,
      "rewards/accuracy_reward": 0.06250000167638063,
      "rewards/format_reward": 0.7416666835546494,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 197.73125562667846,
      "epoch": 0.06666666666666667,
      "grad_norm": 0.2476302981376648,
      "kl": 0.0408905029296875,
      "learning_rate": 2e-06,
      "loss": 0.021,
      "reward": 0.8895833583548665,
      "reward_std": 0.4103152878582478,
      "rewards/accuracy_reward": 0.09375000242143869,
      "rewards/format_reward": 0.7958333509042859,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 198.86250495910645,
      "epoch": 0.08,
      "grad_norm": 0.13683611154556274,
      "kl": 0.0466217041015625,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.0316,
      "reward": 0.9916666936129331,
      "reward_std": 0.36507078595459463,
      "rewards/accuracy_reward": 0.11666666958481073,
      "rewards/format_reward": 0.8750000182539225,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 225.4541726589203,
      "epoch": 0.09333333333333334,
      "grad_norm": 0.17669692635536194,
      "kl": 0.06876106262207031,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0388,
      "reward": 1.0520833637565374,
      "reward_std": 0.4207858666777611,
      "rewards/accuracy_reward": 0.19375000540167092,
      "rewards/format_reward": 0.8583333497866988,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 194.30625553131102,
      "epoch": 0.10666666666666667,
      "grad_norm": 0.15089309215545654,
      "kl": 0.08316802978515625,
      "learning_rate": 2.9995938617691924e-06,
      "loss": 0.03,
      "reward": 1.1625000346451997,
      "reward_std": 0.44051293805241587,
      "rewards/accuracy_reward": 0.2520833399146795,
      "rewards/format_reward": 0.91041667945683,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 202.31667232513428,
      "epoch": 0.12,
      "grad_norm": 0.2549769878387451,
      "kl": 0.06892852783203125,
      "learning_rate": 2.9963460753897363e-06,
      "loss": 0.0229,
      "reward": 1.1770833685994149,
      "reward_std": 0.4563848368823528,
      "rewards/accuracy_reward": 0.2687500063329935,
      "rewards/format_reward": 0.9083333484828472,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 215.91458954811097,
      "epoch": 0.13333333333333333,
      "grad_norm": 3.687699556350708,
      "kl": 0.1230712890625,
      "learning_rate": 2.989857536612915e-06,
      "loss": 0.0166,
      "reward": 1.235416704416275,
      "reward_std": 0.4576607421040535,
      "rewards/accuracy_reward": 0.32083334121853113,
      "rewards/format_reward": 0.914583346247673,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 231.20208988189697,
      "epoch": 0.14666666666666667,
      "grad_norm": 0.19754290580749512,
      "kl": 0.09471588134765625,
      "learning_rate": 2.980142298168869e-06,
      "loss": 0.0774,
      "reward": 1.204166702181101,
      "reward_std": 0.5094456784427166,
      "rewards/accuracy_reward": 0.329166673310101,
      "rewards/format_reward": 0.8750000186264515,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 210.604172039032,
      "epoch": 0.16,
      "grad_norm": 0.9343002438545227,
      "kl": 0.10347900390625,
      "learning_rate": 2.9672214011007086e-06,
      "loss": 0.0425,
      "reward": 1.2375000335276127,
      "reward_std": 0.5164696607738734,
      "rewards/accuracy_reward": 0.35208334103226663,
      "rewards/format_reward": 0.8854166835546493,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 254.06667470932007,
      "epoch": 0.17333333333333334,
      "grad_norm": 0.25169649720191956,
      "kl": 0.148199462890625,
      "learning_rate": 2.951122829194296e-06,
      "loss": 0.0796,
      "reward": 1.1187500342726708,
      "reward_std": 0.5015288021415472,
      "rewards/accuracy_reward": 0.26250000670552254,
      "rewards/format_reward": 0.8562500193715096,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 233.81875615119935,
      "epoch": 0.18666666666666668,
      "grad_norm": 0.7912704348564148,
      "kl": 2.357049560546875,
      "learning_rate": 2.9318814483715983e-06,
      "loss": 0.1295,
      "reward": 0.9895833631977439,
      "reward_std": 0.5210714556276799,
      "rewards/accuracy_reward": 0.26250000577419996,
      "rewards/format_reward": 0.7270833453163504,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 227.87708988189698,
      "epoch": 0.2,
      "grad_norm": 1.25613272190094,
      "kl": 0.46229248046875,
      "learning_rate": 2.9095389311788626e-06,
      "loss": 0.0646,
      "reward": 1.129166703671217,
      "reward_std": 0.49879880994558334,
      "rewards/accuracy_reward": 0.2687500067055225,
      "rewards/format_reward": 0.8604166861623526,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 194.6770890235901,
      "epoch": 0.21333333333333335,
      "grad_norm": 0.4988195300102234,
      "kl": 0.613861083984375,
      "learning_rate": 2.8841436665331635e-06,
      "loss": 0.0366,
      "reward": 1.2395833693444729,
      "reward_std": 0.48180873580276967,
      "rewards/accuracy_reward": 0.31666667591780423,
      "rewards/format_reward": 0.922916679084301,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 168.8062547683716,
      "epoch": 0.22666666666666666,
      "grad_norm": 0.9171387553215027,
      "kl": 1.17796630859375,
      "learning_rate": 2.855750654922781e-06,
      "loss": 0.0156,
      "reward": 1.1541667036712169,
      "reward_std": 0.5033410575240851,
      "rewards/accuracy_reward": 0.25416667219251393,
      "rewards/format_reward": 0.9000000156462192,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 241.8708402633667,
      "epoch": 0.24,
      "grad_norm": 25.64914894104004,
      "kl": 114.61384887695313,
      "learning_rate": 2.8244213892883906e-06,
      "loss": 2.7595,
      "reward": 1.181250037252903,
      "reward_std": 0.51867739520967,
      "rewards/accuracy_reward": 0.31041667331010103,
      "rewards/format_reward": 0.8708333529531955,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 314.0875092506409,
      "epoch": 0.25333333333333335,
      "grad_norm": 0.8180872797966003,
      "kl": 2.062310791015625,
      "learning_rate": 2.7902237218430485e-06,
      "loss": 0.1151,
      "reward": 1.1354166999459268,
      "reward_std": 0.5186546068638563,
      "rewards/accuracy_reward": 0.26458334010094403,
      "rewards/format_reward": 0.8708333499729634,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 271.76875801086425,
      "epoch": 0.26666666666666666,
      "grad_norm": 2.5058634281158447,
      "kl": 1.689434814453125,
      "learning_rate": 2.753231717119405e-06,
      "loss": 0.0826,
      "reward": 1.1770833715796472,
      "reward_std": 0.5468059632927179,
      "rewards/accuracy_reward": 0.3104166744276881,
      "rewards/format_reward": 0.8666666835546494,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 189.7958378791809,
      "epoch": 0.28,
      "grad_norm": 0.8333961367607117,
      "kl": 2.188385009765625,
      "learning_rate": 2.713525491562421e-06,
      "loss": 0.159,
      "reward": 1.1291666947305203,
      "reward_std": 0.48742703087627887,
      "rewards/accuracy_reward": 0.2375000048428774,
      "rewards/format_reward": 0.8916666850447654,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 169.8958384513855,
      "epoch": 0.29333333333333333,
      "grad_norm": 0.48866531252861023,
      "kl": 1.365594482421875,
      "learning_rate": 2.671191040014989e-06,
      "loss": 0.1571,
      "reward": 1.214583370089531,
      "reward_std": 0.45735101476311685,
      "rewards/accuracy_reward": 0.2812500067055225,
      "rewards/format_reward": 0.9333333462476731,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 210.16042280197144,
      "epoch": 0.30666666666666664,
      "grad_norm": 0.4208928346633911,
      "kl": 1.861163330078125,
      "learning_rate": 2.626320049472249e-06,
      "loss": 0.2113,
      "reward": 1.1687500353902578,
      "reward_std": 0.48322329856455326,
      "rewards/accuracy_reward": 0.3020833406597376,
      "rewards/format_reward": 0.866666679084301,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 213.62292337417603,
      "epoch": 0.32,
      "grad_norm": 0.9921014308929443,
      "kl": 5.52261962890625,
      "learning_rate": 2.5790097005079765e-06,
      "loss": 0.4186,
      "reward": 1.1104166995733977,
      "reward_std": 0.5192686680704355,
      "rewards/accuracy_reward": 0.26250000651925803,
      "rewards/format_reward": 0.8479166831821203,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 173.20625500679017,
      "epoch": 0.3333333333333333,
      "grad_norm": 0.5788355469703674,
      "kl": 0.586700439453125,
      "learning_rate": 2.529362456803101e-06,
      "loss": 0.0873,
      "reward": 1.2354166999459266,
      "reward_std": 0.5144322618842125,
      "rewards/accuracy_reward": 0.32708334140479567,
      "rewards/format_reward": 0.9083333492279053,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 170.66458702087402,
      "epoch": 0.3466666666666667,
      "grad_norm": 1.29031503200531,
      "kl": 3.8255126953125,
      "learning_rate": 2.477485843232183e-06,
      "loss": 0.2837,
      "reward": 1.2604167006909848,
      "reward_std": 0.5114245742559433,
      "rewards/accuracy_reward": 0.3416666738688946,
      "rewards/format_reward": 0.918750012665987,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 167.50208835601808,
      "epoch": 0.36,
      "grad_norm": 0.6127444505691528,
      "kl": 0.5264404296875,
      "learning_rate": 2.4234922129884873e-06,
      "loss": 0.0213,
      "reward": 1.3562500387430192,
      "reward_std": 0.42282451651990416,
      "rewards/accuracy_reward": 0.38958334233611824,
      "rewards/format_reward": 0.9666666738688946,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 177.09583835601808,
      "epoch": 0.37333333333333335,
      "grad_norm": 0.6640939712524414,
      "kl": 1.479010009765625,
      "learning_rate": 2.36749850425198e-06,
      "loss": 0.1688,
      "reward": 1.250000035762787,
      "reward_std": 0.4655508127063513,
      "rewards/accuracy_reward": 0.3479166755452752,
      "rewards/format_reward": 0.9020833499729634,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 198.85000591278077,
      "epoch": 0.38666666666666666,
      "grad_norm": 0.14313329756259918,
      "kl": 1.668316650390625,
      "learning_rate": 2.3096259869272697e-06,
      "loss": 0.1292,
      "reward": 1.289583370834589,
      "reward_std": 0.47122009098529816,
      "rewards/accuracy_reward": 0.3645833419635892,
      "rewards/format_reward": 0.9250000134110451,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 230.99375743865966,
      "epoch": 0.4,
      "grad_norm": 0.10921728610992432,
      "kl": 0.85283203125,
      "learning_rate": 2.25e-06,
      "loss": 0.0644,
      "reward": 1.3041667066514493,
      "reward_std": 0.4090763859450817,
      "rewards/accuracy_reward": 0.37500000894069674,
      "rewards/format_reward": 0.9291666768491268,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 270.9208402633667,
      "epoch": 0.41333333333333333,
      "grad_norm": 1.005725383758545,
      "kl": 0.2567535400390625,
      "learning_rate": 2.1887496800805174e-06,
      "loss": 0.035,
      "reward": 1.333333370089531,
      "reward_std": 0.46985283866524696,
      "rewards/accuracy_reward": 0.3937500087544322,
      "rewards/format_reward": 0.939583346992731,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 238.92500734329224,
      "epoch": 0.4266666666666667,
      "grad_norm": 1.076439380645752,
      "kl": 1.3035552978515625,
      "learning_rate": 2.126007681722727e-06,
      "loss": 0.1024,
      "reward": 1.2500000342726707,
      "reward_std": 0.48043784201145173,
      "rewards/accuracy_reward": 0.35625000931322576,
      "rewards/format_reward": 0.8937500141561031,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 414.82501125335693,
      "epoch": 0.44,
      "grad_norm": 0.6154415011405945,
      "kl": 4.590509033203125,
      "learning_rate": 2.061909890123868e-06,
      "loss": 0.2873,
      "reward": 0.777083358168602,
      "reward_std": 0.4854964125901461,
      "rewards/accuracy_reward": 0.22083333935588598,
      "rewards/format_reward": 0.5562500126659871,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 237.14375581741334,
      "epoch": 0.4533333333333333,
      "grad_norm": 0.4070974886417389,
      "kl": 0.4917999267578125,
      "learning_rate": 1.9965951268274372e-06,
      "loss": 0.0908,
      "reward": 1.347916703671217,
      "reward_std": 0.5186698414385319,
      "rewards/accuracy_reward": 0.4395833432674408,
      "rewards/format_reward": 0.9083333507180213,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 211.02292304039003,
      "epoch": 0.4666666666666667,
      "grad_norm": 0.7467241883277893,
      "kl": 1.289691162109375,
      "learning_rate": 1.9302048490666355e-06,
      "loss": 0.1216,
      "reward": 1.3833333637565375,
      "reward_std": 0.4452923540025949,
      "rewards/accuracy_reward": 0.4604166755452752,
      "rewards/format_reward": 0.9229166809469461,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 207.78333921432494,
      "epoch": 0.48,
      "grad_norm": 0.5213920474052429,
      "kl": 0.3299652099609375,
      "learning_rate": 1.8628828433995015e-06,
      "loss": 0.0802,
      "reward": 1.354166705906391,
      "reward_std": 0.43137194626033304,
      "rewards/accuracy_reward": 0.4145833427086473,
      "rewards/format_reward": 0.9395833432674408,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 214.07083945274354,
      "epoch": 0.49333333333333335,
      "grad_norm": 11.964993476867676,
      "kl": 1.5061492919921875,
      "learning_rate": 1.7947749142992453e-06,
      "loss": 0.1615,
      "reward": 1.2437500320374966,
      "reward_std": 0.4532905198633671,
      "rewards/accuracy_reward": 0.3145833395421505,
      "rewards/format_reward": 0.9291666775941849,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 231.10625724792482,
      "epoch": 0.5066666666666667,
      "grad_norm": 0.7543414235115051,
      "kl": 0.93128662109375,
      "learning_rate": 1.7260285683742248e-06,
      "loss": 0.1419,
      "reward": 1.3000000320374965,
      "reward_std": 0.5549088928848505,
      "rewards/accuracy_reward": 0.4020833412185311,
      "rewards/format_reward": 0.8979166842997074,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 234.94167280197144,
      "epoch": 0.52,
      "grad_norm": 1.8742121458053589,
      "kl": 37.191220092773435,
      "learning_rate": 1.6567926949014804e-06,
      "loss": 0.5318,
      "reward": 1.3187500394880771,
      "reward_std": 0.5167587421834469,
      "rewards/accuracy_reward": 0.40833334214985373,
      "rewards/format_reward": 0.9104166820645332,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 188.943754863739,
      "epoch": 0.5333333333333333,
      "grad_norm": 0.14478746056556702,
      "kl": 0.573187255859375,
      "learning_rate": 1.5872172433657137e-06,
      "loss": 0.0723,
      "reward": 1.3020833745598792,
      "reward_std": 0.4792703174054623,
      "rewards/accuracy_reward": 0.3645833421498537,
      "rewards/format_reward": 0.9375000111758709,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 178.75833768844603,
      "epoch": 0.5466666666666666,
      "grad_norm": 1.8087667226791382,
      "kl": 0.8369659423828125,
      "learning_rate": 1.5174528987020958e-06,
      "loss": 0.0923,
      "reward": 1.3125000409781933,
      "reward_std": 0.48596611246466637,
      "rewards/accuracy_reward": 0.39166667591780424,
      "rewards/format_reward": 0.9208333469927311,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 256.539590215683,
      "epoch": 0.56,
      "grad_norm": 0.5763727426528931,
      "kl": 2.3948883056640624,
      "learning_rate": 1.4476507549462489e-06,
      "loss": 0.2606,
      "reward": 1.1541667029261589,
      "reward_std": 0.6029179282486439,
      "rewards/accuracy_reward": 0.35000000819563865,
      "rewards/format_reward": 0.8041666833683848,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 163.55833940505983,
      "epoch": 0.5733333333333334,
      "grad_norm": 0.23491673171520233,
      "kl": 0.215203857421875,
      "learning_rate": 1.3779619879982127e-06,
      "loss": 0.023,
      "reward": 1.420833373069763,
      "reward_std": 0.41848115585744383,
      "rewards/accuracy_reward": 0.4520833443850279,
      "rewards/format_reward": 0.9687500074505806,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 183.7666721343994,
      "epoch": 0.5866666666666667,
      "grad_norm": 0.1774040013551712,
      "kl": 0.300738525390625,
      "learning_rate": 1.308537528209108e-06,
      "loss": 0.0643,
      "reward": 1.4270833656191826,
      "reward_std": 0.4158630233258009,
      "rewards/accuracy_reward": 0.46666667573153975,
      "rewards/format_reward": 0.9604166753590107,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 212.7937566280365,
      "epoch": 0.6,
      "grad_norm": 0.20609110593795776,
      "kl": 0.5560638427734375,
      "learning_rate": 1.2395277334996047e-06,
      "loss": 0.0963,
      "reward": 1.3229166999459268,
      "reward_std": 0.4623309187591076,
      "rewards/accuracy_reward": 0.3937500096857548,
      "rewards/format_reward": 0.9291666783392429,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 189.80208778381348,
      "epoch": 0.6133333333333333,
      "grad_norm": 0.49082836508750916,
      "kl": 0.7411895751953125,
      "learning_rate": 1.1710820637181448e-06,
      "loss": 0.0805,
      "reward": 1.320833370834589,
      "reward_std": 0.4459747776389122,
      "rewards/accuracy_reward": 0.37916667480021715,
      "rewards/format_reward": 0.9416666768491269,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 194.38542304039,
      "epoch": 0.6266666666666667,
      "grad_norm": 0.48382291197776794,
      "kl": 0.4151397705078125,
      "learning_rate": 1.103348756944197e-06,
      "loss": 0.0534,
      "reward": 1.3812500394880771,
      "reward_std": 0.480151966586709,
      "rewards/accuracy_reward": 0.4250000108033419,
      "rewards/format_reward": 0.9562500081956387,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 220.32292351722717,
      "epoch": 0.64,
      "grad_norm": 0.2328379899263382,
      "kl": 0.377593994140625,
      "learning_rate": 1.036474508437579e-06,
      "loss": 0.0734,
      "reward": 1.318750038743019,
      "reward_std": 0.47938378117978575,
      "rewards/accuracy_reward": 0.37708334289491174,
      "rewards/format_reward": 0.9416666783392429,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 213.46458911895752,
      "epoch": 0.6533333333333333,
      "grad_norm": 0.6423675417900085,
      "kl": 0.6955718994140625,
      "learning_rate": 9.70604152929197e-07,
      "loss": 0.1096,
      "reward": 1.3416667044162751,
      "reward_std": 0.46137820817530156,
      "rewards/accuracy_reward": 0.4145833419635892,
      "rewards/format_reward": 0.9270833462476731,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 214.08125677108765,
      "epoch": 0.6666666666666666,
      "grad_norm": 0.14841440320014954,
      "kl": 0.7181640625,
      "learning_rate": 9.058803509412648e-07,
      "loss": 0.0987,
      "reward": 1.397916704416275,
      "reward_std": 0.44944200329482553,
      "rewards/accuracy_reward": 0.46041667945683,
      "rewards/format_reward": 0.9375000111758709,
      "step": 500
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 226.89792413711547,
      "epoch": 0.68,
      "grad_norm": 0.20204541087150574,
      "kl": 0.37396240234375,
      "learning_rate": 8.424432798163837e-07,
      "loss": 0.075,
      "reward": 1.4020833641290664,
      "reward_std": 0.3638565935194492,
      "rewards/accuracy_reward": 0.45000000968575476,
      "rewards/format_reward": 0.9520833425223827,
      "step": 510
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 227.1395909309387,
      "epoch": 0.6933333333333334,
      "grad_norm": 0.170625239610672,
      "kl": 0.7568450927734375,
      "learning_rate": 7.804303301246311e-07,
      "loss": 0.0984,
      "reward": 1.331250037252903,
      "reward_std": 0.48546464554965496,
      "rewards/accuracy_reward": 0.40000001061707735,
      "rewards/format_reward": 0.9312500089406968,
      "step": 520
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 222.33542160987855,
      "epoch": 0.7066666666666667,
      "grad_norm": 0.35064828395843506,
      "kl": 0.613885498046875,
      "learning_rate": 7.19975808106177e-07,
      "loss": 0.0488,
      "reward": 1.3625000312924385,
      "reward_std": 0.44792898930609226,
      "rewards/accuracy_reward": 0.4104166755452752,
      "rewards/format_reward": 0.9520833417773247,
      "step": 530
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 237.87292375564576,
      "epoch": 0.72,
      "grad_norm": 0.3742374777793884,
      "kl": 0.584375,
      "learning_rate": 6.6121064479388e-07,
      "loss": 0.0612,
      "reward": 1.4020833656191827,
      "reward_std": 0.4303410712629557,
      "rewards/accuracy_reward": 0.46041667759418486,
      "rewards/format_reward": 0.9416666768491269,
      "step": 540
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 244.73542499542236,
      "epoch": 0.7333333333333333,
      "grad_norm": 0.346811980009079,
      "kl": 0.83583984375,
      "learning_rate": 6.04262112445821e-07,
      "loss": 0.1029,
      "reward": 1.293750037997961,
      "reward_std": 0.48419367931783197,
      "rewards/accuracy_reward": 0.3791666770353913,
      "rewards/format_reward": 0.9145833477377892,
      "step": 550
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 268.9645917892456,
      "epoch": 0.7466666666666667,
      "grad_norm": 0.14408645033836365,
      "kl": 0.513336181640625,
      "learning_rate": 5.492535489019345e-07,
      "loss": 0.084,
      "reward": 1.2875000312924385,
      "reward_std": 0.48581976890563966,
      "rewards/accuracy_reward": 0.3770833395421505,
      "rewards/format_reward": 0.9104166757315397,
      "step": 560
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 241.53333921432494,
      "epoch": 0.76,
      "grad_norm": 1.40211021900177,
      "kl": 0.8751678466796875,
      "learning_rate": 4.963040904617131e-07,
      "loss": 0.1203,
      "reward": 1.3145833693444728,
      "reward_std": 0.49484665393829347,
      "rewards/accuracy_reward": 0.4145833423361182,
      "rewards/format_reward": 0.9000000134110451,
      "step": 570
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 252.40000743865966,
      "epoch": 0.7733333333333333,
      "grad_norm": 0.4801377058029175,
      "kl": 1.2982696533203124,
      "learning_rate": 4.4552841386150737e-07,
      "loss": 0.1253,
      "reward": 1.2708333659917117,
      "reward_std": 0.5930769924074412,
      "rewards/accuracy_reward": 0.43541667610406876,
      "rewards/format_reward": 0.8354166846722364,
      "step": 580
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 259.3583409309387,
      "epoch": 0.7866666666666666,
      "grad_norm": 1.0408235788345337,
      "kl": 1.4053619384765625,
      "learning_rate": 3.9703648791025716e-07,
      "loss": 0.183,
      "reward": 1.1916666943579912,
      "reward_std": 0.6191505286842585,
      "rewards/accuracy_reward": 0.3854166746139526,
      "rewards/format_reward": 0.8062500186264515,
      "step": 590
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 243.28750715255737,
      "epoch": 0.8,
      "grad_norm": 0.2709506154060364,
      "kl": 0.96761474609375,
      "learning_rate": 3.5093333532153313e-07,
      "loss": 0.1034,
      "reward": 1.316666693240404,
      "reward_std": 0.46122562885284424,
      "rewards/accuracy_reward": 0.4354166748002172,
      "rewards/format_reward": 0.8812500096857547,
      "step": 600
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 218.83542261123657,
      "epoch": 0.8133333333333334,
      "grad_norm": 0.19969280064105988,
      "kl": 0.4908477783203125,
      "learning_rate": 3.073188052577282e-07,
      "loss": 0.0814,
      "reward": 1.3458333656191825,
      "reward_std": 0.4190520565956831,
      "rewards/accuracy_reward": 0.4041666744276881,
      "rewards/format_reward": 0.9416666775941849,
      "step": 610
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 216.3875057220459,
      "epoch": 0.8266666666666667,
      "grad_norm": 0.23758280277252197,
      "kl": 0.64254150390625,
      "learning_rate": 2.6628735707900655e-07,
      "loss": 0.0776,
      "reward": 1.3395833685994147,
      "reward_std": 0.4737738098949194,
      "rewards/accuracy_reward": 0.4250000070780516,
      "rewards/format_reward": 0.9145833484828472,
      "step": 620
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 245.11459074020385,
      "epoch": 0.84,
      "grad_norm": 0.4293162226676941,
      "kl": 1.1108795166015626,
      "learning_rate": 2.2792785576536108e-07,
      "loss": 0.1537,
      "reward": 1.24375003837049,
      "reward_std": 0.520675316080451,
      "rewards/accuracy_reward": 0.3729166738688946,
      "rewards/format_reward": 0.8708333484828472,
      "step": 630
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 249.4729232788086,
      "epoch": 0.8533333333333334,
      "grad_norm": 0.17477993667125702,
      "kl": 1.014031982421875,
      "learning_rate": 1.9232337945485655e-07,
      "loss": 0.1481,
      "reward": 1.293750035017729,
      "reward_std": 0.5492795780301094,
      "rewards/accuracy_reward": 0.4104166742414236,
      "rewards/format_reward": 0.8833333525806666,
      "step": 640
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 271.8937582015991,
      "epoch": 0.8666666666666667,
      "grad_norm": 0.683167040348053,
      "kl": 1.2961456298828125,
      "learning_rate": 1.5955103951488177e-07,
      "loss": 0.1557,
      "reward": 1.1854166965931654,
      "reward_std": 0.5265001837164164,
      "rewards/accuracy_reward": 0.33541667349636556,
      "rewards/format_reward": 0.8500000169500709,
      "step": 650
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 238.02084150314332,
      "epoch": 0.88,
      "grad_norm": 0.47596192359924316,
      "kl": 0.965118408203125,
      "learning_rate": 1.2968181353609853e-07,
      "loss": 0.1338,
      "reward": 1.3020833618938923,
      "reward_std": 0.49198094978928564,
      "rewards/accuracy_reward": 0.40833334252238274,
      "rewards/format_reward": 0.8937500178813934,
      "step": 660
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 221.03750729560852,
      "epoch": 0.8933333333333333,
      "grad_norm": 0.4685460031032562,
      "kl": 0.8098968505859375,
      "learning_rate": 1.0278039161078634e-07,
      "loss": 0.1018,
      "reward": 1.3375000290572643,
      "reward_std": 0.4632135137915611,
      "rewards/accuracy_reward": 0.43750000912696124,
      "rewards/format_reward": 0.9000000119209289,
      "step": 670
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 253.50209112167357,
      "epoch": 0.9066666666666666,
      "grad_norm": 0.2506866753101349,
      "kl": 0.906805419921875,
      "learning_rate": 7.89050362285062e-08,
      "loss": 0.1365,
      "reward": 1.2375000305473804,
      "reward_std": 0.517885773256421,
      "rewards/accuracy_reward": 0.3645833428949118,
      "rewards/format_reward": 0.8729166854172945,
      "step": 680
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 239.60625705718994,
      "epoch": 0.92,
      "grad_norm": 0.5413881540298462,
      "kl": 0.835858154296875,
      "learning_rate": 5.810745609252166e-08,
      "loss": 0.1281,
      "reward": 1.2187500409781933,
      "reward_std": 0.5353617053478956,
      "rewards/accuracy_reward": 0.3395833415910602,
      "rewards/format_reward": 0.8791666828095913,
      "step": 690
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 232.96667356491088,
      "epoch": 0.9333333333333333,
      "grad_norm": 0.41816839575767517,
      "kl": 1.4766021728515626,
      "learning_rate": 4.0432694130264294e-08,
      "loss": 0.1412,
      "reward": 1.2375000312924385,
      "reward_std": 0.5488180216401816,
      "rewards/accuracy_reward": 0.37708334121853115,
      "rewards/format_reward": 0.8604166824370623,
      "step": 700
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 278.4333411693573,
      "epoch": 0.9466666666666667,
      "grad_norm": 0.5456490516662598,
      "kl": 1.7320938110351562,
      "learning_rate": 2.5919029940380145e-08,
      "loss": 0.2249,
      "reward": 1.2062500290572644,
      "reward_std": 0.6055900201201438,
      "rewards/accuracy_reward": 0.3833333427086473,
      "rewards/format_reward": 0.8229166869074106,
      "step": 710
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 239.62500748634338,
      "epoch": 0.96,
      "grad_norm": 0.32890596985816956,
      "kl": 1.458221435546875,
      "learning_rate": 1.4597896887644457e-08,
      "loss": 0.1827,
      "reward": 1.264583370089531,
      "reward_std": 0.5755864661186934,
      "rewards/accuracy_reward": 0.4187500111758709,
      "rewards/format_reward": 0.8458333492279053,
      "step": 720
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 274.1791749954224,
      "epoch": 0.9733333333333334,
      "grad_norm": 0.3116457760334015,
      "kl": 1.3982940673828126,
      "learning_rate": 6.493814025293476e-09,
      "loss": 0.1776,
      "reward": 1.231250035017729,
      "reward_std": 0.5566708967089653,
      "rewards/accuracy_reward": 0.40000000949949027,
      "rewards/format_reward": 0.8312500137835741,
      "step": 730
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 255.23750734329224,
      "epoch": 0.9866666666666667,
      "grad_norm": 0.4142087697982788,
      "kl": 1.2847900390625,
      "learning_rate": 1.624332992213151e-09,
      "loss": 0.1855,
      "reward": 1.2229166984558106,
      "reward_std": 0.5469876442104578,
      "rewards/accuracy_reward": 0.38333334159106014,
      "rewards/format_reward": 0.8395833514630795,
      "step": 740
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 279.8500079154968,
      "epoch": 1.0,
      "grad_norm": 0.3987623155117035,
      "kl": 1.2904449462890626,
      "learning_rate": 0.0,
      "loss": 0.2137,
      "reward": 1.2583333723247052,
      "reward_std": 0.6197128046303988,
      "rewards/accuracy_reward": 0.4229166766628623,
      "rewards/format_reward": 0.8354166867211461,
      "step": 750
    },
    {
      "epoch": 1.0,
      "step": 750,
      "total_flos": 0.0,
      "train_loss": 0.1488674604743719,
      "train_runtime": 66920.1081,
      "train_samples_per_second": 0.09,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 10,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}