{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08028904054596547, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 200.0, "epoch": 0.00013381506757660912, "grad_norm": 0.6607624292373657, "kl": 0.0, "learning_rate": 7.142857142857144e-08, "loss": -0.0, "reward": 0.007249999791383743, "reward_std": 0.22249864041805267, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007249999791383743, "step": 1 }, { "completion_length": 173.125, "epoch": 0.00026763013515321824, "grad_norm": 0.6534063816070557, "kl": 0.0, "learning_rate": 1.4285714285714287e-07, "loss": 0.0, "reward": -0.03125, "reward_std": 0.1129370778799057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03125, "step": 2 }, { "completion_length": 188.5, "epoch": 0.0004014452027298274, "grad_norm": 0.3849407434463501, "kl": 6.552512058988214e-05, "learning_rate": 2.142857142857143e-07, "loss": 0.0, "reward": -0.13124999403953552, "reward_std": 0.1978648453950882, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13125000894069672, "step": 3 }, { "completion_length": 200.0, "epoch": 0.0005352602703064365, "grad_norm": 0.0024058963172137737, "kl": 0.00013980743824504316, "learning_rate": 2.8571428571428575e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 4 }, { "completion_length": 200.0, "epoch": 0.0006690753378830456, "grad_norm": 0.0013197459047660232, "kl": 0.00013644498540088534, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 5 }, { "completion_length": 185.5, "epoch": 0.0008028904054596548, "grad_norm": 0.4532129168510437, "kl": 0.00018379972607363015, "learning_rate": 4.285714285714286e-07, "loss": 0.0, "reward": -0.04437500238418579, "reward_std": 0.2802167236804962, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04437500238418579, "step": 6 }, { "completion_length": 161.5, "epoch": 0.0009367054730362638, "grad_norm": 0.7619460225105286, "kl": 0.0002807776036206633, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": -0.061375003308057785, "reward_std": 0.09564956277608871, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.061375003308057785, "step": 7 }, { "completion_length": 187.0, "epoch": 0.001070520540612873, "grad_norm": 0.5761944651603699, "kl": 0.00010479884804226458, "learning_rate": 5.714285714285715e-07, "loss": 0.0, "reward": -0.09650000184774399, "reward_std": 0.16460254788398743, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09649999439716339, "step": 8 }, { "completion_length": 188.5, "epoch": 0.0012043356081894822, "grad_norm": 0.6709027290344238, "kl": 0.00021481592557393014, "learning_rate": 6.428571428571428e-07, "loss": 0.0, "reward": 0.011750001460313797, "reward_std": 0.19893628358840942, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011750001460313797, "step": 9 }, { "completion_length": 87.375, "epoch": 0.0013381506757660913, "grad_norm": 0.8226869106292725, "kl": 0.0001016025198623538, "learning_rate": 7.142857142857143e-07, "loss": 0.0, "reward": 0.1938749998807907, "reward_std": 0.02696789987385273, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1938749998807907, "step": 10 }, { "completion_length": 159.625, "epoch": 0.0014719657433427003, "grad_norm": 0.7588320374488831, "kl": 0.00018815182556863874, "learning_rate": 7.857142857142857e-07, "loss": 0.0, "reward": 0.27787500619888306, "reward_std": 0.9429591298103333, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03462500125169754, "step": 11 }, { "completion_length": 182.125, "epoch": 0.0016057808109193096, "grad_norm": 0.5654924511909485, "kl": 0.00015180371701717377, "learning_rate": 8.571428571428572e-07, "loss": 0.0, "reward": 0.001499999314546585, "reward_std": 0.08378544449806213, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.001500000013038516, "step": 12 }, { "completion_length": 200.0, "epoch": 0.0017395958784959186, "grad_norm": 0.5385698080062866, "kl": 0.00012345206050667912, "learning_rate": 9.285714285714287e-07, "loss": 0.0, "reward": 0.03962500020861626, "reward_std": 0.1586991250514984, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03962500020861626, "step": 13 }, { "completion_length": 164.375, "epoch": 0.0018734109460725277, "grad_norm": 0.5469967722892761, "kl": 0.00018677486514206976, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.02187499776482582, "reward_std": 0.0841604545712471, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02187499962747097, "step": 14 }, { "completion_length": 200.0, "epoch": 0.002007226013649137, "grad_norm": 0.7570310831069946, "kl": 0.00021116163406986743, "learning_rate": 1.0714285714285714e-06, "loss": 0.0, "reward": -0.06212500110268593, "reward_std": 0.23740287125110626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06212500110268593, "step": 15 }, { "completion_length": 146.25, "epoch": 0.002141041081225746, "grad_norm": 0.8335115313529968, "kl": 0.00011178754357388243, "learning_rate": 1.142857142857143e-06, "loss": 0.0, "reward": 0.12312500178813934, "reward_std": 0.21859319508075714, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.060624998062849045, "step": 16 }, { "completion_length": 200.0, "epoch": 0.002274856148802355, "grad_norm": 0.0015883075539022684, "kl": 0.00014089327305555344, "learning_rate": 1.2142857142857144e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 17 }, { "completion_length": 189.0, "epoch": 0.0024086712163789645, "grad_norm": 0.517302930355072, "kl": 0.00016497125034220517, "learning_rate": 1.2857142857142856e-06, "loss": 0.0, "reward": 0.15037497878074646, "reward_std": 0.8560577034950256, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16212499141693115, "step": 18 }, { "completion_length": 153.875, "epoch": 0.0025424862839555735, "grad_norm": 0.674881100654602, "kl": 0.00017893416224978864, "learning_rate": 1.3571428571428572e-06, "loss": 0.0, "reward": 0.009749999269843102, "reward_std": 0.06255912035703659, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.009749999269843102, "step": 19 }, { "completion_length": 178.0, "epoch": 0.0026763013515321826, "grad_norm": 0.675031304359436, "kl": 0.00027570384554564953, "learning_rate": 1.4285714285714286e-06, "loss": 0.0, "reward": -0.1379999965429306, "reward_std": 0.16334013640880585, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1379999965429306, "step": 20 }, { "completion_length": 187.875, "epoch": 0.0028101164191087916, "grad_norm": 0.6267116665840149, "kl": 0.00023580921697430313, "learning_rate": 1.5e-06, "loss": 0.0, "reward": -0.06525000184774399, "reward_std": 0.13207006454467773, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06525000184774399, "step": 21 }, { "completion_length": 200.0, "epoch": 0.0029439314866854006, "grad_norm": 0.7732946872711182, "kl": 0.00024775322526693344, "learning_rate": 1.5714285714285714e-06, "loss": 0.0, "reward": 0.08224999904632568, "reward_std": 0.1209152564406395, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08224999904632568, "step": 22 }, { "completion_length": 136.75, "epoch": 0.00307774655426201, "grad_norm": 0.7155055999755859, "kl": 0.00017247500363737345, "learning_rate": 1.642857142857143e-06, "loss": 0.0, "reward": 0.05249999836087227, "reward_std": 0.14187923073768616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05250000208616257, "step": 23 }, { "completion_length": 123.875, "epoch": 0.003211561621838619, "grad_norm": 0.932826042175293, "kl": 0.0003397816326469183, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "reward": 0.07175000011920929, "reward_std": 0.11965157836675644, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07175000011920929, "step": 24 }, { "completion_length": 187.625, "epoch": 0.003345376689415228, "grad_norm": 0.5741406083106995, "kl": 0.0001642967836232856, "learning_rate": 1.7857142857142859e-06, "loss": 0.0, "reward": -0.051750000566244125, "reward_std": 0.20635458827018738, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05174999311566353, "step": 25 }, { "completion_length": 121.125, "epoch": 0.0034791917569918372, "grad_norm": 0.7288361191749573, "kl": 0.000165597622981295, "learning_rate": 1.8571428571428573e-06, "loss": 0.0, "reward": 0.16362500190734863, "reward_std": 0.06539099663496017, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16362500190734863, "step": 26 }, { "completion_length": 142.25, "epoch": 0.0036130068245684463, "grad_norm": 1.0791475772857666, "kl": 0.0004126343410462141, "learning_rate": 1.928571428571429e-06, "loss": 0.0, "reward": 0.07337499409914017, "reward_std": 0.1588125079870224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07337500154972076, "step": 27 }, { "completion_length": 187.375, "epoch": 0.0037468218921450553, "grad_norm": 0.5024563074111938, "kl": 0.0001997292274609208, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.01275000348687172, "reward_std": 0.2331772893667221, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.012749999761581421, "step": 28 }, { "completion_length": 200.0, "epoch": 0.003880636959721665, "grad_norm": 0.002470141975209117, "kl": 0.00020731985569000244, "learning_rate": 2.0714285714285717e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 29 }, { "completion_length": 195.125, "epoch": 0.004014452027298274, "grad_norm": 0.5191249251365662, "kl": 0.00023838126799091697, "learning_rate": 2.1428571428571427e-06, "loss": 0.0, "reward": -0.05087500065565109, "reward_std": 0.17549476027488708, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05087500065565109, "step": 30 }, { "completion_length": 192.75, "epoch": 0.004148267094874883, "grad_norm": 0.6942370533943176, "kl": 0.00031493898131884634, "learning_rate": 2.2142857142857146e-06, "loss": 0.0, "reward": 0.0062499986961483955, "reward_std": 0.11689159274101257, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00624999962747097, "step": 31 }, { "completion_length": 180.875, "epoch": 0.004282082162451492, "grad_norm": 0.9590561985969543, "kl": 0.0002854971680790186, "learning_rate": 2.285714285714286e-06, "loss": 0.0, "reward": -0.03500000014901161, "reward_std": 0.04888762906193733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.034999996423721313, "step": 32 }, { "completion_length": 118.875, "epoch": 0.004415897230028101, "grad_norm": 0.8301633596420288, "kl": 0.000467813661089167, "learning_rate": 2.3571428571428574e-06, "loss": 0.0, "reward": 0.06449999660253525, "reward_std": 0.07718807458877563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06450000405311584, "step": 33 }, { "completion_length": 195.75, "epoch": 0.00454971229760471, "grad_norm": 0.45681193470954895, "kl": 0.00029114686185494065, "learning_rate": 2.428571428571429e-06, "loss": 0.0, "reward": 0.04874999821186066, "reward_std": 0.1104571670293808, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04874999821186066, "step": 34 }, { "completion_length": 185.5, "epoch": 0.004683527365181319, "grad_norm": 0.339918851852417, "kl": 0.00048767327098175883, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.06824999302625656, "reward_std": 0.17505407333374023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06825000047683716, "step": 35 }, { "completion_length": 198.625, "epoch": 0.004817342432757929, "grad_norm": 0.6671655178070068, "kl": 0.0003277310461271554, "learning_rate": 2.571428571428571e-06, "loss": 0.0, "reward": 0.0702499970793724, "reward_std": 0.1537351757287979, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.070250004529953, "step": 36 }, { "completion_length": 119.5, "epoch": 0.004951157500334538, "grad_norm": 0.9154505133628845, "kl": 0.0010961720254272223, "learning_rate": 2.642857142857143e-06, "loss": 0.0, "reward": 0.09600000083446503, "reward_std": 0.09173097461462021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09600000083446503, "step": 37 }, { "completion_length": 180.25, "epoch": 0.005084972567911147, "grad_norm": 0.4532427489757538, "kl": 0.00017556306556798518, "learning_rate": 2.7142857142857144e-06, "loss": 0.0, "reward": -0.0521249994635582, "reward_std": 0.12402238696813583, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0521249994635582, "step": 38 }, { "completion_length": 126.625, "epoch": 0.005218787635487756, "grad_norm": 0.7945365309715271, "kl": 0.0008515769150108099, "learning_rate": 2.785714285714286e-06, "loss": 0.0, "reward": 0.07462500035762787, "reward_std": 0.0829697698354721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07462500035762787, "step": 39 }, { "completion_length": 151.25, "epoch": 0.005352602703064365, "grad_norm": 0.6148977875709534, "kl": 0.0004257991095073521, "learning_rate": 2.8571428571428573e-06, "loss": 0.0, "reward": 0.06274999678134918, "reward_std": 0.05740022286772728, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06274999678134918, "step": 40 }, { "completion_length": 175.625, "epoch": 0.005486417770640974, "grad_norm": 0.7329086661338806, "kl": 0.000793412618804723, "learning_rate": 2.928571428571429e-06, "loss": 0.0, "reward": -0.12350000441074371, "reward_std": 0.1900150328874588, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12349999696016312, "step": 41 }, { "completion_length": 200.0, "epoch": 0.005620232838217583, "grad_norm": 0.4883364140987396, "kl": 0.0008033772464841604, "learning_rate": 3e-06, "loss": 0.0, "reward": -0.03174999728798866, "reward_std": 0.16930508613586426, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03175000101327896, "step": 42 }, { "completion_length": 184.625, "epoch": 0.005754047905794192, "grad_norm": 1.0290544033050537, "kl": 0.0011950215557590127, "learning_rate": 3.071428571428572e-06, "loss": 0.0, "reward": 0.10474999994039536, "reward_std": 0.04017728567123413, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10475000739097595, "step": 43 }, { "completion_length": 194.25, "epoch": 0.005887862973370801, "grad_norm": 0.5598155856132507, "kl": 0.0009121394250541925, "learning_rate": 3.142857142857143e-06, "loss": 0.0, "reward": -0.04149999842047691, "reward_std": 0.17160585522651672, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04150000214576721, "step": 44 }, { "completion_length": 196.125, "epoch": 0.00602167804094741, "grad_norm": 0.5631980299949646, "kl": 0.0006102789775468409, "learning_rate": 3.2142857142857147e-06, "loss": 0.0, "reward": -0.14949999749660492, "reward_std": 0.1413860023021698, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14949999749660492, "step": 45 }, { "completion_length": 116.75, "epoch": 0.00615549310852402, "grad_norm": 0.9640089273452759, "kl": 0.001926671713590622, "learning_rate": 3.285714285714286e-06, "loss": 0.0001, "reward": 0.09724999964237213, "reward_std": 0.11875996738672256, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09724999964237213, "step": 46 }, { "completion_length": 198.125, "epoch": 0.006289308176100629, "grad_norm": 0.48923370242118835, "kl": 0.0005758957122452557, "learning_rate": 3.357142857142857e-06, "loss": 0.0, "reward": 0.3738750219345093, "reward_std": 0.772754967212677, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.061375007033348083, "step": 47 }, { "completion_length": 177.5, "epoch": 0.006423123243677238, "grad_norm": 4.731001853942871, "kl": 0.0015442727599292994, "learning_rate": 3.428571428571429e-06, "loss": 0.0001, "reward": 0.09962499886751175, "reward_std": 0.049520377069711685, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09962499886751175, "step": 48 }, { "completion_length": 139.25, "epoch": 0.006556938311253847, "grad_norm": 0.6496549844741821, "kl": 0.0012009841157123446, "learning_rate": 3.5e-06, "loss": 0.0, "reward": 0.06300000101327896, "reward_std": 0.2599862515926361, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0004999972879886627, "step": 49 }, { "completion_length": 196.5, "epoch": 0.006690753378830456, "grad_norm": 0.38595905900001526, "kl": 0.001274331472814083, "learning_rate": 3.5714285714285718e-06, "loss": 0.0001, "reward": 0.17212501168251038, "reward_std": 0.9314478039741516, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14037500321865082, "step": 50 }, { "completion_length": 150.625, "epoch": 0.006824568446407065, "grad_norm": 0.6706291437149048, "kl": 0.004620950203388929, "learning_rate": 3.642857142857143e-06, "loss": 0.0002, "reward": 0.045249998569488525, "reward_std": 0.10856038331985474, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.045249998569488525, "step": 51 }, { "completion_length": 168.125, "epoch": 0.0069583835139836745, "grad_norm": 0.5202582478523254, "kl": 0.0028160405345261097, "learning_rate": 3.7142857142857146e-06, "loss": 0.0001, "reward": 0.33550000190734863, "reward_std": 0.8468546271324158, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023000000044703484, "step": 52 }, { "completion_length": 134.0, "epoch": 0.0070921985815602835, "grad_norm": 0.8285565972328186, "kl": 0.0057279570028185844, "learning_rate": 3.785714285714286e-06, "loss": 0.0002, "reward": 0.042249999940395355, "reward_std": 0.08305376768112183, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.042250003665685654, "step": 53 }, { "completion_length": 152.25, "epoch": 0.0072260136491368926, "grad_norm": 0.6896976828575134, "kl": 0.002261719200760126, "learning_rate": 3.857142857142858e-06, "loss": 0.0001, "reward": 0.004999995231628418, "reward_std": 0.19244888424873352, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.004999998956918716, "step": 54 }, { "completion_length": 198.5, "epoch": 0.007359828716713502, "grad_norm": 0.6962910294532776, "kl": 0.0006728660082444549, "learning_rate": 3.928571428571429e-06, "loss": 0.0, "reward": -0.028874998912215233, "reward_std": 0.17428912222385406, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.028874997049570084, "step": 55 }, { "completion_length": 200.0, "epoch": 0.007493643784290111, "grad_norm": 0.006168184336274862, "kl": 0.0004406818188726902, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 56 }, { "completion_length": 153.125, "epoch": 0.0076274588518667205, "grad_norm": 0.6700598001480103, "kl": 0.002835752908140421, "learning_rate": 4.071428571428572e-06, "loss": 0.0001, "reward": -0.06575000286102295, "reward_std": 0.14158062636852264, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06574999541044235, "step": 57 }, { "completion_length": 184.625, "epoch": 0.00776127391944333, "grad_norm": 0.6356731653213501, "kl": 0.0033719383645802736, "learning_rate": 4.1428571428571435e-06, "loss": 0.0001, "reward": -0.02550000138580799, "reward_std": 0.15240922570228577, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.025499997660517693, "step": 58 }, { "completion_length": 130.625, "epoch": 0.007895088987019938, "grad_norm": 1.1287301778793335, "kl": 0.014363477006554604, "learning_rate": 4.2142857142857145e-06, "loss": 0.0006, "reward": 0.09087499976158142, "reward_std": 0.08011498302221298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09087499976158142, "step": 59 }, { "completion_length": 195.625, "epoch": 0.008028904054596548, "grad_norm": 0.5961847901344299, "kl": 0.001502531231380999, "learning_rate": 4.2857142857142855e-06, "loss": 0.0001, "reward": 0.05887499824166298, "reward_std": 0.15392060577869415, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05887499824166298, "step": 60 }, { "completion_length": 200.0, "epoch": 0.008162719122173156, "grad_norm": 0.9219453930854797, "kl": 0.0023608591873198748, "learning_rate": 4.357142857142857e-06, "loss": 0.0001, "reward": 0.07212500274181366, "reward_std": 0.1495530903339386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07212500274181366, "step": 61 }, { "completion_length": 160.75, "epoch": 0.008296534189749766, "grad_norm": 0.7371984720230103, "kl": 0.010458258911967278, "learning_rate": 4.428571428571429e-06, "loss": 0.0004, "reward": -0.045125000178813934, "reward_std": 0.08707211911678314, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.045125000178813934, "step": 62 }, { "completion_length": 147.125, "epoch": 0.008430349257326376, "grad_norm": 0.6950094103813171, "kl": 0.007362588308751583, "learning_rate": 4.5e-06, "loss": 0.0003, "reward": 0.0182499997317791, "reward_std": 0.05184248462319374, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0182499997317791, "step": 63 }, { "completion_length": 175.0, "epoch": 0.008564164324902984, "grad_norm": 0.713733971118927, "kl": 0.002926561050117016, "learning_rate": 4.571428571428572e-06, "loss": 0.0001, "reward": -0.10050000250339508, "reward_std": 0.15801718831062317, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10050000250339508, "step": 64 }, { "completion_length": 196.875, "epoch": 0.008697979392479594, "grad_norm": 0.556557297706604, "kl": 0.0027996408753097057, "learning_rate": 4.642857142857144e-06, "loss": 0.0001, "reward": 0.02562500163912773, "reward_std": 0.2600933611392975, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03687499836087227, "step": 65 }, { "completion_length": 153.875, "epoch": 0.008831794460056202, "grad_norm": 0.7218765020370483, "kl": 0.009235844947397709, "learning_rate": 4.714285714285715e-06, "loss": 0.0004, "reward": -0.05662500858306885, "reward_std": 0.14146673679351807, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05662500113248825, "step": 66 }, { "completion_length": 165.125, "epoch": 0.008965609527632812, "grad_norm": 0.7870226502418518, "kl": 0.015462895855307579, "learning_rate": 4.785714285714287e-06, "loss": 0.0006, "reward": 0.019999997690320015, "reward_std": 0.1040288433432579, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019999999552965164, "step": 67 }, { "completion_length": 147.625, "epoch": 0.00909942459520942, "grad_norm": 0.5590029358863831, "kl": 0.011768238618969917, "learning_rate": 4.857142857142858e-06, "loss": 0.0005, "reward": 0.05937499925494194, "reward_std": 0.1167622059583664, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05937499552965164, "step": 68 }, { "completion_length": 126.0, "epoch": 0.00923323966278603, "grad_norm": 0.6137294173240662, "kl": 0.022856852039694786, "learning_rate": 4.928571428571429e-06, "loss": 0.0009, "reward": 0.12837499380111694, "reward_std": 0.044823262840509415, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12837500870227814, "step": 69 }, { "completion_length": 160.25, "epoch": 0.009367054730362638, "grad_norm": 0.5659866333007812, "kl": 0.02101122960448265, "learning_rate": 5e-06, "loss": 0.0008, "reward": 0.03974999859929085, "reward_std": 0.11098358780145645, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.039750002324581146, "step": 70 }, { "completion_length": 191.125, "epoch": 0.009500869797939248, "grad_norm": 0.6181241273880005, "kl": 0.018157683312892914, "learning_rate": 4.9999689166542295e-06, "loss": 0.0007, "reward": -0.0025000013411045074, "reward_std": 0.11320652067661285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0025000013411045074, "step": 71 }, { "completion_length": 170.5, "epoch": 0.009634684865515858, "grad_norm": 0.5695949792861938, "kl": 0.020333707332611084, "learning_rate": 4.999875667389858e-06, "loss": 0.0008, "reward": 1.0255000591278076, "reward_std": 1.126177191734314, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025499999523162842, "step": 72 }, { "completion_length": 162.375, "epoch": 0.009768499933092466, "grad_norm": 0.6084307432174683, "kl": 0.02367868646979332, "learning_rate": 4.999720254525684e-06, "loss": 0.0009, "reward": 1.6545000076293945, "reward_std": 1.2861894369125366, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09200000017881393, "step": 73 }, { "completion_length": 184.5, "epoch": 0.009902315000669076, "grad_norm": 0.3787034749984741, "kl": 0.01594972237944603, "learning_rate": 4.999502681926309e-06, "loss": 0.0006, "reward": 0.22587500512599945, "reward_std": 0.8986221551895142, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08662500232458115, "step": 74 }, { "completion_length": 169.875, "epoch": 0.010036130068245684, "grad_norm": 0.5691292881965637, "kl": 0.030152076855301857, "learning_rate": 4.999222955002041e-06, "loss": 0.0012, "reward": 1.2243750095367432, "reward_std": 1.4460337162017822, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.025624997913837433, "step": 75 }, { "completion_length": 100.125, "epoch": 0.010169945135822294, "grad_norm": 0.9836642742156982, "kl": 0.11397238820791245, "learning_rate": 4.998881080708759e-06, "loss": 0.0046, "reward": 0.18599998950958252, "reward_std": 0.03580503165721893, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1860000044107437, "step": 76 }, { "completion_length": 104.0, "epoch": 0.010303760203398902, "grad_norm": 0.6849879026412964, "kl": 0.045376699417829514, "learning_rate": 4.99847706754774e-06, "loss": 0.0018, "reward": 0.5061249732971191, "reward_std": 0.907471239566803, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19362500309944153, "step": 77 }, { "completion_length": 129.0, "epoch": 0.010437575270975512, "grad_norm": 0.6051425337791443, "kl": 0.04764317721128464, "learning_rate": 4.998010925565449e-06, "loss": 0.0019, "reward": 2.3172500133514404, "reward_std": 0.918697714805603, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12974999845027924, "step": 78 }, { "completion_length": 95.375, "epoch": 0.01057139033855212, "grad_norm": 0.9993979334831238, "kl": 0.04854699969291687, "learning_rate": 4.997482666353287e-06, "loss": 0.0019, "reward": 0.2381249964237213, "reward_std": 0.018216459080576897, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2381249964237213, "step": 79 }, { "completion_length": 185.0, "epoch": 0.01070520540612873, "grad_norm": 0.5381810665130615, "kl": 0.028288230299949646, "learning_rate": 4.996892303047306e-06, "loss": 0.0011, "reward": -0.021500002592802048, "reward_std": 0.17986345291137695, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.021500004455447197, "step": 80 }, { "completion_length": 164.125, "epoch": 0.010839020473705338, "grad_norm": 0.6517502069473267, "kl": 0.06343915313482285, "learning_rate": 4.99623985032788e-06, "loss": 0.0025, "reward": 0.15412500500679016, "reward_std": 0.08364454656839371, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15412500500679016, "step": 81 }, { "completion_length": 90.875, "epoch": 0.010972835541281948, "grad_norm": 1.2636921405792236, "kl": 0.05725828558206558, "learning_rate": 4.995525324419338e-06, "loss": 0.0023, "reward": 1.8042500019073486, "reward_std": 1.2858073711395264, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24175000190734863, "step": 82 }, { "completion_length": 117.25, "epoch": 0.011106650608858558, "grad_norm": 0.9008481502532959, "kl": 0.05242697522044182, "learning_rate": 4.994748743089566e-06, "loss": 0.0021, "reward": 2.042875051498413, "reward_std": 1.154105544090271, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1678750067949295, "step": 83 }, { "completion_length": 137.0, "epoch": 0.011240465676435166, "grad_norm": 1.0723590850830078, "kl": 0.05238647013902664, "learning_rate": 4.993910125649561e-06, "loss": 0.0021, "reward": 2.0889999866485596, "reward_std": 0.8499669432640076, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08900000154972076, "step": 84 }, { "completion_length": 130.5, "epoch": 0.011374280744011776, "grad_norm": 0.772548496723175, "kl": 0.06698866188526154, "learning_rate": 4.993009492952951e-06, "loss": 0.0027, "reward": 1.0326249599456787, "reward_std": 1.295006513595581, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0951249897480011, "step": 85 }, { "completion_length": 144.75, "epoch": 0.011508095811588384, "grad_norm": 2.0472888946533203, "kl": 0.051987532526254654, "learning_rate": 4.992046867395478e-06, "loss": 0.0021, "reward": 0.10949999839067459, "reward_std": 0.0911952406167984, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10950000584125519, "step": 86 }, { "completion_length": 74.75, "epoch": 0.011641910879164994, "grad_norm": 0.92760169506073, "kl": 0.05159628763794899, "learning_rate": 4.99102227291444e-06, "loss": 0.0021, "reward": 2.786875009536743, "reward_std": 0.010260041803121567, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28687500953674316, "step": 87 }, { "completion_length": 178.5, "epoch": 0.011775725946741603, "grad_norm": 0.5472368001937866, "kl": 0.04497537761926651, "learning_rate": 4.989935734988098e-06, "loss": 0.0018, "reward": 1.568750023841858, "reward_std": 1.3699387311935425, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006250000558793545, "step": 88 }, { "completion_length": 144.75, "epoch": 0.011909541014318212, "grad_norm": 0.7784441113471985, "kl": 0.08736249059438705, "learning_rate": 4.9887872806350376e-06, "loss": 0.0035, "reward": 2.5823750495910645, "reward_std": 0.06645716726779938, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08237498998641968, "step": 89 }, { "completion_length": 140.625, "epoch": 0.01204335608189482, "grad_norm": 0.7564622163772583, "kl": 0.03849857673048973, "learning_rate": 4.987576938413504e-06, "loss": 0.0015, "reward": 2.6227500438690186, "reward_std": 0.07378294318914413, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12274999916553497, "step": 90 }, { "completion_length": 179.875, "epoch": 0.01217717114947143, "grad_norm": 1.115612268447876, "kl": 0.044431254267692566, "learning_rate": 4.986304738420684e-06, "loss": 0.0018, "reward": 0.18500001728534698, "reward_std": 0.3051360547542572, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06000000238418579, "step": 91 }, { "completion_length": 114.375, "epoch": 0.01231098621704804, "grad_norm": 1.0594428777694702, "kl": 0.06427611410617828, "learning_rate": 4.984970712291963e-06, "loss": 0.0026, "reward": 2.3924999237060547, "reward_std": 0.6584362983703613, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14250001311302185, "step": 92 }, { "completion_length": 121.75, "epoch": 0.012444801284624649, "grad_norm": 0.7914606332778931, "kl": 0.03704307600855827, "learning_rate": 4.983574893200139e-06, "loss": 0.0015, "reward": 2.600749969482422, "reward_std": 0.05770058557391167, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10074999928474426, "step": 93 }, { "completion_length": 147.25, "epoch": 0.012578616352201259, "grad_norm": 0.7845139503479004, "kl": 0.06193498894572258, "learning_rate": 4.982117315854594e-06, "loss": 0.0025, "reward": 0.7480000257492065, "reward_std": 1.1190948486328125, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12299999594688416, "step": 94 }, { "completion_length": 59.75, "epoch": 0.012712431419777867, "grad_norm": 0.9708481431007385, "kl": 0.11553163081407547, "learning_rate": 4.980598016500431e-06, "loss": 0.0046, "reward": 0.8492500185966492, "reward_std": 0.01791846565902233, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3492499887943268, "step": 95 }, { "completion_length": 101.125, "epoch": 0.012846246487354477, "grad_norm": 0.7145942449569702, "kl": 0.08723154664039612, "learning_rate": 4.979017032917576e-06, "loss": 0.0035, "reward": 1.9397499561309814, "reward_std": 1.103899359703064, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25224998593330383, "step": 96 }, { "completion_length": 115.875, "epoch": 0.012980061554931085, "grad_norm": 0.830101728439331, "kl": 0.09398140758275986, "learning_rate": 4.977374404419838e-06, "loss": 0.0038, "reward": 2.686875104904175, "reward_std": 0.06629462540149689, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18687498569488525, "step": 97 }, { "completion_length": 128.375, "epoch": 0.013113876622507695, "grad_norm": 1.0607072114944458, "kl": 0.11273806542158127, "learning_rate": 4.975670171853926e-06, "loss": 0.0045, "reward": 0.46299999952316284, "reward_std": 0.9311861991882324, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15049999952316284, "step": 98 }, { "completion_length": 121.25, "epoch": 0.013247691690084303, "grad_norm": 1.0286757946014404, "kl": 0.060726530849933624, "learning_rate": 4.973904377598443e-06, "loss": 0.0024, "reward": 2.393125057220459, "reward_std": 0.6692179441452026, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1431249976158142, "step": 99 }, { "completion_length": 96.5, "epoch": 0.013381506757660913, "grad_norm": 0.808530330657959, "kl": 0.0426044799387455, "learning_rate": 4.9720770655628216e-06, "loss": 0.0017, "reward": 2.445125102996826, "reward_std": 0.7625398635864258, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19512499868869781, "step": 100 }, { "completion_length": 145.375, "epoch": 0.013515321825237521, "grad_norm": 0.8428481817245483, "kl": 0.04572984576225281, "learning_rate": 4.970188281186241e-06, "loss": 0.0018, "reward": 0.8115000128746033, "reward_std": 1.142754077911377, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12399999797344208, "step": 101 }, { "completion_length": 161.625, "epoch": 0.01364913689281413, "grad_norm": 0.5956131815910339, "kl": 0.07098394632339478, "learning_rate": 4.9682380714364895e-06, "loss": 0.0028, "reward": 1.537750005722046, "reward_std": 1.094956636428833, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1002500057220459, "step": 102 }, { "completion_length": 96.0, "epoch": 0.01378295196039074, "grad_norm": 0.6832714080810547, "kl": 0.08033100515604019, "learning_rate": 4.966226484808804e-06, "loss": 0.0032, "reward": 2.740875005722046, "reward_std": 0.09044087678194046, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2408750057220459, "step": 103 }, { "completion_length": 113.625, "epoch": 0.013916767027967349, "grad_norm": 0.7095988392829895, "kl": 0.12121561914682388, "learning_rate": 4.964153571324658e-06, "loss": 0.0048, "reward": 2.1283750534057617, "reward_std": 1.1695889234542847, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25337499380111694, "step": 104 }, { "completion_length": 195.75, "epoch": 0.014050582095543959, "grad_norm": 0.4752798080444336, "kl": 0.02968657575547695, "learning_rate": 4.962019382530521e-06, "loss": 0.0012, "reward": 0.7012500762939453, "reward_std": 1.1305185556411743, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013750001788139343, "step": 105 }, { "completion_length": 109.625, "epoch": 0.014184397163120567, "grad_norm": 0.7191503643989563, "kl": 0.059147223830223083, "learning_rate": 4.959823971496575e-06, "loss": 0.0024, "reward": 2.6631250381469727, "reward_std": 0.019613727927207947, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16312500834465027, "step": 106 }, { "completion_length": 179.625, "epoch": 0.014318212230697177, "grad_norm": 0.8023127317428589, "kl": 0.04188587889075279, "learning_rate": 4.957567392815396e-06, "loss": 0.0017, "reward": 1.3443750143051147, "reward_std": 1.3044664859771729, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09437499940395355, "step": 107 }, { "completion_length": 196.5, "epoch": 0.014452027298273785, "grad_norm": 0.7125875353813171, "kl": 0.027316324412822723, "learning_rate": 4.955249702600598e-06, "loss": 0.0011, "reward": 0.48624998331069946, "reward_std": 0.7794494032859802, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.013750001788139343, "step": 108 }, { "completion_length": 176.625, "epoch": 0.014585842365850395, "grad_norm": 0.6480623483657837, "kl": 0.12005632370710373, "learning_rate": 4.9528709584854316e-06, "loss": 0.0048, "reward": 1.7725000381469727, "reward_std": 1.1268444061279297, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08500000089406967, "step": 109 }, { "completion_length": 119.25, "epoch": 0.014719657433427003, "grad_norm": 0.6438884735107422, "kl": 0.0923544317483902, "learning_rate": 4.9504312196213596e-06, "loss": 0.0037, "reward": 2.658750057220459, "reward_std": 0.05824027210474014, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1587499976158142, "step": 110 }, { "completion_length": 148.75, "epoch": 0.014853472501003613, "grad_norm": 0.8156433701515198, "kl": 0.07639375329017639, "learning_rate": 4.9479305466765796e-06, "loss": 0.0031, "reward": 2.5415000915527344, "reward_std": 0.12251647561788559, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04149999842047691, "step": 111 }, { "completion_length": 171.5, "epoch": 0.014987287568580221, "grad_norm": 0.6420042514801025, "kl": 0.07187381386756897, "learning_rate": 4.9453690018345144e-06, "loss": 0.0029, "reward": 2.284749984741211, "reward_std": 0.7274911403656006, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03474999964237213, "step": 112 }, { "completion_length": 125.75, "epoch": 0.015121102636156831, "grad_norm": 0.55785071849823, "kl": 0.08430134505033493, "learning_rate": 4.942746648792274e-06, "loss": 0.0034, "reward": 2.712124824523926, "reward_std": 0.03973281756043434, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2121249884366989, "step": 113 }, { "completion_length": 141.5, "epoch": 0.015254917703733441, "grad_norm": 0.668449878692627, "kl": 0.09252829849720001, "learning_rate": 4.940063552759061e-06, "loss": 0.0037, "reward": 2.6091251373291016, "reward_std": 0.1137847974896431, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10912499576807022, "step": 114 }, { "completion_length": 119.625, "epoch": 0.01538873277131005, "grad_norm": 0.7444556355476379, "kl": 0.08986318111419678, "learning_rate": 4.937319780454559e-06, "loss": 0.0036, "reward": 2.755624771118164, "reward_std": 0.05602283030748367, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25562500953674316, "step": 115 }, { "completion_length": 168.625, "epoch": 0.01552254783888666, "grad_norm": 0.6890146136283875, "kl": 0.06985297054052353, "learning_rate": 4.934515400107266e-06, "loss": 0.0028, "reward": 2.11899995803833, "reward_std": 0.9300675392150879, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11900000274181366, "step": 116 }, { "completion_length": 136.625, "epoch": 0.01565636290646327, "grad_norm": 0.9527840614318848, "kl": 0.0956442803144455, "learning_rate": 4.931650481452801e-06, "loss": 0.0038, "reward": 2.461750030517578, "reward_std": 0.815294086933136, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21175000071525574, "step": 117 }, { "completion_length": 138.75, "epoch": 0.015790177974039876, "grad_norm": 0.04666705057024956, "kl": 0.09591816365718842, "learning_rate": 4.9287250957321685e-06, "loss": 0.0038, "reward": 2.75, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 118 }, { "completion_length": 117.0, "epoch": 0.015923993041616485, "grad_norm": 0.8810484409332275, "kl": 0.11881041526794434, "learning_rate": 4.925739315689991e-06, "loss": 0.0048, "reward": 2.5, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 119 }, { "completion_length": 176.75, "epoch": 0.016057808109193095, "grad_norm": 0.9243731498718262, "kl": 0.10825902223587036, "learning_rate": 4.922693215572695e-06, "loss": 0.0043, "reward": 2.1383750438690186, "reward_std": 1.0670230388641357, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20087499916553497, "step": 120 }, { "completion_length": 171.875, "epoch": 0.016191623176769705, "grad_norm": 0.765018105506897, "kl": 0.06866879761219025, "learning_rate": 4.919586871126667e-06, "loss": 0.0027, "reward": 0.7035000324249268, "reward_std": 0.868400514125824, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20350000262260437, "step": 121 }, { "completion_length": 80.75, "epoch": 0.01632543824434631, "grad_norm": 1.307355523109436, "kl": 0.2525603473186493, "learning_rate": 4.916420359596369e-06, "loss": 0.0101, "reward": 2.7763748168945312, "reward_std": 0.05005120486021042, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2763749957084656, "step": 122 }, { "completion_length": 84.25, "epoch": 0.01645925331192292, "grad_norm": 0.9614888429641724, "kl": 0.19615454971790314, "learning_rate": 4.913193759722419e-06, "loss": 0.0078, "reward": 2.5314998626708984, "reward_std": 0.6703745722770691, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.281499981880188, "step": 123 }, { "completion_length": 199.375, "epoch": 0.01659306837949953, "grad_norm": 0.6785680651664734, "kl": 0.039982832968235016, "learning_rate": 4.909907151739634e-06, "loss": 0.0016, "reward": 0.28125, "reward_std": 0.2893187701702118, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15625, "step": 124 }, { "completion_length": 94.125, "epoch": 0.01672688344707614, "grad_norm": 0.30756527185440063, "kl": 0.1932661533355713, "learning_rate": 4.90656061737503e-06, "loss": 0.0077, "reward": 2.75, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 125 }, { "completion_length": 185.25, "epoch": 0.01686069851465275, "grad_norm": 0.5849815011024475, "kl": 0.124032162129879, "learning_rate": 4.903154239845798e-06, "loss": 0.005, "reward": 2.187624931335449, "reward_std": 1.093903660774231, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250124990940094, "step": 126 }, { "completion_length": 126.375, "epoch": 0.016994513582229358, "grad_norm": 1.0223439931869507, "kl": 0.10580593347549438, "learning_rate": 4.899688103857223e-06, "loss": 0.0042, "reward": 2.4826250076293945, "reward_std": 0.7085382342338562, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23262500762939453, "step": 127 }, { "completion_length": 87.0, "epoch": 0.017128328649805968, "grad_norm": 1.8181575536727905, "kl": 0.15453141927719116, "learning_rate": 4.8961622956005895e-06, "loss": 0.0062, "reward": 2.765749931335449, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.265749990940094, "step": 128 }, { "completion_length": 120.875, "epoch": 0.017262143717382578, "grad_norm": 1.2288424968719482, "kl": 0.11860083043575287, "learning_rate": 4.892576902751031e-06, "loss": 0.0047, "reward": 2.2814998626708984, "reward_std": 0.9468604922294617, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.281499981880188, "step": 129 }, { "completion_length": 152.5, "epoch": 0.017395958784959187, "grad_norm": 0.8429675102233887, "kl": 0.0919552594423294, "learning_rate": 4.8889320144653525e-06, "loss": 0.0037, "reward": 0.781499981880188, "reward_std": 0.05832665413618088, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.281499981880188, "step": 130 }, { "completion_length": 82.5, "epoch": 0.017529773852535794, "grad_norm": 1.081695556640625, "kl": 0.339735209941864, "learning_rate": 4.885227721379811e-06, "loss": 0.0136, "reward": 2.7756247520446777, "reward_std": 0.0973593145608902, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27562499046325684, "step": 131 }, { "completion_length": 82.5, "epoch": 0.017663588920112404, "grad_norm": 1.2999768257141113, "kl": 0.14531441032886505, "learning_rate": 4.881464115607866e-06, "loss": 0.0058, "reward": 2.8287501335144043, "reward_std": 0.06521117687225342, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32874998450279236, "step": 132 }, { "completion_length": 126.375, "epoch": 0.017797403987689014, "grad_norm": 0.8508877158164978, "kl": 0.1247982382774353, "learning_rate": 4.8776412907378845e-06, "loss": 0.005, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 133 }, { "completion_length": 125.125, "epoch": 0.017931219055265624, "grad_norm": 0.901710033416748, "kl": 0.1852751076221466, "learning_rate": 4.873759341830816e-06, "loss": 0.0074, "reward": 2.555999994277954, "reward_std": 0.686331033706665, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3059999942779541, "step": 134 }, { "completion_length": 106.25, "epoch": 0.018065034122842234, "grad_norm": 0.9589298367500305, "kl": 0.12752312421798706, "learning_rate": 4.86981836541783e-06, "loss": 0.0051, "reward": 2.0768747329711914, "reward_std": 1.1054009199142456, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3268749713897705, "step": 135 }, { "completion_length": 154.5, "epoch": 0.01819884919041884, "grad_norm": 1.1417564153671265, "kl": 0.09896090626716614, "learning_rate": 4.865818459497911e-06, "loss": 0.004, "reward": 2.375999927520752, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 136 }, { "completion_length": 99.125, "epoch": 0.01833266425799545, "grad_norm": 0.16242574155330658, "kl": 0.1598663628101349, "learning_rate": 4.861759723535427e-06, "loss": 0.0064, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 137 }, { "completion_length": 179.0, "epoch": 0.01846647932557206, "grad_norm": 0.5544370412826538, "kl": 0.09943100810050964, "learning_rate": 4.8576422584576515e-06, "loss": 0.004, "reward": 2.7472500801086426, "reward_std": 0.2400759607553482, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24724999070167542, "step": 138 }, { "completion_length": 112.625, "epoch": 0.01860029439314867, "grad_norm": 0.04756677895784378, "kl": 0.11926989257335663, "learning_rate": 4.853466166652259e-06, "loss": 0.0048, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 139 }, { "completion_length": 139.25, "epoch": 0.018734109460725276, "grad_norm": 0.7633950710296631, "kl": 0.12160079181194305, "learning_rate": 4.849231551964771e-06, "loss": 0.0049, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 140 }, { "completion_length": 170.625, "epoch": 0.018867924528301886, "grad_norm": 0.7672199010848999, "kl": 0.09167491644620895, "learning_rate": 4.844938519695985e-06, "loss": 0.0037, "reward": 1.891374945640564, "reward_std": 1.3605663776397705, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 141 }, { "completion_length": 106.125, "epoch": 0.019001739595878496, "grad_norm": 0.605826199054718, "kl": 0.14297348260879517, "learning_rate": 4.8405871765993435e-06, "loss": 0.0057, "reward": 2.838749885559082, "reward_std": 0.08693470060825348, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3387500047683716, "step": 142 }, { "completion_length": 94.375, "epoch": 0.019135554663455106, "grad_norm": 0.1924724280834198, "kl": 0.16810902953147888, "learning_rate": 4.836177630878289e-06, "loss": 0.0067, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 143 }, { "completion_length": 144.375, "epoch": 0.019269369731031716, "grad_norm": 0.7961549162864685, "kl": 0.11281492561101913, "learning_rate": 4.8317099921835695e-06, "loss": 0.0045, "reward": 2.313499927520752, "reward_std": 1.0500850677490234, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 144 }, { "completion_length": 136.5, "epoch": 0.019403184798608322, "grad_norm": 0.5828819870948792, "kl": 0.10745584964752197, "learning_rate": 4.827184371610511e-06, "loss": 0.0043, "reward": 0.8603749871253967, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 145 }, { "completion_length": 91.125, "epoch": 0.019536999866184932, "grad_norm": 0.10943334549665451, "kl": 0.12454424798488617, "learning_rate": 4.822600881696256e-06, "loss": 0.005, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 146 }, { "completion_length": 126.75, "epoch": 0.019670814933761542, "grad_norm": 1.3740291595458984, "kl": 0.09184177964925766, "learning_rate": 4.817959636416969e-06, "loss": 0.0037, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 147 }, { "completion_length": 119.875, "epoch": 0.019804630001338152, "grad_norm": 0.08919162303209305, "kl": 0.11414283514022827, "learning_rate": 4.813260751184992e-06, "loss": 0.0046, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 148 }, { "completion_length": 175.25, "epoch": 0.01993844506891476, "grad_norm": 0.45028820633888245, "kl": 0.07411466538906097, "learning_rate": 4.8085043428459865e-06, "loss": 0.003, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 149 }, { "completion_length": 169.5, "epoch": 0.02007226013649137, "grad_norm": 0.9182133078575134, "kl": 0.10268989205360413, "learning_rate": 4.80369052967602e-06, "loss": 0.0041, "reward": 2.2977499961853027, "reward_std": 1.075366497039795, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 150 }, { "completion_length": 192.375, "epoch": 0.020206075204067978, "grad_norm": 0.8185623288154602, "kl": 0.04303750395774841, "learning_rate": 4.7988194313786275e-06, "loss": 0.0017, "reward": 0.531499981880188, "reward_std": 0.9517138004302979, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21899999678134918, "step": 151 }, { "completion_length": 94.5, "epoch": 0.020339890271644588, "grad_norm": 0.07721685618162155, "kl": 0.13051648437976837, "learning_rate": 4.793891169081835e-06, "loss": 0.0052, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 152 }, { "completion_length": 79.75, "epoch": 0.020473705339221198, "grad_norm": 0.08421813696622849, "kl": 0.196038618683815, "learning_rate": 4.7889058653351485e-06, "loss": 0.0078, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 153 }, { "completion_length": 200.0, "epoch": 0.020607520406797804, "grad_norm": 0.4328741431236267, "kl": 0.04231194406747818, "learning_rate": 4.783863644106502e-06, "loss": 0.0017, "reward": 0.453125, "reward_std": 0.9280776381492615, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.140625, "step": 154 }, { "completion_length": 141.125, "epoch": 0.020741335474374414, "grad_norm": 0.8119263648986816, "kl": 0.10133925080299377, "learning_rate": 4.778764630779184e-06, "loss": 0.0041, "reward": 2.125999927520752, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 155 }, { "completion_length": 88.75, "epoch": 0.020875150541951024, "grad_norm": 0.595395028591156, "kl": 0.11204063892364502, "learning_rate": 4.773608952148706e-06, "loss": 0.0045, "reward": 0.9909999370574951, "reward_std": 1.1637746095657349, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3659999966621399, "step": 156 }, { "completion_length": 120.75, "epoch": 0.021008965609527634, "grad_norm": 0.734516978263855, "kl": 0.11002637445926666, "learning_rate": 4.7683967364196624e-06, "loss": 0.0044, "reward": 2.175874948501587, "reward_std": 1.2138733863830566, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3008750081062317, "step": 157 }, { "completion_length": 104.125, "epoch": 0.02114278067710424, "grad_norm": 0.7840255498886108, "kl": 0.10139716416597366, "learning_rate": 4.7631281132025374e-06, "loss": 0.0041, "reward": 2.856374979019165, "reward_std": 0.05550778657197952, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35637497901916504, "step": 158 }, { "completion_length": 142.0, "epoch": 0.02127659574468085, "grad_norm": 0.8949174880981445, "kl": 0.06621480733156204, "learning_rate": 4.75780321351048e-06, "loss": 0.0026, "reward": 1.0048749446868896, "reward_std": 1.13569176197052, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19237498939037323, "step": 159 }, { "completion_length": 97.625, "epoch": 0.02141041081225746, "grad_norm": 1.2893048524856567, "kl": 0.11432349681854248, "learning_rate": 4.752422169756048e-06, "loss": 0.0046, "reward": 2.856374979019165, "reward_std": 0.05550778657197952, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3563750088214874, "step": 160 }, { "completion_length": 104.25, "epoch": 0.02154422587983407, "grad_norm": 0.6083778142929077, "kl": 0.13512399792671204, "learning_rate": 4.746985115747918e-06, "loss": 0.0054, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 161 }, { "completion_length": 134.0, "epoch": 0.021678040947410677, "grad_norm": 0.1701803058385849, "kl": 0.11121401935815811, "learning_rate": 4.741492186687552e-06, "loss": 0.0044, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 162 }, { "completion_length": 160.375, "epoch": 0.021811856014987287, "grad_norm": 0.6269301772117615, "kl": 0.09858278930187225, "learning_rate": 4.735943519165843e-06, "loss": 0.0039, "reward": 1.369499921798706, "reward_std": 1.2905350923538208, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3070000112056732, "step": 163 }, { "completion_length": 111.125, "epoch": 0.021945671082563897, "grad_norm": 0.027580501511693, "kl": 0.09376178681850433, "learning_rate": 4.730339251159709e-06, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 164 }, { "completion_length": 188.25, "epoch": 0.022079486150140507, "grad_norm": 0.48957961797714233, "kl": 0.024275891482830048, "learning_rate": 4.724679522028672e-06, "loss": 0.001, "reward": 0.906624972820282, "reward_std": 1.2428367137908936, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 165 }, { "completion_length": 196.875, "epoch": 0.022213301217717116, "grad_norm": 0.6642765998840332, "kl": 0.022328466176986694, "learning_rate": 4.718964472511386e-06, "loss": 0.0009, "reward": 0.828374981880188, "reward_std": 1.2645572423934937, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20337499678134918, "step": 166 }, { "completion_length": 107.5, "epoch": 0.022347116285293723, "grad_norm": 0.5368108749389648, "kl": 0.07350773364305496, "learning_rate": 4.713194244722138e-06, "loss": 0.0029, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 167 }, { "completion_length": 109.625, "epoch": 0.022480931352870333, "grad_norm": 0.014060646295547485, "kl": 0.10311411321163177, "learning_rate": 4.707368982147318e-06, "loss": 0.0041, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 168 }, { "completion_length": 110.75, "epoch": 0.022614746420446943, "grad_norm": 0.17670457065105438, "kl": 0.15290237963199615, "learning_rate": 4.701488829641845e-06, "loss": 0.0061, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 169 }, { "completion_length": 130.0, "epoch": 0.022748561488023553, "grad_norm": 0.7835637927055359, "kl": 0.10438250750303268, "learning_rate": 4.6955539334255714e-06, "loss": 0.0042, "reward": 2.8504998683929443, "reward_std": 0.0721249133348465, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3504999876022339, "step": 170 }, { "completion_length": 187.625, "epoch": 0.02288237655560016, "grad_norm": 0.8391973376274109, "kl": 0.1589171439409256, "learning_rate": 4.6895644410796415e-06, "loss": 0.0064, "reward": 1.6410000324249268, "reward_std": 1.3087522983551025, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26600000262260437, "step": 171 }, { "completion_length": 195.75, "epoch": 0.02301619162317677, "grad_norm": 0.5057268738746643, "kl": 0.029908571392297745, "learning_rate": 4.683520501542825e-06, "loss": 0.0012, "reward": 0.578374981880188, "reward_std": 0.9638639688491821, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.203374981880188, "step": 172 }, { "completion_length": 200.0, "epoch": 0.02315000669075338, "grad_norm": 0.01851646415889263, "kl": 0.01410270482301712, "learning_rate": 4.6774222651078104e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 173 }, { "completion_length": 101.875, "epoch": 0.02328382175832999, "grad_norm": 0.03979469835758209, "kl": 0.11010269075632095, "learning_rate": 4.671269883417473e-06, "loss": 0.0044, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 174 }, { "completion_length": 105.25, "epoch": 0.0234176368259066, "grad_norm": 1.0305979251861572, "kl": 0.35665902495384216, "learning_rate": 4.665063509461098e-06, "loss": 0.0143, "reward": 2.6034998893737793, "reward_std": 0.7008515000343323, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3534999489784241, "step": 175 }, { "completion_length": 162.875, "epoch": 0.023551451893483205, "grad_norm": 0.3956729471683502, "kl": 0.050653666257858276, "learning_rate": 4.658803297570578e-06, "loss": 0.002, "reward": 2.5007500648498535, "reward_std": 0.9639577269554138, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 176 }, { "completion_length": 193.0, "epoch": 0.023685266961059815, "grad_norm": 0.49547824263572693, "kl": 0.04203198850154877, "learning_rate": 4.652489403416579e-06, "loss": 0.0017, "reward": 1.531749963760376, "reward_std": 1.4378401041030884, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.281749963760376, "step": 177 }, { "completion_length": 189.375, "epoch": 0.023819082028636425, "grad_norm": 0.5608797669410706, "kl": 0.03855575621128082, "learning_rate": 4.646121984004666e-06, "loss": 0.0015, "reward": 0.922249972820282, "reward_std": 1.232347846031189, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 178 }, { "completion_length": 104.625, "epoch": 0.023952897096213035, "grad_norm": 0.7825055718421936, "kl": 0.10920294374227524, "learning_rate": 4.639701197671397e-06, "loss": 0.0044, "reward": 2.610374927520752, "reward_std": 0.7021570801734924, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 179 }, { "completion_length": 138.375, "epoch": 0.02408671216378964, "grad_norm": 0.5411584377288818, "kl": 0.08357924222946167, "learning_rate": 4.633227204080389e-06, "loss": 0.0033, "reward": 2.8388748168945312, "reward_std": 0.10500533133745193, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3388749957084656, "step": 180 }, { "completion_length": 162.5, "epoch": 0.02422052723136625, "grad_norm": 0.3942194879055023, "kl": 0.05526455491781235, "learning_rate": 4.626700164218349e-06, "loss": 0.0022, "reward": 1.938249945640564, "reward_std": 1.3148058652877808, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 181 }, { "completion_length": 126.25, "epoch": 0.02435434229894286, "grad_norm": 0.5942544341087341, "kl": 0.0846291184425354, "learning_rate": 4.620120240391065e-06, "loss": 0.0034, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 182 }, { "completion_length": 132.375, "epoch": 0.02448815736651947, "grad_norm": 0.7408034205436707, "kl": 0.0697813481092453, "learning_rate": 4.613487596219376e-06, "loss": 0.0028, "reward": 2.045875072479248, "reward_std": 1.0378040075302124, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29587501287460327, "step": 183 }, { "completion_length": 115.375, "epoch": 0.02462197243409608, "grad_norm": 0.8724160194396973, "kl": 0.08768169581890106, "learning_rate": 4.606802396635098e-06, "loss": 0.0035, "reward": 2.19350004196167, "reward_std": 1.2267396450042725, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31849998235702515, "step": 184 }, { "completion_length": 110.625, "epoch": 0.024755787501672687, "grad_norm": 1.1736552715301514, "kl": 0.10424770414829254, "learning_rate": 4.600064807876929e-06, "loss": 0.0042, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 185 }, { "completion_length": 121.125, "epoch": 0.024889602569249297, "grad_norm": 0.8237473368644714, "kl": 0.09245938062667847, "learning_rate": 4.593274997486309e-06, "loss": 0.0037, "reward": 1.375999927520752, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 186 }, { "completion_length": 108.5, "epoch": 0.025023417636825907, "grad_norm": 0.01539183035492897, "kl": 0.08160560578107834, "learning_rate": 4.586433134303257e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 187 }, { "completion_length": 139.375, "epoch": 0.025157232704402517, "grad_norm": 0.7563353776931763, "kl": 0.09520469605922699, "learning_rate": 4.5795393884621735e-06, "loss": 0.0038, "reward": 2.3602499961853027, "reward_std": 0.9171299934387207, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 188 }, { "completion_length": 134.375, "epoch": 0.025291047771979124, "grad_norm": 0.3767932653427124, "kl": 0.08863071352243423, "learning_rate": 4.572593931387604e-06, "loss": 0.0035, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 189 }, { "completion_length": 98.125, "epoch": 0.025424862839555733, "grad_norm": 0.05503019690513611, "kl": 0.11068596690893173, "learning_rate": 4.565596935789987e-06, "loss": 0.0044, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 190 }, { "completion_length": 142.25, "epoch": 0.025558677907132343, "grad_norm": 0.013801062479615211, "kl": 0.07021953165531158, "learning_rate": 4.558548575661348e-06, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 191 }, { "completion_length": 129.875, "epoch": 0.025692492974708953, "grad_norm": 0.5550726652145386, "kl": 0.12393409013748169, "learning_rate": 4.551449026270979e-06, "loss": 0.005, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 192 }, { "completion_length": 143.0, "epoch": 0.02582630804228556, "grad_norm": 0.013020144775509834, "kl": 0.07056255638599396, "learning_rate": 4.544298464161079e-06, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 193 }, { "completion_length": 119.0, "epoch": 0.02596012310986217, "grad_norm": 0.051235951483249664, "kl": 0.07050454616546631, "learning_rate": 4.537097067142363e-06, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 194 }, { "completion_length": 168.375, "epoch": 0.02609393817743878, "grad_norm": 0.6408933997154236, "kl": 0.056091420352458954, "learning_rate": 4.529845014289642e-06, "loss": 0.0022, "reward": 1.3443748950958252, "reward_std": 1.305866003036499, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 195 }, { "completion_length": 128.0, "epoch": 0.02622775324501539, "grad_norm": 0.02805783413350582, "kl": 0.08952876925468445, "learning_rate": 4.522542485937369e-06, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 196 }, { "completion_length": 124.125, "epoch": 0.026361568312592, "grad_norm": 0.5922777056694031, "kl": 0.07994011044502258, "learning_rate": 4.5151896636751554e-06, "loss": 0.0032, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 197 }, { "completion_length": 160.75, "epoch": 0.026495383380168606, "grad_norm": 0.6661368608474731, "kl": 0.0835915058851242, "learning_rate": 4.507786730343255e-06, "loss": 0.0033, "reward": 1.1102499961853027, "reward_std": 0.7148317694664001, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 198 }, { "completion_length": 112.375, "epoch": 0.026629198447745216, "grad_norm": 0.6084420084953308, "kl": 0.08125758171081543, "learning_rate": 4.500333870028017e-06, "loss": 0.0033, "reward": 2.8446249961853027, "reward_std": 0.08874184638261795, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 199 }, { "completion_length": 200.0, "epoch": 0.026763013515321826, "grad_norm": 0.39239463210105896, "kl": 0.005889165215194225, "learning_rate": 4.492831268057307e-06, "loss": 0.0002, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.140625, "step": 200 }, { "completion_length": 148.625, "epoch": 0.026896828582898435, "grad_norm": 0.4224630296230316, "kl": 0.1279647946357727, "learning_rate": 4.485279110995903e-06, "loss": 0.0051, "reward": 2.78125, "reward_std": 0.1766147017478943, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28125, "step": 201 }, { "completion_length": 152.25, "epoch": 0.027030643650475042, "grad_norm": 0.020336441695690155, "kl": 0.08182498812675476, "learning_rate": 4.477677586640854e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 202 }, { "completion_length": 165.25, "epoch": 0.027164458718051652, "grad_norm": 0.3059023916721344, "kl": 0.06548270583152771, "learning_rate": 4.470026884016805e-06, "loss": 0.0026, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 203 }, { "completion_length": 151.5, "epoch": 0.02729827378562826, "grad_norm": 0.6000714302062988, "kl": 0.06207146868109703, "learning_rate": 4.4623271933713065e-06, "loss": 0.0025, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 204 }, { "completion_length": 181.375, "epoch": 0.02743208885320487, "grad_norm": 0.4132672846317291, "kl": 0.060861602425575256, "learning_rate": 4.454578706170075e-06, "loss": 0.0024, "reward": 1.563249945640564, "reward_std": 1.4065916538238525, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 205 }, { "completion_length": 91.625, "epoch": 0.02756590392078148, "grad_norm": 0.06006154417991638, "kl": 0.1321602314710617, "learning_rate": 4.446781615092235e-06, "loss": 0.0053, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 206 }, { "completion_length": 186.625, "epoch": 0.027699718988358088, "grad_norm": 0.5306841135025024, "kl": 0.04687389358878136, "learning_rate": 4.438936114025531e-06, "loss": 0.0019, "reward": 1.594249963760376, "reward_std": 1.3577654361724854, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28174999356269836, "step": 207 }, { "completion_length": 139.75, "epoch": 0.027833534055934698, "grad_norm": 0.7287383675575256, "kl": 0.08763079345226288, "learning_rate": 4.431042398061499e-06, "loss": 0.0035, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 208 }, { "completion_length": 157.125, "epoch": 0.027967349123511308, "grad_norm": 0.47190096974372864, "kl": 0.06745409965515137, "learning_rate": 4.423100663490622e-06, "loss": 0.0027, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 209 }, { "completion_length": 145.875, "epoch": 0.028101164191087918, "grad_norm": 0.6960317492485046, "kl": 0.07130832970142365, "learning_rate": 4.415111107797445e-06, "loss": 0.0029, "reward": 2.8172500133514404, "reward_std": 0.16616998612880707, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31724998354911804, "step": 210 }, { "completion_length": 150.125, "epoch": 0.028234979258664524, "grad_norm": 0.6259533762931824, "kl": 0.06684979796409607, "learning_rate": 4.4070739296556665e-06, "loss": 0.0027, "reward": 2.2977499961853027, "reward_std": 1.0837033987045288, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 211 }, { "completion_length": 142.125, "epoch": 0.028368794326241134, "grad_norm": 1.0033406019210815, "kl": 0.08088065683841705, "learning_rate": 4.398989328923196e-06, "loss": 0.0032, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 212 }, { "completion_length": 135.0, "epoch": 0.028502609393817744, "grad_norm": 0.7785513401031494, "kl": 0.0884198471903801, "learning_rate": 4.390857506637184e-06, "loss": 0.0035, "reward": 1.909749984741211, "reward_std": 1.3143266439437866, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28474998474121094, "step": 213 }, { "completion_length": 186.875, "epoch": 0.028636424461394354, "grad_norm": 0.5992149710655212, "kl": 0.04777985066175461, "learning_rate": 4.382678665009028e-06, "loss": 0.0019, "reward": 1.953874945640564, "reward_std": 1.2907007932662964, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 214 }, { "completion_length": 120.375, "epoch": 0.028770239528970964, "grad_norm": 0.5147307515144348, "kl": 0.13163134455680847, "learning_rate": 4.374453007419336e-06, "loss": 0.0053, "reward": 2.8451249599456787, "reward_std": 0.08732764422893524, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3451249599456787, "step": 215 }, { "completion_length": 141.375, "epoch": 0.02890405459654757, "grad_norm": 0.5681592226028442, "kl": 0.06564359366893768, "learning_rate": 4.366180738412876e-06, "loss": 0.0026, "reward": 2.610374927520752, "reward_std": 0.7021570801734924, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 216 }, { "completion_length": 185.125, "epoch": 0.02903786966412418, "grad_norm": 0.4995934069156647, "kl": 0.05177675187587738, "learning_rate": 4.357862063693486e-06, "loss": 0.0021, "reward": 1.594249963760376, "reward_std": 1.3577654361724854, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28174999356269836, "step": 217 }, { "completion_length": 152.25, "epoch": 0.02917168473170079, "grad_norm": 0.5478213429450989, "kl": 0.14589357376098633, "learning_rate": 4.3494971901189605e-06, "loss": 0.0058, "reward": 2.842750072479248, "reward_std": 0.09404517710208893, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3427499830722809, "step": 218 }, { "completion_length": 107.0, "epoch": 0.0293054997992774, "grad_norm": 0.06388695538043976, "kl": 0.11791018396615982, "learning_rate": 4.341086325695905e-06, "loss": 0.0047, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 219 }, { "completion_length": 193.125, "epoch": 0.029439314866854006, "grad_norm": 0.5061191916465759, "kl": 0.02456263080239296, "learning_rate": 4.332629679574566e-06, "loss": 0.001, "reward": 0.656624972820282, "reward_std": 0.9587939977645874, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 220 }, { "completion_length": 164.0, "epoch": 0.029573129934430616, "grad_norm": 0.616869330406189, "kl": 0.05638063699007034, "learning_rate": 4.324127462043628e-06, "loss": 0.0023, "reward": 2.2507500648498535, "reward_std": 1.1021733283996582, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 221 }, { "completion_length": 192.125, "epoch": 0.029706945002007226, "grad_norm": 0.5864559412002563, "kl": 0.023192185908555984, "learning_rate": 4.315579884524983e-06, "loss": 0.0009, "reward": 0.656624972820282, "reward_std": 0.9587939977645874, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 222 }, { "completion_length": 168.625, "epoch": 0.029840760069583836, "grad_norm": 0.033604927361011505, "kl": 0.06973608583211899, "learning_rate": 4.3069871595684795e-06, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 223 }, { "completion_length": 102.5, "epoch": 0.029974575137160443, "grad_norm": 0.040112487971782684, "kl": 0.1290511041879654, "learning_rate": 4.2983495008466285e-06, "loss": 0.0052, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 224 }, { "completion_length": 109.75, "epoch": 0.030108390204737052, "grad_norm": 0.027540424838662148, "kl": 0.10206885635852814, "learning_rate": 4.289667123149296e-06, "loss": 0.0041, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 225 }, { "completion_length": 111.5, "epoch": 0.030242205272313662, "grad_norm": 0.047440025955438614, "kl": 0.10963985323905945, "learning_rate": 4.280940242378363e-06, "loss": 0.0044, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 226 }, { "completion_length": 170.625, "epoch": 0.030376020339890272, "grad_norm": 0.5483851432800293, "kl": 0.05589936301112175, "learning_rate": 4.2721690755423485e-06, "loss": 0.0022, "reward": 1.9537500143051147, "reward_std": 1.2821447849273682, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32874998450279236, "step": 227 }, { "completion_length": 183.625, "epoch": 0.030509835407466882, "grad_norm": 0.522140383720398, "kl": 0.08230659365653992, "learning_rate": 4.263353840751023e-06, "loss": 0.0033, "reward": 2.5320000648498535, "reward_std": 0.9231207370758057, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34450000524520874, "step": 228 }, { "completion_length": 178.125, "epoch": 0.03064365047504349, "grad_norm": 0.6429304480552673, "kl": 0.11478687077760696, "learning_rate": 4.2544947572099795e-06, "loss": 0.0046, "reward": 2.4912500381469727, "reward_std": 0.9193102121353149, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3037499785423279, "step": 229 }, { "completion_length": 184.0, "epoch": 0.0307774655426201, "grad_norm": 0.5347118377685547, "kl": 0.030306361615657806, "learning_rate": 4.245592045215182e-06, "loss": 0.0012, "reward": 1.500499963760376, "reward_std": 1.4704713821411133, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25049999356269836, "step": 230 }, { "completion_length": 148.625, "epoch": 0.03091128061019671, "grad_norm": 0.6875784993171692, "kl": 0.06860403716564178, "learning_rate": 4.236645926147493e-06, "loss": 0.0027, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 231 }, { "completion_length": 172.0, "epoch": 0.03104509567777332, "grad_norm": 0.4069645404815674, "kl": 0.06288283318281174, "learning_rate": 4.227656622467162e-06, "loss": 0.0025, "reward": 1.250749945640564, "reward_std": 1.3498412370681763, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31324997544288635, "step": 232 }, { "completion_length": 81.75, "epoch": 0.031178910745349925, "grad_norm": 0.019883032888174057, "kl": 0.10283654183149338, "learning_rate": 4.218624357708296e-06, "loss": 0.0041, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 233 }, { "completion_length": 195.875, "epoch": 0.03131272581292654, "grad_norm": 0.5214827656745911, "kl": 0.024527832865715027, "learning_rate": 4.2095493564733e-06, "loss": 0.001, "reward": 1.140874981880188, "reward_std": 1.4025797843933105, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.203374981880188, "step": 234 }, { "completion_length": 109.875, "epoch": 0.031446540880503145, "grad_norm": 0.8858596682548523, "kl": 0.20951178669929504, "learning_rate": 4.200431844427299e-06, "loss": 0.0084, "reward": 2.2718749046325684, "reward_std": 1.1195122003555298, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3343749940395355, "step": 235 }, { "completion_length": 171.0, "epoch": 0.03158035594807975, "grad_norm": 0.41783320903778076, "kl": 0.06993798911571503, "learning_rate": 4.191272048292514e-06, "loss": 0.0028, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 236 }, { "completion_length": 199.5, "epoch": 0.031714171015656364, "grad_norm": 0.37598511576652527, "kl": 0.017386702820658684, "learning_rate": 4.182070195842632e-06, "loss": 0.0007, "reward": 0.484499990940094, "reward_std": 0.9673013687133789, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 237 }, { "completion_length": 134.875, "epoch": 0.03184798608323297, "grad_norm": 0.011362828314304352, "kl": 0.06715571880340576, "learning_rate": 4.172826515897146e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 238 }, { "completion_length": 157.625, "epoch": 0.031981801150809584, "grad_norm": 0.6809982061386108, "kl": 0.06758665293455124, "learning_rate": 4.1635412383156535e-06, "loss": 0.0027, "reward": 1.776249885559082, "reward_std": 1.4638855457305908, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21375000476837158, "step": 239 }, { "completion_length": 200.0, "epoch": 0.03211561621838619, "grad_norm": 0.030529417097568512, "kl": 0.015553316101431847, "learning_rate": 4.154214593992149e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 240 }, { "completion_length": 200.0, "epoch": 0.0322494312859628, "grad_norm": 0.022231485694646835, "kl": 0.016756955534219742, "learning_rate": 4.144846814849282e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 241 }, { "completion_length": 187.125, "epoch": 0.03238324635353941, "grad_norm": 0.4926639795303345, "kl": 0.06131409481167793, "learning_rate": 4.135438133832586e-06, "loss": 0.0025, "reward": 1.2818748950958252, "reward_std": 1.3429245948791504, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 242 }, { "completion_length": 121.375, "epoch": 0.03251706142111602, "grad_norm": 1.4475477933883667, "kl": 0.07874596118927002, "learning_rate": 4.125988784904691e-06, "loss": 0.0031, "reward": 1.392124891281128, "reward_std": 1.2887601852416992, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2671250104904175, "step": 243 }, { "completion_length": 106.625, "epoch": 0.03265087648869262, "grad_norm": 0.027621116489171982, "kl": 0.0873013436794281, "learning_rate": 4.116499003039499e-06, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 244 }, { "completion_length": 143.875, "epoch": 0.03278469155626924, "grad_norm": 0.7832835912704468, "kl": 0.05882447212934494, "learning_rate": 4.106969024216348e-06, "loss": 0.0024, "reward": 1.766374945640564, "reward_std": 1.2093526124954224, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 245 }, { "completion_length": 138.375, "epoch": 0.03291850662384584, "grad_norm": 0.037107665091753006, "kl": 0.08382681757211685, "learning_rate": 4.09739908541414e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 246 }, { "completion_length": 129.125, "epoch": 0.03305232169142246, "grad_norm": 0.8398261070251465, "kl": 0.09861581027507782, "learning_rate": 4.087789424605447e-06, "loss": 0.0039, "reward": 2.5464999675750732, "reward_std": 0.9319666624069214, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35899996757507324, "step": 247 }, { "completion_length": 134.0, "epoch": 0.03318613675899906, "grad_norm": 0.6909288167953491, "kl": 0.09271075576543808, "learning_rate": 4.078140280750598e-06, "loss": 0.0037, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 248 }, { "completion_length": 134.875, "epoch": 0.03331995182657567, "grad_norm": 0.6626593470573425, "kl": 0.07245510816574097, "learning_rate": 4.068451893791732e-06, "loss": 0.0029, "reward": 2.5320000648498535, "reward_std": 0.8730131983757019, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34450000524520874, "step": 249 }, { "completion_length": 196.875, "epoch": 0.03345376689415228, "grad_norm": 0.3789691925048828, "kl": 0.021791741251945496, "learning_rate": 4.058724504646834e-06, "loss": 0.0009, "reward": 1.1566250324249268, "reward_std": 1.390452265739441, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 250 }, { "completion_length": 148.625, "epoch": 0.03358758196172889, "grad_norm": 0.016400832682847977, "kl": 0.05332140624523163, "learning_rate": 4.048958355203746e-06, "loss": 0.0021, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 251 }, { "completion_length": 96.25, "epoch": 0.0337213970293055, "grad_norm": 0.01429223082959652, "kl": 0.0820155069231987, "learning_rate": 4.039153688314146e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 252 }, { "completion_length": 200.0, "epoch": 0.03385521209688211, "grad_norm": 0.019409598782658577, "kl": 0.012331460602581501, "learning_rate": 4.029310747787516e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 253 }, { "completion_length": 122.25, "epoch": 0.033989027164458716, "grad_norm": 0.025042589753866196, "kl": 0.08574492484331131, "learning_rate": 4.019429778385076e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 254 }, { "completion_length": 148.0, "epoch": 0.03412284223203533, "grad_norm": 0.8560662269592285, "kl": 0.04505892097949982, "learning_rate": 4.009511025813694e-06, "loss": 0.0018, "reward": 2.125999927520752, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 255 }, { "completion_length": 133.375, "epoch": 0.034256657299611935, "grad_norm": 0.027326101437211037, "kl": 0.07807411253452301, "learning_rate": 3.999554736719785e-06, "loss": 0.0031, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 256 }, { "completion_length": 154.25, "epoch": 0.03439047236718855, "grad_norm": 0.6151906251907349, "kl": 0.09878014773130417, "learning_rate": 3.989561158683168e-06, "loss": 0.004, "reward": 2.001124858856201, "reward_std": 1.1283208131790161, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3136249780654907, "step": 257 }, { "completion_length": 200.0, "epoch": 0.034524287434765155, "grad_norm": 0.0310263279825449, "kl": 0.012863817624747753, "learning_rate": 3.97953054021092e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 127.125, "epoch": 0.03465810250234176, "grad_norm": 0.14287994801998138, "kl": 0.07523702085018158, "learning_rate": 3.969463130731183e-06, "loss": 0.003, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 259 }, { "completion_length": 190.5, "epoch": 0.034791917569918375, "grad_norm": 0.523107647895813, "kl": 0.04187753051519394, "learning_rate": 3.9593591805869755e-06, "loss": 0.0017, "reward": 2.1725001335144043, "reward_std": 1.2644920349121094, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29749998450279236, "step": 260 }, { "completion_length": 109.125, "epoch": 0.03492573263749498, "grad_norm": 0.8329552412033081, "kl": 0.07708024233579636, "learning_rate": 3.949218941029956e-06, "loss": 0.0031, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 261 }, { "completion_length": 200.0, "epoch": 0.03505954770507159, "grad_norm": 0.023976868018507957, "kl": 0.010437151417136192, "learning_rate": 3.939042664214185e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 262 }, { "completion_length": 97.25, "epoch": 0.0351933627726482, "grad_norm": 0.039311762899160385, "kl": 0.09688901156187057, "learning_rate": 3.9288306031898435e-06, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 263 }, { "completion_length": 167.5, "epoch": 0.03532717784022481, "grad_norm": 0.5073589086532593, "kl": 0.0493597537279129, "learning_rate": 3.918583011896955e-06, "loss": 0.002, "reward": 0.907124936580658, "reward_std": 0.8498950600624084, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 264 }, { "completion_length": 161.375, "epoch": 0.03546099290780142, "grad_norm": 0.5521835088729858, "kl": 0.05903391167521477, "learning_rate": 3.908300145159055e-06, "loss": 0.0024, "reward": 1.5164999961853027, "reward_std": 1.1524944305419922, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32899999618530273, "step": 265 }, { "completion_length": 196.375, "epoch": 0.03559480797537803, "grad_norm": 0.5117291808128357, "kl": 0.03451118618249893, "learning_rate": 3.897982258676867e-06, "loss": 0.0014, "reward": 0.5318750143051147, "reward_std": 0.37694767117500305, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 266 }, { "completion_length": 104.0, "epoch": 0.035728623042954634, "grad_norm": 1.0141657590866089, "kl": 0.11430563032627106, "learning_rate": 3.887629609021938e-06, "loss": 0.0046, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 267 }, { "completion_length": 196.375, "epoch": 0.03586243811053125, "grad_norm": 0.6651949286460876, "kl": 0.02111053466796875, "learning_rate": 3.8772424536302565e-06, "loss": 0.0008, "reward": 0.906624972820282, "reward_std": 1.2428367137908936, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912498772144318, "step": 268 }, { "completion_length": 135.125, "epoch": 0.035996253178107854, "grad_norm": 0.645706295967102, "kl": 0.07286133617162704, "learning_rate": 3.866821050795859e-06, "loss": 0.0029, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 269 }, { "completion_length": 106.125, "epoch": 0.03613006824568447, "grad_norm": 0.024054456502199173, "kl": 0.08228373527526855, "learning_rate": 3.856365659664399e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 270 }, { "completion_length": 144.125, "epoch": 0.036263883313261074, "grad_norm": 0.02275257185101509, "kl": 0.06845724582672119, "learning_rate": 3.845876540226707e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 271 }, { "completion_length": 169.5, "epoch": 0.03639769838083768, "grad_norm": 0.7437669634819031, "kl": 0.059088997542858124, "learning_rate": 3.835353953312322e-06, "loss": 0.0024, "reward": 1.7996249198913574, "reward_std": 1.3356761932373047, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2371249943971634, "step": 272 }, { "completion_length": 159.875, "epoch": 0.03653151344841429, "grad_norm": 0.5732333064079285, "kl": 0.06474175304174423, "learning_rate": 3.824798160583012e-06, "loss": 0.0026, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 273 }, { "completion_length": 200.0, "epoch": 0.0366653285159909, "grad_norm": 0.33648428320884705, "kl": 0.01363813504576683, "learning_rate": 3.8142094245262617e-06, "loss": 0.0005, "reward": 0.21875, "reward_std": 0.21906864643096924, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15625, "step": 274 }, { "completion_length": 103.5, "epoch": 0.036799143583567506, "grad_norm": 0.013237239792943, "kl": 0.09653908759355545, "learning_rate": 3.8035880084487454e-06, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 275 }, { "completion_length": 182.375, "epoch": 0.03693295865114412, "grad_norm": 0.5243744850158691, "kl": 0.04375709593296051, "learning_rate": 3.792934176469782e-06, "loss": 0.0018, "reward": 2.5164999961853027, "reward_std": 1.0168194770812988, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32899999618530273, "step": 276 }, { "completion_length": 197.375, "epoch": 0.037066773718720726, "grad_norm": 0.6301177740097046, "kl": 0.02924898825585842, "learning_rate": 3.782248193514766e-06, "loss": 0.0012, "reward": 1.500499963760376, "reward_std": 1.4704713821411133, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25049999356269836, "step": 277 }, { "completion_length": 128.0, "epoch": 0.03720058878629734, "grad_norm": 0.5154142379760742, "kl": 0.08466359227895737, "learning_rate": 3.7715303253085796e-06, "loss": 0.0034, "reward": 2.848374843597412, "reward_std": 0.07813525199890137, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34837496280670166, "step": 278 }, { "completion_length": 138.375, "epoch": 0.037334403853873946, "grad_norm": 0.48267894983291626, "kl": 0.08784278482198715, "learning_rate": 3.760780838368986e-06, "loss": 0.0035, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 279 }, { "completion_length": 163.75, "epoch": 0.03746821892145055, "grad_norm": 0.5836043953895569, "kl": 0.05314195528626442, "learning_rate": 3.7500000000000005e-06, "loss": 0.0021, "reward": 2.5633749961853027, "reward_std": 0.744371771812439, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31337499618530273, "step": 280 }, { "completion_length": 153.625, "epoch": 0.037602033989027166, "grad_norm": 0.031671296805143356, "kl": 0.07184700667858124, "learning_rate": 3.7391880782852446e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 281 }, { "completion_length": 187.0, "epoch": 0.03773584905660377, "grad_norm": 0.41911765933036804, "kl": 0.037687089294195175, "learning_rate": 3.7283453420812786e-06, "loss": 0.0015, "reward": 1.516124963760376, "reward_std": 1.4543431997299194, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26612499356269836, "step": 282 }, { "completion_length": 126.5, "epoch": 0.037869664124180386, "grad_norm": 0.4970996677875519, "kl": 0.06710168719291687, "learning_rate": 3.7174720610109184e-06, "loss": 0.0027, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 283 }, { "completion_length": 181.75, "epoch": 0.03800347919175699, "grad_norm": 0.6069383025169373, "kl": 0.042086225003004074, "learning_rate": 3.7065685054565277e-06, "loss": 0.0017, "reward": 0.82874995470047, "reward_std": 0.9015674591064453, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26624998450279236, "step": 284 }, { "completion_length": 137.25, "epoch": 0.0381372942593336, "grad_norm": 0.01716587506234646, "kl": 0.058556392788887024, "learning_rate": 3.695634946553296e-06, "loss": 0.0023, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 285 }, { "completion_length": 147.0, "epoch": 0.03827110932691021, "grad_norm": 0.017993135377764702, "kl": 0.07365481555461884, "learning_rate": 3.684671656182497e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 286 }, { "completion_length": 91.25, "epoch": 0.03840492439448682, "grad_norm": 0.04788558930158615, "kl": 0.1206275001168251, "learning_rate": 3.6736789069647273e-06, "loss": 0.0048, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 287 }, { "completion_length": 170.75, "epoch": 0.03853873946206343, "grad_norm": 0.6263538002967834, "kl": 0.05778669938445091, "learning_rate": 3.6626569722531268e-06, "loss": 0.0023, "reward": 2.1882500648498535, "reward_std": 1.2734655141830444, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 288 }, { "completion_length": 181.0, "epoch": 0.03867255452964004, "grad_norm": 0.6299733519554138, "kl": 0.04019211232662201, "learning_rate": 3.6516061261265813e-06, "loss": 0.0016, "reward": 0.8496249914169312, "reward_std": 1.2548407316207886, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16212499141693115, "step": 289 }, { "completion_length": 116.5, "epoch": 0.038806369597216644, "grad_norm": 0.6891494989395142, "kl": 0.10363874584436417, "learning_rate": 3.640526643382908e-06, "loss": 0.0041, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 290 }, { "completion_length": 135.125, "epoch": 0.03894018466479326, "grad_norm": 0.026428066194057465, "kl": 0.08730591088533401, "learning_rate": 3.6294187995320214e-06, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 291 }, { "completion_length": 113.75, "epoch": 0.039073999732369864, "grad_norm": 0.042514655739068985, "kl": 0.11761242151260376, "learning_rate": 3.6182828707890816e-06, "loss": 0.0047, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 292 }, { "completion_length": 156.875, "epoch": 0.03920781479994647, "grad_norm": 0.6949495673179626, "kl": 0.0679459422826767, "learning_rate": 3.607119134067629e-06, "loss": 0.0027, "reward": 2.5946249961853027, "reward_std": 0.7958486080169678, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 293 }, { "completion_length": 157.5, "epoch": 0.039341629867523084, "grad_norm": 0.6361478567123413, "kl": 0.06278131902217865, "learning_rate": 3.595927866972694e-06, "loss": 0.0025, "reward": 2.2663750648498535, "reward_std": 1.0992358922958374, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 294 }, { "completion_length": 146.75, "epoch": 0.03947544493509969, "grad_norm": 0.4439238905906677, "kl": 0.08064507693052292, "learning_rate": 3.5847093477938955e-06, "loss": 0.0032, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 295 }, { "completion_length": 110.5, "epoch": 0.039609260002676304, "grad_norm": 1.096075415611267, "kl": 0.09411236643791199, "learning_rate": 3.5734638554985234e-06, "loss": 0.0038, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 296 }, { "completion_length": 125.75, "epoch": 0.03974307507025291, "grad_norm": 0.672204852104187, "kl": 0.07630673795938492, "learning_rate": 3.5621916697245966e-06, "loss": 0.0031, "reward": 2.844749927520752, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34474998712539673, "step": 297 }, { "completion_length": 166.5, "epoch": 0.03987689013782952, "grad_norm": 0.7818819880485535, "kl": 0.11378637701272964, "learning_rate": 3.5508930707739143e-06, "loss": 0.0046, "reward": 1.875749945640564, "reward_std": 1.3826476335525513, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 298 }, { "completion_length": 82.375, "epoch": 0.04001070520540613, "grad_norm": 0.0297747403383255, "kl": 0.09141701459884644, "learning_rate": 3.5395683396050826e-06, "loss": 0.0037, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 299 }, { "completion_length": 160.625, "epoch": 0.04014452027298274, "grad_norm": 0.03765996918082237, "kl": 0.06939936429262161, "learning_rate": 3.5282177578265295e-06, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 300 }, { "completion_length": 193.375, "epoch": 0.04027833534055935, "grad_norm": 0.6114082932472229, "kl": 0.02720811776816845, "learning_rate": 3.516841607689501e-06, "loss": 0.0011, "reward": 0.937749981880188, "reward_std": 1.2123767137527466, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250249981880188, "step": 301 }, { "completion_length": 116.0, "epoch": 0.040412150408135956, "grad_norm": 0.04674949496984482, "kl": 0.0826873928308487, "learning_rate": 3.5054401720810437e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 302 }, { "completion_length": 200.0, "epoch": 0.04054596547571256, "grad_norm": 0.013670495711266994, "kl": 0.006112328730523586, "learning_rate": 3.4940137345169713e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 303 }, { "completion_length": 146.125, "epoch": 0.040679780543289176, "grad_norm": 0.02069338597357273, "kl": 0.0730745792388916, "learning_rate": 3.4825625791348093e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 304 }, { "completion_length": 140.0, "epoch": 0.04081359561086578, "grad_norm": 0.029404906556010246, "kl": 0.07228542864322662, "learning_rate": 3.471086990686737e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 305 }, { "completion_length": 183.125, "epoch": 0.040947410678442396, "grad_norm": 0.45872148871421814, "kl": 0.07083412259817123, "learning_rate": 3.4595872545325017e-06, "loss": 0.0028, "reward": 2.2038750648498535, "reward_std": 1.2449820041656494, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 306 }, { "completion_length": 192.25, "epoch": 0.041081225746019, "grad_norm": 0.8031287789344788, "kl": 0.025236405432224274, "learning_rate": 3.4480636566323215e-06, "loss": 0.001, "reward": 0.234499990940094, "reward_std": 0.26286986470222473, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 307 }, { "completion_length": 179.75, "epoch": 0.04121504081359561, "grad_norm": 0.5177518129348755, "kl": 0.05021588131785393, "learning_rate": 3.436516483539781e-06, "loss": 0.002, "reward": 1.547374963760376, "reward_std": 1.4209489822387695, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29737499356269836, "step": 308 }, { "completion_length": 74.5, "epoch": 0.04134885588117222, "grad_norm": 0.03511876240372658, "kl": 0.1506022810935974, "learning_rate": 3.4249460223946978e-06, "loss": 0.006, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 309 }, { "completion_length": 156.375, "epoch": 0.04148267094874883, "grad_norm": 0.01822076179087162, "kl": 0.06797613203525543, "learning_rate": 3.4133525609159883e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 310 }, { "completion_length": 174.5, "epoch": 0.041616486016325435, "grad_norm": 0.6905167698860168, "kl": 0.053532786667346954, "learning_rate": 3.4017363873945098e-06, "loss": 0.0021, "reward": 1.2821249961853027, "reward_std": 1.0173144340515137, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 311 }, { "completion_length": 192.0, "epoch": 0.04175030108390205, "grad_norm": 0.616634726524353, "kl": 0.030378200113773346, "learning_rate": 3.3900977906858923e-06, "loss": 0.0012, "reward": 0.922249972820282, "reward_std": 1.2323477268218994, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 312 }, { "completion_length": 151.5, "epoch": 0.041884116151478655, "grad_norm": 0.7508221864700317, "kl": 0.07898285239934921, "learning_rate": 3.3784370602033572e-06, "loss": 0.0032, "reward": 2.1882500648498535, "reward_std": 1.2734655141830444, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31324997544288635, "step": 313 }, { "completion_length": 161.375, "epoch": 0.04201793121905527, "grad_norm": 0.874470055103302, "kl": 0.10175217688083649, "learning_rate": 3.3667544859105186e-06, "loss": 0.0041, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 314 }, { "completion_length": 177.5, "epoch": 0.042151746286631875, "grad_norm": 0.5973278284072876, "kl": 0.06022553890943527, "learning_rate": 3.3550503583141726e-06, "loss": 0.0024, "reward": 2.2038750648498535, "reward_std": 1.2449820041656494, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 315 }, { "completion_length": 127.875, "epoch": 0.04228556135420848, "grad_norm": 0.8243934512138367, "kl": 0.09004293382167816, "learning_rate": 3.3433249684570757e-06, "loss": 0.0036, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 316 }, { "completion_length": 176.125, "epoch": 0.042419376421785095, "grad_norm": 0.8847985863685608, "kl": 0.04679393768310547, "learning_rate": 3.3315786079107053e-06, "loss": 0.0019, "reward": 1.6882500648498535, "reward_std": 1.3010985851287842, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 317 }, { "completion_length": 151.125, "epoch": 0.0425531914893617, "grad_norm": 0.03703625500202179, "kl": 0.08068917691707611, "learning_rate": 3.3198115687680115e-06, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 318 }, { "completion_length": 107.125, "epoch": 0.042687006556938314, "grad_norm": 0.03186451643705368, "kl": 0.12436903268098831, "learning_rate": 3.3080241436361505e-06, "loss": 0.005, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 319 }, { "completion_length": 149.875, "epoch": 0.04282082162451492, "grad_norm": 0.046915553510189056, "kl": 0.06779299676418304, "learning_rate": 3.2962166256292116e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 320 }, { "completion_length": 106.875, "epoch": 0.04295463669209153, "grad_norm": 0.6020116806030273, "kl": 0.11311566084623337, "learning_rate": 3.2843893083609267e-06, "loss": 0.0045, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 321 }, { "completion_length": 176.625, "epoch": 0.04308845175966814, "grad_norm": 0.7055139541625977, "kl": 0.03980642557144165, "learning_rate": 3.272542485937369e-06, "loss": 0.0016, "reward": 1.9539999961853027, "reward_std": 1.25544273853302, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32899999618530273, "step": 322 }, { "completion_length": 72.75, "epoch": 0.04322226682724475, "grad_norm": 0.04835575073957443, "kl": 0.1312403678894043, "learning_rate": 3.2606764529496413e-06, "loss": 0.0052, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 323 }, { "completion_length": 170.125, "epoch": 0.043356081894821354, "grad_norm": 0.6021599173545837, "kl": 0.04381553828716278, "learning_rate": 3.2487915044665485e-06, "loss": 0.0018, "reward": 1.8443750143051147, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 324 }, { "completion_length": 113.0, "epoch": 0.04348989696239797, "grad_norm": 0.08526396751403809, "kl": 0.1292935013771057, "learning_rate": 3.236887936027261e-06, "loss": 0.0052, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 325 }, { "completion_length": 107.0, "epoch": 0.04362371202997457, "grad_norm": 0.9376106262207031, "kl": 0.09921862930059433, "learning_rate": 3.224966043633966e-06, "loss": 0.004, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 326 }, { "completion_length": 186.75, "epoch": 0.04375752709755119, "grad_norm": 0.4538845419883728, "kl": 0.05095275491476059, "learning_rate": 3.213026123744506e-06, "loss": 0.002, "reward": 2.1726250648498535, "reward_std": 1.2645572423934937, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29762500524520874, "step": 327 }, { "completion_length": 175.875, "epoch": 0.04389134216512779, "grad_norm": 0.5928269624710083, "kl": 0.04448448121547699, "learning_rate": 3.201068473265007e-06, "loss": 0.0018, "reward": 2.1882500648498535, "reward_std": 1.2734655141830444, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 328 }, { "completion_length": 132.125, "epoch": 0.0440251572327044, "grad_norm": 0.5945946574211121, "kl": 0.08295081555843353, "learning_rate": 3.189093389542498e-06, "loss": 0.0033, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 329 }, { "completion_length": 150.75, "epoch": 0.04415897230028101, "grad_norm": 0.6357448697090149, "kl": 0.06506805866956711, "learning_rate": 3.1771011703575134e-06, "loss": 0.0026, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 330 }, { "completion_length": 195.75, "epoch": 0.04429278736785762, "grad_norm": 0.46064579486846924, "kl": 0.025753460824489594, "learning_rate": 3.165092113916688e-06, "loss": 0.001, "reward": 0.8026249408721924, "reward_std": 1.2828905582427979, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17762500047683716, "step": 331 }, { "completion_length": 189.0, "epoch": 0.04442660243543423, "grad_norm": 0.4432630240917206, "kl": 0.05971987172961235, "learning_rate": 3.1530665188453463e-06, "loss": 0.0024, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 332 }, { "completion_length": 189.625, "epoch": 0.04456041750301084, "grad_norm": 0.47911959886550903, "kl": 0.025781670585274696, "learning_rate": 3.1410246841800714e-06, "loss": 0.001, "reward": 1.1017498970031738, "reward_std": 1.3865821361541748, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16425000131130219, "step": 333 }, { "completion_length": 186.125, "epoch": 0.044694232570587446, "grad_norm": 0.5730496048927307, "kl": 0.04161279648542404, "learning_rate": 3.128966909361272e-06, "loss": 0.0017, "reward": 1.8443750143051147, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187501430511475, "step": 334 }, { "completion_length": 196.5, "epoch": 0.04482804763816406, "grad_norm": 0.6438575983047485, "kl": 0.02815292403101921, "learning_rate": 3.116893494225734e-06, "loss": 0.0011, "reward": 1.156499981880188, "reward_std": 1.3902884721755981, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21899999678134918, "step": 335 }, { "completion_length": 181.75, "epoch": 0.044961862705740666, "grad_norm": 0.7255552411079407, "kl": 0.038968879729509354, "learning_rate": 3.1048047389991693e-06, "loss": 0.0016, "reward": 1.1722500324249268, "reward_std": 1.4114717245101929, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 336 }, { "completion_length": 108.875, "epoch": 0.04509567777331728, "grad_norm": 1.06059992313385, "kl": 0.137029767036438, "learning_rate": 3.092700944288744e-06, "loss": 0.0055, "reward": 0.4440000057220459, "reward_std": 0.2897742986679077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3190000057220459, "step": 337 }, { "completion_length": 167.375, "epoch": 0.045229492840893885, "grad_norm": 0.39545005559921265, "kl": 0.07446862012147903, "learning_rate": 3.0805824110756066e-06, "loss": 0.003, "reward": 2.7829999923706055, "reward_std": 0.26304367184638977, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28299999237060547, "step": 338 }, { "completion_length": 130.0, "epoch": 0.04536330790847049, "grad_norm": 0.6566721200942993, "kl": 0.09755611419677734, "learning_rate": 3.0684494407074037e-06, "loss": 0.0039, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 339 }, { "completion_length": 155.0, "epoch": 0.045497122976047105, "grad_norm": 0.024315236136317253, "kl": 0.059822697192430496, "learning_rate": 3.056302334890786e-06, "loss": 0.0024, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 340 }, { "completion_length": 134.875, "epoch": 0.04563093804362371, "grad_norm": 0.5765925049781799, "kl": 0.0861860066652298, "learning_rate": 3.044141395683906e-06, "loss": 0.0034, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 341 }, { "completion_length": 72.25, "epoch": 0.04576475311120032, "grad_norm": 0.07990053296089172, "kl": 0.13637614250183105, "learning_rate": 3.0319669254889054e-06, "loss": 0.0055, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 342 }, { "completion_length": 119.75, "epoch": 0.04589856817877693, "grad_norm": 1.349136471748352, "kl": 0.10270408540964127, "learning_rate": 3.019779227044398e-06, "loss": 0.0041, "reward": 2.551499843597412, "reward_std": 0.7084000110626221, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.30149999260902405, "step": 343 }, { "completion_length": 125.75, "epoch": 0.04603238324635354, "grad_norm": 0.018481381237506866, "kl": 0.08274512737989426, "learning_rate": 3.0075786034179407e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 344 }, { "completion_length": 197.125, "epoch": 0.04616619831393015, "grad_norm": 1.1671421527862549, "kl": 0.01607135869562626, "learning_rate": 2.9953653579984945e-06, "loss": 0.0006, "reward": 0.156374990940094, "reward_std": 0.08874189853668213, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.156374990940094, "step": 345 }, { "completion_length": 175.0, "epoch": 0.04630001338150676, "grad_norm": 0.4250999987125397, "kl": 0.04828311875462532, "learning_rate": 2.9831397944888833e-06, "loss": 0.0019, "reward": 0.937749981880188, "reward_std": 1.1823102235794067, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250249981880188, "step": 346 }, { "completion_length": 107.25, "epoch": 0.046433828449083364, "grad_norm": 0.02923331782221794, "kl": 0.08198081701993942, "learning_rate": 2.9709022168982427e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 347 }, { "completion_length": 192.875, "epoch": 0.04656764351665998, "grad_norm": 0.5486526489257812, "kl": 0.011629603803157806, "learning_rate": 2.958652929534456e-06, "loss": 0.0005, "reward": 0.484499990940094, "reward_std": 0.9673013687133789, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 348 }, { "completion_length": 183.625, "epoch": 0.046701458584236584, "grad_norm": 0.5216718912124634, "kl": 0.05133568495512009, "learning_rate": 2.946392236996592e-06, "loss": 0.0021, "reward": 1.938249945640564, "reward_std": 1.3148058652877808, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 349 }, { "completion_length": 132.25, "epoch": 0.0468352736518132, "grad_norm": 1.3864895105361938, "kl": 0.09642195701599121, "learning_rate": 2.9341204441673267e-06, "loss": 0.0039, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 350 }, { "completion_length": 173.5, "epoch": 0.046969088719389804, "grad_norm": 0.585511326789856, "kl": 0.056327059864997864, "learning_rate": 2.921837856205362e-06, "loss": 0.0023, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 351 }, { "completion_length": 198.25, "epoch": 0.04710290378696641, "grad_norm": 0.5214681029319763, "kl": 0.029943913221359253, "learning_rate": 2.9095447785378446e-06, "loss": 0.0012, "reward": 0.640874981880188, "reward_std": 0.955705463886261, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.203374981880188, "step": 352 }, { "completion_length": 130.75, "epoch": 0.047236718854543024, "grad_norm": 0.4581518769264221, "kl": 0.18232449889183044, "learning_rate": 2.8972415168527583e-06, "loss": 0.0073, "reward": 2.8571248054504395, "reward_std": 0.05338657274842262, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3571249842643738, "step": 353 }, { "completion_length": 190.25, "epoch": 0.04737053392211963, "grad_norm": 0.5401055216789246, "kl": 0.04765722155570984, "learning_rate": 2.884928377091334e-06, "loss": 0.0019, "reward": 1.516124963760376, "reward_std": 1.4543431997299194, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26612499356269836, "step": 354 }, { "completion_length": 190.625, "epoch": 0.047504348989696236, "grad_norm": 0.4501241147518158, "kl": 0.05021478980779648, "learning_rate": 2.872605665440436e-06, "loss": 0.002, "reward": 1.8756250143051147, "reward_std": 1.3811874389648438, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31312498450279236, "step": 355 }, { "completion_length": 196.75, "epoch": 0.04763816405727285, "grad_norm": 0.5028612613677979, "kl": 0.013500398956239223, "learning_rate": 2.8602736883249504e-06, "loss": 0.0005, "reward": 0.484499990940094, "reward_std": 0.9673013687133789, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 356 }, { "completion_length": 144.25, "epoch": 0.047771979124849456, "grad_norm": 0.05043628439307213, "kl": 0.09568338841199875, "learning_rate": 2.847932752400164e-06, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 357 }, { "completion_length": 149.75, "epoch": 0.04790579419242607, "grad_norm": 0.5144177675247192, "kl": 0.08929523825645447, "learning_rate": 2.835583164544139e-06, "loss": 0.0036, "reward": 2.610374927520752, "reward_std": 0.7021570801734924, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 358 }, { "completion_length": 146.25, "epoch": 0.048039609260002676, "grad_norm": 0.8806883692741394, "kl": 0.09334827959537506, "learning_rate": 2.8232252318500836e-06, "loss": 0.0037, "reward": 2.098374843597412, "reward_std": 1.0753893852233887, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34837496280670166, "step": 359 }, { "completion_length": 197.625, "epoch": 0.04817342432757928, "grad_norm": 0.635985255241394, "kl": 0.024005752056837082, "learning_rate": 2.8108592616187135e-06, "loss": 0.001, "reward": 0.500124990940094, "reward_std": 0.9616578221321106, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187624990940094, "step": 360 }, { "completion_length": 135.875, "epoch": 0.048307239395155896, "grad_norm": 0.7149239182472229, "kl": 0.0762256383895874, "learning_rate": 2.7984855613506107e-06, "loss": 0.003, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 361 }, { "completion_length": 185.25, "epoch": 0.0484410544627325, "grad_norm": 0.7005431056022644, "kl": 0.055451616644859314, "learning_rate": 2.78610443873858e-06, "loss": 0.0022, "reward": 1.8443750143051147, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187501430511475, "step": 362 }, { "completion_length": 184.125, "epoch": 0.048574869530309116, "grad_norm": 0.4957987368106842, "kl": 0.04241981357336044, "learning_rate": 2.773716201659993e-06, "loss": 0.0017, "reward": 1.500499963760376, "reward_std": 1.4704713821411133, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25049999356269836, "step": 363 }, { "completion_length": 92.75, "epoch": 0.04870868459788572, "grad_norm": 0.5573947429656982, "kl": 0.15773630142211914, "learning_rate": 2.761321158169134e-06, "loss": 0.0063, "reward": 2.8561248779296875, "reward_std": 0.05621498450636864, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35612499713897705, "step": 364 }, { "completion_length": 131.75, "epoch": 0.04884249966546233, "grad_norm": 0.7171053290367126, "kl": 0.060316819697618484, "learning_rate": 2.748919616489542e-06, "loss": 0.0024, "reward": 1.875999927520752, "reward_std": 1.0690449476242065, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 365 }, { "completion_length": 130.875, "epoch": 0.04897631473303894, "grad_norm": 0.042846277356147766, "kl": 0.09420768171548843, "learning_rate": 2.736511885006343e-06, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 366 }, { "completion_length": 137.75, "epoch": 0.04911012980061555, "grad_norm": 0.034398604184389114, "kl": 0.08529643714427948, "learning_rate": 2.724098272258584e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 367 }, { "completion_length": 75.25, "epoch": 0.04924394486819216, "grad_norm": 0.09623858332633972, "kl": 0.14050956070423126, "learning_rate": 2.7116790869315583e-06, "loss": 0.0056, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 368 }, { "completion_length": 114.25, "epoch": 0.04937775993576877, "grad_norm": 0.6930121183395386, "kl": 0.11655654013156891, "learning_rate": 2.6992546378491317e-06, "loss": 0.0047, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 369 }, { "completion_length": 109.0, "epoch": 0.049511575003345375, "grad_norm": 0.945068895816803, "kl": 0.11966826021671295, "learning_rate": 2.686825233966061e-06, "loss": 0.0048, "reward": 2.844749927520752, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34474998712539673, "step": 370 }, { "completion_length": 151.875, "epoch": 0.04964539007092199, "grad_norm": 0.4595087766647339, "kl": 0.0745183527469635, "learning_rate": 2.6743911843603134e-06, "loss": 0.003, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 371 }, { "completion_length": 153.25, "epoch": 0.049779205138498595, "grad_norm": 0.06826778501272202, "kl": 0.08434748649597168, "learning_rate": 2.6619527982253796e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 372 }, { "completion_length": 167.25, "epoch": 0.0499130202060752, "grad_norm": 0.025867884978652, "kl": 0.07445389032363892, "learning_rate": 2.649510384862586e-06, "loss": 0.003, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 373 }, { "completion_length": 200.0, "epoch": 0.050046835273651814, "grad_norm": 0.5973170399665833, "kl": 0.03745676577091217, "learning_rate": 2.6370642536734005e-06, "loss": 0.0015, "reward": 0.234499990940094, "reward_std": 0.2162022739648819, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 374 }, { "completion_length": 200.0, "epoch": 0.05018065034122842, "grad_norm": 0.01913980394601822, "kl": 0.009219279512763023, "learning_rate": 2.624614714151743e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 375 }, { "completion_length": 198.25, "epoch": 0.050314465408805034, "grad_norm": 0.5759661793708801, "kl": 0.025891784578561783, "learning_rate": 2.6121620758762877e-06, "loss": 0.001, "reward": 0.296999990940094, "reward_std": 0.3202574849128723, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 376 }, { "completion_length": 116.0, "epoch": 0.05044828047638164, "grad_norm": 0.07995796948671341, "kl": 0.11114312708377838, "learning_rate": 2.5997066485027626e-06, "loss": 0.0044, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 377 }, { "completion_length": 132.0, "epoch": 0.05058209554395825, "grad_norm": 0.7011977434158325, "kl": 0.07148827612400055, "learning_rate": 2.587248741756253e-06, "loss": 0.0029, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 378 }, { "completion_length": 127.25, "epoch": 0.05071591061153486, "grad_norm": 0.05236940458416939, "kl": 0.06597031652927399, "learning_rate": 2.5747886654234967e-06, "loss": 0.0026, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 379 }, { "completion_length": 115.0, "epoch": 0.05084972567911147, "grad_norm": 0.02161761187016964, "kl": 0.09634871780872345, "learning_rate": 2.5623267293451827e-06, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 380 }, { "completion_length": 120.0, "epoch": 0.05098354074668808, "grad_norm": 0.03281255438923836, "kl": 0.08390858769416809, "learning_rate": 2.5498632434082454e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 381 }, { "completion_length": 134.625, "epoch": 0.05111735581426469, "grad_norm": 0.048469725996255875, "kl": 0.09256488084793091, "learning_rate": 2.5373985175381595e-06, "loss": 0.0037, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 382 }, { "completion_length": 192.375, "epoch": 0.05125117088184129, "grad_norm": 0.5812839865684509, "kl": 0.047234926372766495, "learning_rate": 2.5249328616912317e-06, "loss": 0.0019, "reward": 1.2347500324249268, "reward_std": 1.3754099607467651, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 383 }, { "completion_length": 151.0, "epoch": 0.051384985949417906, "grad_norm": 0.016363635659217834, "kl": 0.06750474870204926, "learning_rate": 2.5124665858468956e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 384 }, { "completion_length": 175.5, "epoch": 0.05151880101699451, "grad_norm": 0.016084877774119377, "kl": 0.07160551100969315, "learning_rate": 2.5e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 385 }, { "completion_length": 158.5, "epoch": 0.05165261608457112, "grad_norm": 0.4706639051437378, "kl": 0.06855463981628418, "learning_rate": 2.4875334141531052e-06, "loss": 0.0027, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 386 }, { "completion_length": 167.125, "epoch": 0.05178643115214773, "grad_norm": 0.61845862865448, "kl": 0.070584237575531, "learning_rate": 2.475067138308769e-06, "loss": 0.0028, "reward": 1.9436249732971191, "reward_std": 1.3075156211853027, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31862497329711914, "step": 387 }, { "completion_length": 112.0, "epoch": 0.05192024621972434, "grad_norm": 0.02084559202194214, "kl": 0.08049716055393219, "learning_rate": 2.4626014824618418e-06, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 388 }, { "completion_length": 139.25, "epoch": 0.05205406128730095, "grad_norm": 0.6785424947738647, "kl": 0.07016098499298096, "learning_rate": 2.4501367565917554e-06, "loss": 0.0028, "reward": 0.8602499961853027, "reward_std": 0.04454774037003517, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 389 }, { "completion_length": 126.125, "epoch": 0.05218787635487756, "grad_norm": 0.05194733291864395, "kl": 0.07832348346710205, "learning_rate": 2.4376732706548185e-06, "loss": 0.0031, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 390 }, { "completion_length": 135.125, "epoch": 0.052321691422454165, "grad_norm": 0.023008067160844803, "kl": 0.07224376499652863, "learning_rate": 2.4252113345765045e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 391 }, { "completion_length": 144.375, "epoch": 0.05245550649003078, "grad_norm": 0.03731316700577736, "kl": 0.0671522319316864, "learning_rate": 2.4127512582437486e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 392 }, { "completion_length": 151.625, "epoch": 0.052589321557607385, "grad_norm": 0.06965499371290207, "kl": 0.10002456605434418, "learning_rate": 2.4002933514972387e-06, "loss": 0.004, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 393 }, { "completion_length": 195.75, "epoch": 0.052723136625184, "grad_norm": 0.6095194816589355, "kl": 0.027808405458927155, "learning_rate": 2.3878379241237136e-06, "loss": 0.0011, "reward": 1.1566250324249268, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912498772144318, "step": 394 }, { "completion_length": 136.5, "epoch": 0.052856951692760605, "grad_norm": 0.02448575384914875, "kl": 0.0723317563533783, "learning_rate": 2.375385285848257e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 395 }, { "completion_length": 118.875, "epoch": 0.05299076676033721, "grad_norm": 0.018849877640604973, "kl": 0.08972451090812683, "learning_rate": 2.3629357463266e-06, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 396 }, { "completion_length": 156.625, "epoch": 0.053124581827913825, "grad_norm": 0.715349018573761, "kl": 0.07580310851335526, "learning_rate": 2.3504896151374145e-06, "loss": 0.003, "reward": 1.625999927520752, "reward_std": 1.3363062143325806, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 397 }, { "completion_length": 186.25, "epoch": 0.05325839689549043, "grad_norm": 0.464533269405365, "kl": 0.04519539326429367, "learning_rate": 2.3380472017746204e-06, "loss": 0.0018, "reward": 1.828624963760376, "reward_std": 1.4113785028457642, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26612499356269836, "step": 398 }, { "completion_length": 171.0, "epoch": 0.053392211963067045, "grad_norm": 0.014431565999984741, "kl": 0.06554807722568512, "learning_rate": 2.325608815639687e-06, "loss": 0.0026, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 399 }, { "completion_length": 169.625, "epoch": 0.05352602703064365, "grad_norm": 0.7028098702430725, "kl": 0.05088828131556511, "learning_rate": 2.3131747660339396e-06, "loss": 0.002, "reward": 1.7820000648498535, "reward_std": 1.185692310333252, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34450000524520874, "step": 400 }, { "completion_length": 155.125, "epoch": 0.05365984209822026, "grad_norm": 0.04986615478992462, "kl": 0.07950672507286072, "learning_rate": 2.300745362150869e-06, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 401 }, { "completion_length": 131.375, "epoch": 0.05379365716579687, "grad_norm": 0.020077943801879883, "kl": 0.0838509202003479, "learning_rate": 2.288320913068442e-06, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 402 }, { "completion_length": 196.625, "epoch": 0.05392747223337348, "grad_norm": 0.5647266507148743, "kl": 0.022733887657523155, "learning_rate": 2.2759017277414165e-06, "loss": 0.0009, "reward": 0.578374981880188, "reward_std": 0.9638639688491821, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.203374981880188, "step": 403 }, { "completion_length": 178.5, "epoch": 0.054061287300950084, "grad_norm": 0.7138876914978027, "kl": 0.04037192836403847, "learning_rate": 2.2634881149936576e-06, "loss": 0.0016, "reward": 0.85999995470047, "reward_std": 0.8878408670425415, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29750001430511475, "step": 404 }, { "completion_length": 138.375, "epoch": 0.0541951023685267, "grad_norm": 0.6774787306785583, "kl": 0.08483107388019562, "learning_rate": 2.251080383510459e-06, "loss": 0.0034, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 405 }, { "completion_length": 113.625, "epoch": 0.054328917436103304, "grad_norm": 0.021111685782670975, "kl": 0.09744316339492798, "learning_rate": 2.238678841830867e-06, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 406 }, { "completion_length": 152.5, "epoch": 0.05446273250367992, "grad_norm": 0.009523588232696056, "kl": 0.052605949342250824, "learning_rate": 2.226283798340008e-06, "loss": 0.0021, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 407 }, { "completion_length": 132.5, "epoch": 0.05459654757125652, "grad_norm": 0.6554896831512451, "kl": 0.0891679972410202, "learning_rate": 2.2138955612614206e-06, "loss": 0.0036, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 408 }, { "completion_length": 152.5, "epoch": 0.05473036263883313, "grad_norm": 0.5683344006538391, "kl": 0.07335890829563141, "learning_rate": 2.2015144386493898e-06, "loss": 0.0029, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 409 }, { "completion_length": 177.75, "epoch": 0.05486417770640974, "grad_norm": 0.9421977400779724, "kl": 0.028944583609700203, "learning_rate": 2.1891407383812878e-06, "loss": 0.0012, "reward": 0.406624972820282, "reward_std": 0.3886794149875641, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912498772144318, "step": 410 }, { "completion_length": 132.625, "epoch": 0.05499799277398635, "grad_norm": 0.8083810806274414, "kl": 0.06083378195762634, "learning_rate": 2.1767747681499176e-06, "loss": 0.0024, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 411 }, { "completion_length": 177.625, "epoch": 0.05513180784156296, "grad_norm": 0.5404098033905029, "kl": 0.031011439859867096, "learning_rate": 2.1644168354558623e-06, "loss": 0.0012, "reward": 1.531749963760376, "reward_std": 1.4378401041030884, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28174999356269836, "step": 412 }, { "completion_length": 157.375, "epoch": 0.05526562290913957, "grad_norm": 0.4953736960887909, "kl": 0.06132303178310394, "learning_rate": 2.1520672475998374e-06, "loss": 0.0025, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 413 }, { "completion_length": 139.375, "epoch": 0.055399437976716176, "grad_norm": 0.037774477154016495, "kl": 0.10745426267385483, "learning_rate": 2.1397263116750504e-06, "loss": 0.0043, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 414 }, { "completion_length": 84.875, "epoch": 0.05553325304429279, "grad_norm": 0.994339108467102, "kl": 0.13498741388320923, "learning_rate": 2.1273943345595637e-06, "loss": 0.0054, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 415 }, { "completion_length": 137.75, "epoch": 0.055667068111869396, "grad_norm": 0.5073060393333435, "kl": 0.08192146569490433, "learning_rate": 2.115071622908666e-06, "loss": 0.0033, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 416 }, { "completion_length": 117.375, "epoch": 0.055800883179446, "grad_norm": 0.027711402624845505, "kl": 0.0911334902048111, "learning_rate": 2.102758483147242e-06, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 417 }, { "completion_length": 149.0, "epoch": 0.055934698247022616, "grad_norm": 0.015585114248096943, "kl": 0.06714257597923279, "learning_rate": 2.090455221462156e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 418 }, { "completion_length": 178.625, "epoch": 0.05606851331459922, "grad_norm": 0.7107576727867126, "kl": 0.05091589689254761, "learning_rate": 2.078162143794638e-06, "loss": 0.002, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 419 }, { "completion_length": 138.875, "epoch": 0.056202328382175835, "grad_norm": 0.7282541394233704, "kl": 0.06512701511383057, "learning_rate": 2.0658795558326745e-06, "loss": 0.0026, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 420 }, { "completion_length": 181.0, "epoch": 0.05633614344975244, "grad_norm": 0.48365283012390137, "kl": 0.043255969882011414, "learning_rate": 2.053607763003409e-06, "loss": 0.0017, "reward": 1.8756250143051147, "reward_std": 1.3811874389648438, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31312498450279236, "step": 421 }, { "completion_length": 123.0, "epoch": 0.05646995851732905, "grad_norm": 0.025365835055708885, "kl": 0.07185974717140198, "learning_rate": 2.0413470704655442e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 422 }, { "completion_length": 197.625, "epoch": 0.05660377358490566, "grad_norm": 0.7775915861129761, "kl": 0.012414833530783653, "learning_rate": 2.029097783101758e-06, "loss": 0.0005, "reward": 0.468874990940094, "reward_std": 0.9726253747940063, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.156374990940094, "step": 423 }, { "completion_length": 136.125, "epoch": 0.05673758865248227, "grad_norm": 0.019394518807530403, "kl": 0.06297829747200012, "learning_rate": 2.0168602055111175e-06, "loss": 0.0025, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 424 }, { "completion_length": 198.25, "epoch": 0.05687140372005888, "grad_norm": 0.6754842400550842, "kl": 0.021924972534179688, "learning_rate": 2.004634642001507e-06, "loss": 0.0009, "reward": 1.2503750324249268, "reward_std": 1.3616429567337036, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250374972820282, "step": 425 }, { "completion_length": 198.25, "epoch": 0.05700521878763549, "grad_norm": 0.6574854254722595, "kl": 0.04047918692231178, "learning_rate": 1.99242139658206e-06, "loss": 0.0016, "reward": 1.8600000143051147, "reward_std": 1.4027438163757324, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29750001430511475, "step": 426 }, { "completion_length": 143.875, "epoch": 0.057139033855212094, "grad_norm": 0.5605131387710571, "kl": 0.06166316568851471, "learning_rate": 1.9802207729556023e-06, "loss": 0.0025, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 427 }, { "completion_length": 120.25, "epoch": 0.05727284892278871, "grad_norm": 0.016515597701072693, "kl": 0.12526576220989227, "learning_rate": 1.9680330745110954e-06, "loss": 0.005, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 428 }, { "completion_length": 144.375, "epoch": 0.057406663990365314, "grad_norm": 0.574342668056488, "kl": 0.062450721859931946, "learning_rate": 1.9558586043160944e-06, "loss": 0.0025, "reward": 0.8123749494552612, "reward_std": 0.17995865643024445, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.374875009059906, "step": 429 }, { "completion_length": 178.0, "epoch": 0.05754047905794193, "grad_norm": 1.617727279663086, "kl": 0.043791525065898895, "learning_rate": 1.9436976651092143e-06, "loss": 0.0018, "reward": 1.9382500648498535, "reward_std": 1.3148058652877808, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 430 }, { "completion_length": 186.125, "epoch": 0.057674294125518534, "grad_norm": 0.47106054425239563, "kl": 0.05467758700251579, "learning_rate": 1.9315505592925967e-06, "loss": 0.0022, "reward": 1.1724998950958252, "reward_std": 1.0980879068374634, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29749998450279236, "step": 431 }, { "completion_length": 126.5, "epoch": 0.05780810919309514, "grad_norm": 0.012111375108361244, "kl": 0.08726546913385391, "learning_rate": 1.9194175889243942e-06, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 432 }, { "completion_length": 199.0, "epoch": 0.057941924260671754, "grad_norm": 0.4242580831050873, "kl": 0.04138268530368805, "learning_rate": 1.9072990557112567e-06, "loss": 0.0017, "reward": 0.828374981880188, "reward_std": 1.2645572423934937, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20337499678134918, "step": 433 }, { "completion_length": 200.0, "epoch": 0.05807573932824836, "grad_norm": 0.018054109066724777, "kl": 0.009824749082326889, "learning_rate": 1.895195261000831e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 434 }, { "completion_length": 177.0, "epoch": 0.05820955439582497, "grad_norm": 0.49160170555114746, "kl": 0.04283319413661957, "learning_rate": 1.8831065057742658e-06, "loss": 0.0017, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 435 }, { "completion_length": 107.25, "epoch": 0.05834336946340158, "grad_norm": 0.021617529913783073, "kl": 0.11269970238208771, "learning_rate": 1.8710330906387288e-06, "loss": 0.0045, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 436 }, { "completion_length": 186.0, "epoch": 0.058477184530978187, "grad_norm": 0.6204230785369873, "kl": 0.045380815863609314, "learning_rate": 1.8589753158199292e-06, "loss": 0.0018, "reward": 0.59437495470047, "reward_std": 0.3886794149875641, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187501430511475, "step": 437 }, { "completion_length": 113.5, "epoch": 0.0586109995985548, "grad_norm": 1.8613377809524536, "kl": 0.6283275485038757, "learning_rate": 1.8469334811546543e-06, "loss": 0.0251, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 438 }, { "completion_length": 115.75, "epoch": 0.058744814666131406, "grad_norm": 0.047285065054893494, "kl": 0.10523299872875214, "learning_rate": 1.8349078860833125e-06, "loss": 0.0042, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 439 }, { "completion_length": 130.875, "epoch": 0.05887862973370801, "grad_norm": 0.5824537873268127, "kl": 0.08082161843776703, "learning_rate": 1.8228988296424877e-06, "loss": 0.0032, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 440 }, { "completion_length": 183.5, "epoch": 0.059012444801284626, "grad_norm": 0.5198855996131897, "kl": 0.05057942122220993, "learning_rate": 1.8109066104575023e-06, "loss": 0.002, "reward": 2.2038750648498535, "reward_std": 1.2449820041656494, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887497544288635, "step": 441 }, { "completion_length": 176.625, "epoch": 0.05914625986886123, "grad_norm": 0.5852713584899902, "kl": 0.054400622844696045, "learning_rate": 1.7989315267349936e-06, "loss": 0.0022, "reward": 2.2977499961853027, "reward_std": 1.0837033987045288, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 442 }, { "completion_length": 152.125, "epoch": 0.059280074936437846, "grad_norm": 1.7595407962799072, "kl": 0.4095238447189331, "learning_rate": 1.786973876255495e-06, "loss": 0.0164, "reward": 2.5320000648498535, "reward_std": 0.9231207370758057, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34450000524520874, "step": 443 }, { "completion_length": 136.375, "epoch": 0.05941389000401445, "grad_norm": 0.9685487747192383, "kl": 0.07697839289903641, "learning_rate": 1.7750339563660346e-06, "loss": 0.0031, "reward": 2.1102499961853027, "reward_std": 1.0575505495071411, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 444 }, { "completion_length": 140.0, "epoch": 0.05954770507159106, "grad_norm": 0.7236015796661377, "kl": 0.08824901282787323, "learning_rate": 1.7631120639727396e-06, "loss": 0.0035, "reward": 1.375999927520752, "reward_std": 1.2535663843154907, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 445 }, { "completion_length": 108.375, "epoch": 0.05968152013916767, "grad_norm": 0.027989843860268593, "kl": 0.10036399960517883, "learning_rate": 1.751208495533452e-06, "loss": 0.004, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 446 }, { "completion_length": 183.875, "epoch": 0.05981533520674428, "grad_norm": 0.5824564695358276, "kl": 0.04695302993059158, "learning_rate": 1.7393235470503595e-06, "loss": 0.0019, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 447 }, { "completion_length": 131.125, "epoch": 0.059949150274320885, "grad_norm": 0.5594069957733154, "kl": 0.08685468137264252, "learning_rate": 1.7274575140626318e-06, "loss": 0.0035, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 448 }, { "completion_length": 179.125, "epoch": 0.0600829653418975, "grad_norm": 0.0334378145635128, "kl": 0.06836998462677002, "learning_rate": 1.7156106916390744e-06, "loss": 0.0027, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 449 }, { "completion_length": 199.75, "epoch": 0.060216780409474105, "grad_norm": 0.5849582552909851, "kl": 0.006071791052818298, "learning_rate": 1.7037833743707892e-06, "loss": 0.0002, "reward": 0.203249990940094, "reward_std": 0.22132441401481628, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.140749990940094, "step": 450 }, { "completion_length": 169.375, "epoch": 0.06035059547705072, "grad_norm": 0.01197835523635149, "kl": 0.06328416615724564, "learning_rate": 1.6919758563638506e-06, "loss": 0.0025, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 451 }, { "completion_length": 126.625, "epoch": 0.060484410544627325, "grad_norm": 0.03237679600715637, "kl": 0.07159559428691864, "learning_rate": 1.6801884312319893e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 452 }, { "completion_length": 181.375, "epoch": 0.06061822561220393, "grad_norm": 0.4030317962169647, "kl": 0.05052907019853592, "learning_rate": 1.6684213920892956e-06, "loss": 0.002, "reward": 1.8443750143051147, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 453 }, { "completion_length": 189.0, "epoch": 0.060752040679780545, "grad_norm": 0.45015203952789307, "kl": 0.0394001342356205, "learning_rate": 1.6566750315429254e-06, "loss": 0.0016, "reward": 1.1566250324249268, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 454 }, { "completion_length": 128.75, "epoch": 0.06088585574735715, "grad_norm": 0.7070989012718201, "kl": 0.06388413161039352, "learning_rate": 1.6449496416858285e-06, "loss": 0.0026, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 455 }, { "completion_length": 150.0, "epoch": 0.061019670814933764, "grad_norm": 0.41116276383399963, "kl": 0.10549188405275345, "learning_rate": 1.633245514089482e-06, "loss": 0.0042, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 456 }, { "completion_length": 185.375, "epoch": 0.06115348588251037, "grad_norm": 0.5525246262550354, "kl": 0.050936780869960785, "learning_rate": 1.6215629397966432e-06, "loss": 0.002, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 457 }, { "completion_length": 167.5, "epoch": 0.06128730095008698, "grad_norm": 0.6805121302604675, "kl": 0.05624924972653389, "learning_rate": 1.609902209314108e-06, "loss": 0.0023, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 458 }, { "completion_length": 173.875, "epoch": 0.06142111601766359, "grad_norm": 0.5799795389175415, "kl": 0.04773347079753876, "learning_rate": 1.5982636126054909e-06, "loss": 0.0019, "reward": 0.719499945640564, "reward_std": 0.2569919228553772, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28200000524520874, "step": 459 }, { "completion_length": 178.375, "epoch": 0.0615549310852402, "grad_norm": 0.6381800174713135, "kl": 0.057099997997283936, "learning_rate": 1.5866474390840126e-06, "loss": 0.0023, "reward": 1.6882500648498535, "reward_std": 1.3010985851287842, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 460 }, { "completion_length": 140.375, "epoch": 0.06168874615281681, "grad_norm": 0.4703540503978729, "kl": 0.08957012742757797, "learning_rate": 1.575053977605303e-06, "loss": 0.0036, "reward": 2.5321249961853027, "reward_std": 0.9231545329093933, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 461 }, { "completion_length": 195.0, "epoch": 0.06182256122039342, "grad_norm": 0.5757173299789429, "kl": 0.030964083969593048, "learning_rate": 1.56348351646022e-06, "loss": 0.0012, "reward": 0.922249972820282, "reward_std": 1.232347846031189, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 462 }, { "completion_length": 161.625, "epoch": 0.06195637628797002, "grad_norm": 0.5668472051620483, "kl": 0.05526159703731537, "learning_rate": 1.5519363433676794e-06, "loss": 0.0022, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 463 }, { "completion_length": 169.25, "epoch": 0.06209019135554664, "grad_norm": 0.07676387578248978, "kl": 0.07294314354658127, "learning_rate": 1.5404127454674994e-06, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 464 }, { "completion_length": 187.75, "epoch": 0.06222400642312324, "grad_norm": 0.5265212655067444, "kl": 0.04017285630106926, "learning_rate": 1.5289130093132634e-06, "loss": 0.0016, "reward": 1.8443750143051147, "reward_std": 1.42377769947052, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 465 }, { "completion_length": 148.5, "epoch": 0.06235782149069985, "grad_norm": 0.061476338654756546, "kl": 0.10678229480981827, "learning_rate": 1.5174374208651913e-06, "loss": 0.0043, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 466 }, { "completion_length": 175.0, "epoch": 0.06249163655827646, "grad_norm": 0.5222886800765991, "kl": 0.05900602787733078, "learning_rate": 1.5059862654830298e-06, "loss": 0.0024, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 467 }, { "completion_length": 134.125, "epoch": 0.06262545162585308, "grad_norm": 0.5920611619949341, "kl": 0.06972117722034454, "learning_rate": 1.4945598279189565e-06, "loss": 0.0028, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 468 }, { "completion_length": 104.0, "epoch": 0.06275926669342968, "grad_norm": 0.0188764501363039, "kl": 0.10782687366008759, "learning_rate": 1.4831583923105e-06, "loss": 0.0043, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 469 }, { "completion_length": 190.625, "epoch": 0.06289308176100629, "grad_norm": 0.6377516388893127, "kl": 0.04221629351377487, "learning_rate": 1.4717822421734717e-06, "loss": 0.0017, "reward": 1.0943748950958252, "reward_std": 1.1118839979171753, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28187498450279236, "step": 470 }, { "completion_length": 163.375, "epoch": 0.0630268968285829, "grad_norm": 0.0870915949344635, "kl": 0.07754913717508316, "learning_rate": 1.4604316603949186e-06, "loss": 0.0031, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 471 }, { "completion_length": 142.875, "epoch": 0.0631607118961595, "grad_norm": 0.04023164510726929, "kl": 0.08847596496343613, "learning_rate": 1.4491069292260867e-06, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 472 }, { "completion_length": 200.0, "epoch": 0.06329452696373612, "grad_norm": 0.5020349621772766, "kl": 0.014002653770148754, "learning_rate": 1.4378083302754043e-06, "loss": 0.0006, "reward": 0.484499990940094, "reward_std": 0.9673013687133789, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.171999990940094, "step": 473 }, { "completion_length": 198.5, "epoch": 0.06342834203131273, "grad_norm": 0.49262621998786926, "kl": 0.02544192038476467, "learning_rate": 1.426536144501477e-06, "loss": 0.001, "reward": 0.937874972820282, "reward_std": 1.2215403318405151, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25037500262260437, "step": 474 }, { "completion_length": 185.125, "epoch": 0.06356215709888934, "grad_norm": 0.588193416595459, "kl": 0.03286545351147652, "learning_rate": 1.415290652206105e-06, "loss": 0.0013, "reward": 0.312749981880188, "reward_std": 0.34764543175697327, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187749981880188, "step": 475 }, { "completion_length": 200.0, "epoch": 0.06369597216646594, "grad_norm": 0.41891810297966003, "kl": 0.016301969066262245, "learning_rate": 1.4040721330273063e-06, "loss": 0.0007, "reward": 0.468874990940094, "reward_std": 0.9726253747940063, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.156374990940094, "step": 476 }, { "completion_length": 96.375, "epoch": 0.06382978723404255, "grad_norm": 0.022894730791449547, "kl": 0.08891279995441437, "learning_rate": 1.3928808659323717e-06, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 477 }, { "completion_length": 165.125, "epoch": 0.06396360230161917, "grad_norm": 0.5448727011680603, "kl": 0.06901370733976364, "learning_rate": 1.3817171292109182e-06, "loss": 0.0028, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 478 }, { "completion_length": 116.5, "epoch": 0.06409741736919577, "grad_norm": 0.5375649333000183, "kl": 0.17788663506507874, "learning_rate": 1.3705812004679796e-06, "loss": 0.0071, "reward": 0.8416249752044678, "reward_std": 0.06394626945257187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34162500500679016, "step": 479 }, { "completion_length": 131.0, "epoch": 0.06423123243677238, "grad_norm": 0.9283366799354553, "kl": 0.09214182943105698, "learning_rate": 1.3594733566170925e-06, "loss": 0.0037, "reward": 2.2664999961853027, "reward_std": 1.1526027917861938, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32899999618530273, "step": 480 }, { "completion_length": 150.875, "epoch": 0.06436504750434899, "grad_norm": 0.7640847563743591, "kl": 0.11605776846408844, "learning_rate": 1.3483938738734197e-06, "loss": 0.0046, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 481 }, { "completion_length": 159.5, "epoch": 0.0644988625719256, "grad_norm": 0.5165765285491943, "kl": 0.07449682056903839, "learning_rate": 1.337343027746874e-06, "loss": 0.003, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 482 }, { "completion_length": 132.75, "epoch": 0.06463267763950221, "grad_norm": 0.022092213854193687, "kl": 0.0806502029299736, "learning_rate": 1.3263210930352737e-06, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 483 }, { "completion_length": 200.0, "epoch": 0.06476649270707882, "grad_norm": 0.4168114960193634, "kl": 0.01730101928114891, "learning_rate": 1.3153283438175036e-06, "loss": 0.0007, "reward": 0.78125, "reward_std": 1.2151389122009277, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15625, "step": 484 }, { "completion_length": 175.5, "epoch": 0.06490030777465543, "grad_norm": 0.5690428614616394, "kl": 0.06238940730690956, "learning_rate": 1.3043650534467054e-06, "loss": 0.0025, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 485 }, { "completion_length": 157.625, "epoch": 0.06503412284223203, "grad_norm": 0.8219457268714905, "kl": 0.06742949783802032, "learning_rate": 1.2934314945434734e-06, "loss": 0.0027, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 486 }, { "completion_length": 166.375, "epoch": 0.06516793790980864, "grad_norm": 0.5514267086982727, "kl": 0.07967308908700943, "learning_rate": 1.2825279389890818e-06, "loss": 0.0032, "reward": 2.8445000648498535, "reward_std": 0.058326635509729385, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34450000524520874, "step": 487 }, { "completion_length": 152.0, "epoch": 0.06530175297738525, "grad_norm": 0.02159760519862175, "kl": 0.06032474339008331, "learning_rate": 1.271654657918722e-06, "loss": 0.0024, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 488 }, { "completion_length": 116.25, "epoch": 0.06543556804496187, "grad_norm": 0.6609854102134705, "kl": 0.1305571347475052, "learning_rate": 1.260811921714756e-06, "loss": 0.0052, "reward": 2.8564999103546143, "reward_std": 0.055154331028461456, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3564999997615814, "step": 489 }, { "completion_length": 200.0, "epoch": 0.06556938311253847, "grad_norm": 0.008528229780495167, "kl": 0.0020565292797982693, "learning_rate": 1.2500000000000007e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 190.125, "epoch": 0.06570319818011508, "grad_norm": 0.4148860275745392, "kl": 0.030032819136977196, "learning_rate": 1.2392191616310149e-06, "loss": 0.0012, "reward": 1.500499963760376, "reward_std": 1.4382281303405762, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25049999356269836, "step": 491 }, { "completion_length": 126.875, "epoch": 0.06583701324769169, "grad_norm": 0.07466115057468414, "kl": 0.10359154641628265, "learning_rate": 1.2284696746914216e-06, "loss": 0.0041, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 492 }, { "completion_length": 84.875, "epoch": 0.06597082831526829, "grad_norm": 0.01725056581199169, "kl": 0.08937688171863556, "learning_rate": 1.217751806485235e-06, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 493 }, { "completion_length": 173.25, "epoch": 0.06610464338284491, "grad_norm": 0.4079590439796448, "kl": 0.07844868302345276, "learning_rate": 1.2070658235302181e-06, "loss": 0.0031, "reward": 2.828624963760376, "reward_std": 0.13399668037891388, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32862499356269836, "step": 494 }, { "completion_length": 183.0, "epoch": 0.06623845845042152, "grad_norm": 0.5026001334190369, "kl": 0.03279690816998482, "learning_rate": 1.196411991551255e-06, "loss": 0.0013, "reward": 1.547374963760376, "reward_std": 1.4209489822387695, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29737499356269836, "step": 495 }, { "completion_length": 138.0, "epoch": 0.06637227351799813, "grad_norm": 1.609360933303833, "kl": 0.10265137255191803, "learning_rate": 1.185790575473738e-06, "loss": 0.0041, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 496 }, { "completion_length": 172.875, "epoch": 0.06650608858557473, "grad_norm": 0.47285860776901245, "kl": 0.06823530793190002, "learning_rate": 1.1752018394169882e-06, "loss": 0.0027, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 497 }, { "completion_length": 198.5, "epoch": 0.06663990365315134, "grad_norm": 0.5939863920211792, "kl": 0.027758851647377014, "learning_rate": 1.1646460466876783e-06, "loss": 0.0011, "reward": 0.906624972820282, "reward_std": 1.2428367137908936, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 498 }, { "completion_length": 171.0, "epoch": 0.06677371872072796, "grad_norm": 0.47271528840065, "kl": 0.07039660215377808, "learning_rate": 1.1541234597732947e-06, "loss": 0.0028, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 499 }, { "completion_length": 150.125, "epoch": 0.06690753378830457, "grad_norm": 0.021916870027780533, "kl": 0.05909294635057449, "learning_rate": 1.1436343403356019e-06, "loss": 0.0024, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 500 }, { "completion_length": 166.875, "epoch": 0.06704134885588117, "grad_norm": 0.5901755094528198, "kl": 0.058194730430841446, "learning_rate": 1.1331789492041413e-06, "loss": 0.0023, "reward": 2.0321249961853027, "reward_std": 1.1874943971633911, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 501 }, { "completion_length": 170.125, "epoch": 0.06717516392345778, "grad_norm": 0.6071956753730774, "kl": 0.07166877388954163, "learning_rate": 1.122757546369744e-06, "loss": 0.0029, "reward": 1.9382500648498535, "reward_std": 1.3148058652877808, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 502 }, { "completion_length": 145.875, "epoch": 0.06730897899103438, "grad_norm": 0.6624757051467896, "kl": 0.09455478191375732, "learning_rate": 1.112370390978063e-06, "loss": 0.0038, "reward": 2.822499990463257, "reward_std": 0.15132081508636475, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32249999046325684, "step": 503 }, { "completion_length": 104.25, "epoch": 0.067442794058611, "grad_norm": 0.023288747295737267, "kl": 0.08157818764448166, "learning_rate": 1.1020177413231334e-06, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 504 }, { "completion_length": 122.375, "epoch": 0.06757660912618761, "grad_norm": 0.7371475100517273, "kl": 0.08876486122608185, "learning_rate": 1.0916998548409449e-06, "loss": 0.0036, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 505 }, { "completion_length": 94.75, "epoch": 0.06771042419376422, "grad_norm": 0.9014264345169067, "kl": 0.09251242130994797, "learning_rate": 1.081416988103046e-06, "loss": 0.0037, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 506 }, { "completion_length": 132.25, "epoch": 0.06784423926134082, "grad_norm": 0.7173821926116943, "kl": 0.09341318905353546, "learning_rate": 1.0711693968101563e-06, "loss": 0.0037, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 507 }, { "completion_length": 180.5, "epoch": 0.06797805432891743, "grad_norm": 0.5452911257743835, "kl": 0.042653538286685944, "learning_rate": 1.0609573357858166e-06, "loss": 0.0017, "reward": 2.5163750648498535, "reward_std": 0.9672667980194092, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887497544288635, "step": 508 }, { "completion_length": 139.75, "epoch": 0.06811186939649405, "grad_norm": 0.03266792371869087, "kl": 0.0783013254404068, "learning_rate": 1.0507810589700446e-06, "loss": 0.0031, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 509 }, { "completion_length": 193.875, "epoch": 0.06824568446407066, "grad_norm": 0.4969862997531891, "kl": 0.03490892052650452, "learning_rate": 1.040640819413026e-06, "loss": 0.0014, "reward": 1.1722500324249268, "reward_std": 1.4114717245101929, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 510 }, { "completion_length": 121.875, "epoch": 0.06837949953164726, "grad_norm": 0.7441927790641785, "kl": 0.15234751999378204, "learning_rate": 1.0305368692688175e-06, "loss": 0.0061, "reward": 2.5738749504089355, "reward_std": 0.8545385003089905, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3238750100135803, "step": 511 }, { "completion_length": 126.75, "epoch": 0.06851331459922387, "grad_norm": 0.8319272994995117, "kl": 0.16322901844978333, "learning_rate": 1.0204694597890814e-06, "loss": 0.0065, "reward": 1.5321249961853027, "reward_std": 1.131591796875, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 512 }, { "completion_length": 185.375, "epoch": 0.06864712966680048, "grad_norm": 0.8821345567703247, "kl": 0.17697496712207794, "learning_rate": 1.0104388413168308e-06, "loss": 0.0071, "reward": 1.4762499332427979, "reward_std": 1.4168943166732788, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16374999284744263, "step": 513 }, { "completion_length": 102.125, "epoch": 0.0687809447343771, "grad_norm": 0.020036092028021812, "kl": 0.09659949690103531, "learning_rate": 1.0004452632802158e-06, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 514 }, { "completion_length": 186.0, "epoch": 0.0689147598019537, "grad_norm": 0.7651040554046631, "kl": 0.05099139362573624, "learning_rate": 9.90488974186306e-07, "loss": 0.002, "reward": 2.2821249961853027, "reward_std": 1.1178104877471924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 515 }, { "completion_length": 190.75, "epoch": 0.06904857486953031, "grad_norm": 0.42695412039756775, "kl": 0.049865543842315674, "learning_rate": 9.805702216149252e-07, "loss": 0.002, "reward": 1.562999963760376, "reward_std": 1.403656005859375, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 516 }, { "completion_length": 116.5, "epoch": 0.06918238993710692, "grad_norm": 0.023945944383740425, "kl": 0.09146615117788315, "learning_rate": 9.70689252212484e-07, "loss": 0.0037, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 517 }, { "completion_length": 171.0, "epoch": 0.06931620500468352, "grad_norm": 0.7296186685562134, "kl": 0.09797731786966324, "learning_rate": 9.608463116858544e-07, "loss": 0.0039, "reward": 1.5109999179840088, "reward_std": 1.3577779531478882, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26099997758865356, "step": 518 }, { "completion_length": 189.125, "epoch": 0.06945002007226013, "grad_norm": 0.5418848991394043, "kl": 0.049279242753982544, "learning_rate": 9.510416447962545e-07, "loss": 0.002, "reward": 1.844249963760376, "reward_std": 1.3903591632843018, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28174999356269836, "step": 519 }, { "completion_length": 116.125, "epoch": 0.06958383513983675, "grad_norm": 0.02910609357059002, "kl": 0.09516607224941254, "learning_rate": 9.412754953531664e-07, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 520 }, { "completion_length": 119.375, "epoch": 0.06971765020741336, "grad_norm": 0.7130349278450012, "kl": 0.10331179201602936, "learning_rate": 9.315481062082688e-07, "loss": 0.0041, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 521 }, { "completion_length": 136.0, "epoch": 0.06985146527498996, "grad_norm": 0.5340465307235718, "kl": 0.08746315538883209, "learning_rate": 9.21859719249403e-07, "loss": 0.0035, "reward": 2.827624797821045, "reward_std": 0.13682518899440765, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32762497663497925, "step": 522 }, { "completion_length": 157.125, "epoch": 0.06998528034256657, "grad_norm": 0.5538584589958191, "kl": 0.07469462603330612, "learning_rate": 9.122105753945532e-07, "loss": 0.003, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 523 }, { "completion_length": 139.875, "epoch": 0.07011909541014318, "grad_norm": 0.025441091507673264, "kl": 0.08562923967838287, "learning_rate": 9.026009145858608e-07, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 524 }, { "completion_length": 182.0, "epoch": 0.0702529104777198, "grad_norm": 0.5950530767440796, "kl": 0.058806002140045166, "learning_rate": 8.930309757836517e-07, "loss": 0.0024, "reward": 1.5632500648498535, "reward_std": 1.406591773033142, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 525 }, { "completion_length": 80.125, "epoch": 0.0703867255452964, "grad_norm": 0.055731289088726044, "kl": 0.1456955224275589, "learning_rate": 8.835009969605013e-07, "loss": 0.0058, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 526 }, { "completion_length": 182.875, "epoch": 0.07052054061287301, "grad_norm": 0.4541008174419403, "kl": 0.05849134922027588, "learning_rate": 8.740112150953095e-07, "loss": 0.0023, "reward": 2.5163750648498535, "reward_std": 0.9672667980194092, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 527 }, { "completion_length": 187.25, "epoch": 0.07065435568044962, "grad_norm": 0.4512054920196533, "kl": 0.05276969447731972, "learning_rate": 8.645618661674144e-07, "loss": 0.0021, "reward": 0.609624981880188, "reward_std": 0.9557148218154907, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17212499678134918, "step": 528 }, { "completion_length": 91.625, "epoch": 0.07078817074802622, "grad_norm": 0.025568531826138496, "kl": 0.0966460257768631, "learning_rate": 8.551531851507186e-07, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 529 }, { "completion_length": 178.25, "epoch": 0.07092198581560284, "grad_norm": 0.47066354751586914, "kl": 0.046781882643699646, "learning_rate": 8.457854060078521e-07, "loss": 0.0019, "reward": 2.1882500648498535, "reward_std": 1.2734655141830444, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 530 }, { "completion_length": 89.625, "epoch": 0.07105580088317945, "grad_norm": 0.0411115325987339, "kl": 0.09715510904788971, "learning_rate": 8.364587616843478e-07, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 531 }, { "completion_length": 195.5, "epoch": 0.07118961595075605, "grad_norm": 0.47734642028808594, "kl": 0.02505526691675186, "learning_rate": 8.271734841028553e-07, "loss": 0.001, "reward": 0.593999981880188, "reward_std": 0.9564490914344788, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.218999981880188, "step": 532 }, { "completion_length": 129.875, "epoch": 0.07132343101833266, "grad_norm": 0.018141288310289383, "kl": 0.07236053049564362, "learning_rate": 8.179298041573671e-07, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 533 }, { "completion_length": 163.875, "epoch": 0.07145724608590927, "grad_norm": 0.5051037669181824, "kl": 0.0811697319149971, "learning_rate": 8.08727951707487e-07, "loss": 0.0032, "reward": 1.8913750648498535, "reward_std": 1.3605663776397705, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 534 }, { "completion_length": 181.125, "epoch": 0.07159106115348589, "grad_norm": 0.4789898991584778, "kl": 0.05631384253501892, "learning_rate": 7.995681555727011e-07, "loss": 0.0023, "reward": 2.2038750648498535, "reward_std": 1.2449820041656494, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 535 }, { "completion_length": 117.0, "epoch": 0.0717248762210625, "grad_norm": 0.019799182191491127, "kl": 0.08036641776561737, "learning_rate": 7.904506435266998e-07, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 536 }, { "completion_length": 107.5, "epoch": 0.0718586912886391, "grad_norm": 1.1266930103302002, "kl": 0.11253387480974197, "learning_rate": 7.813756422917046e-07, "loss": 0.0045, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 537 }, { "completion_length": 130.625, "epoch": 0.07199250635621571, "grad_norm": 0.04750635102391243, "kl": 0.07945028692483902, "learning_rate": 7.723433775328385e-07, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 538 }, { "completion_length": 131.375, "epoch": 0.07212632142379231, "grad_norm": 0.025986328721046448, "kl": 0.09481579065322876, "learning_rate": 7.633540738525066e-07, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 539 }, { "completion_length": 110.75, "epoch": 0.07226013649136893, "grad_norm": 0.01735413447022438, "kl": 0.08277501165866852, "learning_rate": 7.544079547848183e-07, "loss": 0.0033, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 540 }, { "completion_length": 128.0, "epoch": 0.07239395155894554, "grad_norm": 0.03729762136936188, "kl": 0.07067389786243439, "learning_rate": 7.455052427900214e-07, "loss": 0.0028, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 541 }, { "completion_length": 125.625, "epoch": 0.07252776662652215, "grad_norm": 1.8370012044906616, "kl": 0.10086166858673096, "learning_rate": 7.366461592489781e-07, "loss": 0.004, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 542 }, { "completion_length": 161.125, "epoch": 0.07266158169409875, "grad_norm": 0.02304711751639843, "kl": 0.056352630257606506, "learning_rate": 7.278309244576525e-07, "loss": 0.0023, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 543 }, { "completion_length": 187.375, "epoch": 0.07279539676167536, "grad_norm": 0.501508355140686, "kl": 0.03645408898591995, "learning_rate": 7.190597576216385e-07, "loss": 0.0015, "reward": 0.547374963760376, "reward_std": 0.3536894917488098, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29737499356269836, "step": 544 }, { "completion_length": 151.625, "epoch": 0.07292921182925198, "grad_norm": 0.028268888592720032, "kl": 0.08622566610574722, "learning_rate": 7.103328768507039e-07, "loss": 0.0034, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 545 }, { "completion_length": 182.125, "epoch": 0.07306302689682859, "grad_norm": 0.42480555176734924, "kl": 0.04699990153312683, "learning_rate": 7.016504991533727e-07, "loss": 0.0019, "reward": 1.844249963760376, "reward_std": 1.3903591632843018, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28174999356269836, "step": 546 }, { "completion_length": 146.75, "epoch": 0.0731968419644052, "grad_norm": 0.6320887804031372, "kl": 0.08432218432426453, "learning_rate": 6.930128404315214e-07, "loss": 0.0034, "reward": 2.84375, "reward_std": 0.09121671319007874, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34375, "step": 547 }, { "completion_length": 121.0, "epoch": 0.0733306570319818, "grad_norm": 0.0864342674612999, "kl": 0.09474191069602966, "learning_rate": 6.844201154750176e-07, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 548 }, { "completion_length": 127.625, "epoch": 0.0734644720995584, "grad_norm": 0.9658562541007996, "kl": 0.077848419547081, "learning_rate": 6.75872537956373e-07, "loss": 0.0031, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 549 }, { "completion_length": 140.0, "epoch": 0.07359828716713501, "grad_norm": 1.1906623840332031, "kl": 0.10162507742643356, "learning_rate": 6.673703204254348e-07, "loss": 0.0041, "reward": 1.875999927520752, "reward_std": 1.0690449476242065, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 550 }, { "completion_length": 180.75, "epoch": 0.07373210223471163, "grad_norm": 0.01769913360476494, "kl": 0.056974902749061584, "learning_rate": 6.589136743040955e-07, "loss": 0.0023, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 551 }, { "completion_length": 115.0, "epoch": 0.07386591730228824, "grad_norm": 0.8261173963546753, "kl": 0.21568548679351807, "learning_rate": 6.505028098810407e-07, "loss": 0.0086, "reward": 2.5722498893737793, "reward_std": 0.7602196335792542, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32225000858306885, "step": 552 }, { "completion_length": 186.5, "epoch": 0.07399973236986485, "grad_norm": 0.6593990325927734, "kl": 0.04152335599064827, "learning_rate": 6.421379363065142e-07, "loss": 0.0017, "reward": 1.2503750324249268, "reward_std": 1.3616429567337036, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25037500262260437, "step": 553 }, { "completion_length": 109.75, "epoch": 0.07413354743744145, "grad_norm": 0.029769033193588257, "kl": 0.08850817382335663, "learning_rate": 6.338192615871247e-07, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 554 }, { "completion_length": 153.125, "epoch": 0.07426736250501806, "grad_norm": 0.4243547320365906, "kl": 0.07389362156391144, "learning_rate": 6.255469925806643e-07, "loss": 0.003, "reward": 2.6102499961853027, "reward_std": 0.7516544461250305, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 555 }, { "completion_length": 111.75, "epoch": 0.07440117757259468, "grad_norm": 1.2789517641067505, "kl": 0.07200345396995544, "learning_rate": 6.17321334990973e-07, "loss": 0.0029, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 556 }, { "completion_length": 135.75, "epoch": 0.07453499264017129, "grad_norm": 0.6069316267967224, "kl": 0.0893450677394867, "learning_rate": 6.09142493362816e-07, "loss": 0.0036, "reward": 2.860374927520752, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 557 }, { "completion_length": 194.5, "epoch": 0.07466880770774789, "grad_norm": 0.6206295490264893, "kl": 0.024507684633135796, "learning_rate": 6.010106710768051e-07, "loss": 0.001, "reward": 0.562624990940094, "reward_std": 0.921034038066864, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187624990940094, "step": 558 }, { "completion_length": 122.125, "epoch": 0.0748026227753245, "grad_norm": 0.025246400386095047, "kl": 0.07555050402879715, "learning_rate": 5.929260703443337e-07, "loss": 0.003, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 559 }, { "completion_length": 162.125, "epoch": 0.0749364378429011, "grad_norm": 0.05673745274543762, "kl": 0.07199706137180328, "learning_rate": 5.848888922025553e-07, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 560 }, { "completion_length": 171.125, "epoch": 0.07507025291047772, "grad_norm": 0.021634168922901154, "kl": 0.06122050806879997, "learning_rate": 5.768993365093784e-07, "loss": 0.0024, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 561 }, { "completion_length": 152.0, "epoch": 0.07520406797805433, "grad_norm": 0.6721054911613464, "kl": 0.07255363464355469, "learning_rate": 5.689576019385015e-07, "loss": 0.0029, "reward": 1.875999927520752, "reward_std": 1.0690449476242065, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 562 }, { "completion_length": 166.0, "epoch": 0.07533788304563094, "grad_norm": 0.4310239553451538, "kl": 0.04782772809267044, "learning_rate": 5.610638859744694e-07, "loss": 0.0019, "reward": 2.5163750648498535, "reward_std": 0.9672667980194092, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32887500524520874, "step": 563 }, { "completion_length": 200.0, "epoch": 0.07547169811320754, "grad_norm": 0.01660126820206642, "kl": 0.011784886009991169, "learning_rate": 5.532183849077651e-07, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 564 }, { "completion_length": 124.75, "epoch": 0.07560551318078415, "grad_norm": 0.06705635040998459, "kl": 0.09791280329227448, "learning_rate": 5.454212938299256e-07, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 565 }, { "completion_length": 145.75, "epoch": 0.07573932824836077, "grad_norm": 0.5540480613708496, "kl": 0.08040163666009903, "learning_rate": 5.376728066286943e-07, "loss": 0.0032, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 566 }, { "completion_length": 109.125, "epoch": 0.07587314331593738, "grad_norm": 0.027278535068035126, "kl": 0.08633355796337128, "learning_rate": 5.299731159831953e-07, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 567 }, { "completion_length": 122.5, "epoch": 0.07600695838351398, "grad_norm": 0.027871521189808846, "kl": 0.09406545013189316, "learning_rate": 5.223224133591475e-07, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 568 }, { "completion_length": 120.75, "epoch": 0.07614077345109059, "grad_norm": 0.5620666742324829, "kl": 0.10334020853042603, "learning_rate": 5.147208890040975e-07, "loss": 0.0041, "reward": 2.8602499961853027, "reward_std": 0.0445476770401001, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 569 }, { "completion_length": 175.875, "epoch": 0.0762745885186672, "grad_norm": 0.014393470250070095, "kl": 0.06305301189422607, "learning_rate": 5.071687319426946e-07, "loss": 0.0025, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 570 }, { "completion_length": 137.375, "epoch": 0.07640840358624382, "grad_norm": 0.6216964721679688, "kl": 0.09672308713197708, "learning_rate": 4.996661299719846e-07, "loss": 0.0039, "reward": 2.8368749618530273, "reward_std": 0.11066216975450516, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33687499165534973, "step": 571 }, { "completion_length": 191.5, "epoch": 0.07654221865382042, "grad_norm": 0.5882800817489624, "kl": 0.033178165555000305, "learning_rate": 4.922132696567463e-07, "loss": 0.0013, "reward": 1.0003750324249268, "reward_std": 1.193793535232544, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25037500262260437, "step": 572 }, { "completion_length": 126.125, "epoch": 0.07667603372139703, "grad_norm": 0.8736768960952759, "kl": 0.0916358083486557, "learning_rate": 4.848103363248447e-07, "loss": 0.0037, "reward": 1.688249945640564, "reward_std": 1.3010985851287842, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31325000524520874, "step": 573 }, { "completion_length": 139.875, "epoch": 0.07680984878897364, "grad_norm": 0.6638439893722534, "kl": 0.08750330656766891, "learning_rate": 4.774575140626317e-07, "loss": 0.0035, "reward": 2.5764999389648438, "reward_std": 0.7009395956993103, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3264999985694885, "step": 574 }, { "completion_length": 86.25, "epoch": 0.07694366385655024, "grad_norm": 0.7298170924186707, "kl": 0.1341966688632965, "learning_rate": 4.7015498571035877e-07, "loss": 0.0054, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 575 }, { "completion_length": 165.625, "epoch": 0.07707747892412686, "grad_norm": 0.4482704699039459, "kl": 0.05860976129770279, "learning_rate": 4.6290293285763816e-07, "loss": 0.0023, "reward": 2.5477499961853027, "reward_std": 0.9284311532974243, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 576 }, { "completion_length": 124.0, "epoch": 0.07721129399170347, "grad_norm": 0.04342132434248924, "kl": 0.09685185551643372, "learning_rate": 4.5570153583892165e-07, "loss": 0.0039, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 577 }, { "completion_length": 148.75, "epoch": 0.07734510905928008, "grad_norm": 0.021825704723596573, "kl": 0.08103406429290771, "learning_rate": 4.485509737290214e-07, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 578 }, { "completion_length": 160.75, "epoch": 0.07747892412685668, "grad_norm": 0.013138951733708382, "kl": 0.04282214865088463, "learning_rate": 4.41451424338652e-07, "loss": 0.0017, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 579 }, { "completion_length": 198.75, "epoch": 0.07761273919443329, "grad_norm": 0.46848276257514954, "kl": 0.014850848354399204, "learning_rate": 4.344030642100133e-07, "loss": 0.0006, "reward": 0.468874990940094, "reward_std": 0.9726253747940063, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.156374990940094, "step": 580 }, { "completion_length": 83.125, "epoch": 0.0777465542620099, "grad_norm": 0.021492378786206245, "kl": 0.09291397780179977, "learning_rate": 4.27406068612396e-07, "loss": 0.0037, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 581 }, { "completion_length": 156.125, "epoch": 0.07788036932958652, "grad_norm": 0.48811134696006775, "kl": 0.0598423033952713, "learning_rate": 4.204606115378282e-07, "loss": 0.0024, "reward": 2.5321249961853027, "reward_std": 0.9231545329093933, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 582 }, { "completion_length": 111.5, "epoch": 0.07801418439716312, "grad_norm": 0.036602240055799484, "kl": 0.0803874209523201, "learning_rate": 4.1356686569674344e-07, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 583 }, { "completion_length": 88.875, "epoch": 0.07814799946473973, "grad_norm": 0.03849063813686371, "kl": 0.08863966166973114, "learning_rate": 4.0672500251369204e-07, "loss": 0.0035, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 584 }, { "completion_length": 117.125, "epoch": 0.07828181453231634, "grad_norm": 0.5750055313110352, "kl": 0.07768222689628601, "learning_rate": 3.999351921230715e-07, "loss": 0.0031, "reward": 2.844749927520752, "reward_std": 0.0578637570142746, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34474998712539673, "step": 585 }, { "completion_length": 123.125, "epoch": 0.07841562959989294, "grad_norm": 0.018067844212055206, "kl": 0.0754643976688385, "learning_rate": 3.931976033649021e-07, "loss": 0.003, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 586 }, { "completion_length": 105.375, "epoch": 0.07854944466746956, "grad_norm": 0.038764555007219315, "kl": 0.08026950061321259, "learning_rate": 3.8651240378062505e-07, "loss": 0.0032, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 587 }, { "completion_length": 192.375, "epoch": 0.07868325973504617, "grad_norm": 0.7248687744140625, "kl": 0.02163390815258026, "learning_rate": 3.798797596089351e-07, "loss": 0.0009, "reward": 0.906624972820282, "reward_std": 1.2428367137908936, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21912500262260437, "step": 588 }, { "completion_length": 125.625, "epoch": 0.07881707480262277, "grad_norm": 0.036562297493219376, "kl": 0.07197481393814087, "learning_rate": 3.732998357816514e-07, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 589 }, { "completion_length": 134.125, "epoch": 0.07895088987019938, "grad_norm": 0.4463981091976166, "kl": 0.09366697072982788, "learning_rate": 3.66772795919611e-07, "loss": 0.0037, "reward": 2.6102499961853027, "reward_std": 0.7516544461250305, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36024999618530273, "step": 590 }, { "completion_length": 168.0, "epoch": 0.07908470493777599, "grad_norm": 0.5940809845924377, "kl": 0.07222042977809906, "learning_rate": 3.6029880232860417e-07, "loss": 0.0029, "reward": 1.3012499809265137, "reward_std": 1.228981375694275, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.30124998092651367, "step": 591 }, { "completion_length": 122.75, "epoch": 0.07921852000535261, "grad_norm": 0.7764634490013123, "kl": 0.09206965565681458, "learning_rate": 3.538780159953348e-07, "loss": 0.0037, "reward": 2.563499927520752, "reward_std": 0.8838834762573242, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 592 }, { "completion_length": 194.5, "epoch": 0.07935233507292921, "grad_norm": 0.610711932182312, "kl": 0.03278031945228577, "learning_rate": 3.4751059658342106e-07, "loss": 0.0013, "reward": 1.4847500324249268, "reward_std": 1.454218864440918, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23475000262260437, "step": 593 }, { "completion_length": 112.25, "epoch": 0.07948615014050582, "grad_norm": 0.031216178089380264, "kl": 0.088915154337883, "learning_rate": 3.41196702429423e-07, "loss": 0.0036, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 594 }, { "completion_length": 138.25, "epoch": 0.07961996520808243, "grad_norm": 0.03143882006406784, "kl": 0.09493938833475113, "learning_rate": 3.3493649053890325e-07, "loss": 0.0038, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 595 }, { "completion_length": 109.375, "epoch": 0.07975378027565903, "grad_norm": 0.017304735258221626, "kl": 0.09940584003925323, "learning_rate": 3.2873011658252796e-07, "loss": 0.004, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 596 }, { "completion_length": 170.5, "epoch": 0.07988759534323565, "grad_norm": 0.5503603219985962, "kl": 0.0743388682603836, "learning_rate": 3.225777348921899e-07, "loss": 0.003, "reward": 2.5321249961853027, "reward_std": 0.9726253151893616, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 597 }, { "completion_length": 154.25, "epoch": 0.08002141041081226, "grad_norm": 0.6240984797477722, "kl": 0.09557679295539856, "learning_rate": 3.164794984571759e-07, "loss": 0.0038, "reward": 2.8297500610351562, "reward_std": 0.13081471621990204, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3297500014305115, "step": 598 }, { "completion_length": 162.625, "epoch": 0.08015522547838887, "grad_norm": 0.013632676564157009, "kl": 0.07335784286260605, "learning_rate": 3.1043555892035863e-07, "loss": 0.0029, "reward": 2.875999927520752, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 599 }, { "completion_length": 112.5, "epoch": 0.08028904054596547, "grad_norm": 1.010912537574768, "kl": 0.08338392525911331, "learning_rate": 3.044460665744284e-07, "loss": 0.0033, "reward": 2.625999927520752, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 600 } ], "logging_steps": 1, "max_steps": 700, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }