{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 27, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 515.3928833007812, "epoch": 0.003745318352059925, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.1899932947490015, "reward_std": 1.3191318809986115, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.318080373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.789323702454567, "step": 1 }, { "completion_length": 533.0357513427734, "epoch": 0.00749063670411985, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.00650669834431028, "reward_std": 1.158670336008072, "rewards/correctness_reward_func": 0.3125000149011612, "rewards/int_reward_func": 0.2756696566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5946763753890991, "step": 2 }, { "completion_length": 519.4286041259766, "epoch": 0.011235955056179775, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.07290402640137472, "reward_std": 1.1724370419979095, "rewards/correctness_reward_func": 0.2500000149011612, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5896451026201248, "step": 3 }, { "completion_length": 526.2857513427734, "epoch": 0.0149812734082397, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.16426785849034786, "reward_std": 1.2251884937286377, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2700892984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6229732632637024, "step": 4 }, { "completion_length": 529.8928680419922, "epoch": 0.018726591760299626, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.23104465380311012, "reward_std": 1.5072951018810272, "rewards/correctness_reward_func": 0.2767857275903225, "rewards/int_reward_func": 0.2957589402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8047053962945938, "step": 5 }, { "completion_length": 544.4643249511719, "epoch": 0.02247191011235955, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.09965624660253525, "reward_std": 1.2428130805492401, "rewards/correctness_reward_func": 0.334821455180645, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7012187838554382, "step": 6 }, { "completion_length": 522.5000152587891, "epoch": 0.026217228464419477, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.13676341250538826, "reward_std": 1.2565626800060272, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.2756696566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.738325908780098, "step": 7 }, { "completion_length": 521.4643096923828, "epoch": 0.0299625468164794, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.02000223658978939, "reward_std": 1.237034946680069, "rewards/correctness_reward_func": 0.3705357387661934, "rewards/int_reward_func": 0.2678571492433548, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6595112085342407, "step": 8 }, { "completion_length": 516.4643096923828, "epoch": 0.033707865168539325, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.19426116452086717, "reward_std": 1.3457031548023224, "rewards/correctness_reward_func": 0.2544643022119999, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7299755066633224, "step": 9 }, { "completion_length": 521.5357360839844, "epoch": 0.03745318352059925, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.08841964812017977, "reward_std": 1.1999126970767975, "rewards/correctness_reward_func": 0.2812500111758709, "rewards/int_reward_func": 0.3035714477300644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6732410788536072, "step": 10 }, { "completion_length": 536.6428833007812, "epoch": 0.04119850187265917, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.2129687536507845, "reward_std": 1.3859791457653046, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.271205373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7520312815904617, "step": 11 }, { "completion_length": 534.5000305175781, "epoch": 0.0449438202247191, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.14605582039803267, "reward_std": 1.3036423921585083, "rewards/correctness_reward_func": 0.2589285895228386, "rewards/int_reward_func": 0.2957589440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7007433176040649, "step": 12 }, { "completion_length": 517.1786041259766, "epoch": 0.04868913857677903, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.21629687771201134, "reward_std": 1.3215062022209167, "rewards/correctness_reward_func": 0.2678571529686451, "rewards/int_reward_func": 0.2700892947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7542433589696884, "step": 13 }, { "completion_length": 523.5714569091797, "epoch": 0.052434456928838954, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.11244195885956287, "reward_std": 1.3325236439704895, "rewards/correctness_reward_func": 0.3125000111758709, "rewards/int_reward_func": 0.2812500149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7073080092668533, "step": 14 }, { "completion_length": 520.7500305175781, "epoch": 0.056179775280898875, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.16676786169409752, "reward_std": 1.2786953151226044, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2868303656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7125268131494522, "step": 15 }, { "completion_length": 529.5000305175781, "epoch": 0.0599250936329588, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": -0.15193304512649775, "reward_std": 1.2763742506504059, "rewards/correctness_reward_func": 0.2767857313156128, "rewards/int_reward_func": 0.2912946492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7200134247541428, "step": 16 }, { "completion_length": 529.1428985595703, "epoch": 0.06367041198501873, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.12793974205851555, "reward_std": 1.207583248615265, "rewards/correctness_reward_func": 0.2544643022119999, "rewards/int_reward_func": 0.2678571492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6502611860632896, "step": 17 }, { "completion_length": 532.3214569091797, "epoch": 0.06741573033707865, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.13686831016093493, "reward_std": 1.2532348036766052, "rewards/correctness_reward_func": 0.290178582072258, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7105290591716766, "step": 18 }, { "completion_length": 520.7143096923828, "epoch": 0.07116104868913857, "grad_norm": 44.85335159301758, "kl": 0.0, "learning_rate": 3.7037037037037036e-08, "loss": 0.0, "reward": -0.061281259171664715, "reward_std": 1.1696560382843018, "rewards/correctness_reward_func": 0.2991071529686451, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6494508683681488, "step": 19 }, { "completion_length": 521.7500152587891, "epoch": 0.0749063670411985, "grad_norm": 57.530582427978516, "kl": 0.0, "learning_rate": 7.407407407407407e-08, "loss": 0.0, "reward": -0.08753572031855583, "reward_std": 1.292522132396698, "rewards/correctness_reward_func": 0.357142873108387, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7337411046028137, "step": 20 }, { "completion_length": 549.2143249511719, "epoch": 0.07865168539325842, "grad_norm": 58.32808303833008, "kl": 0.0003091096878051758, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "reward": -0.011935262009501457, "reward_std": 1.229793220758438, "rewards/correctness_reward_func": 0.3705357387661934, "rewards/int_reward_func": 0.2946428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6771138235926628, "step": 21 }, { "completion_length": 515.3928833007812, "epoch": 0.08239700374531835, "grad_norm": 46.207237243652344, "kl": 0.00029021501541137695, "learning_rate": 1.4814814814814815e-07, "loss": 0.0, "reward": -0.2325044833123684, "reward_std": 1.287639170885086, "rewards/correctness_reward_func": 0.2946428768336773, "rewards/int_reward_func": 0.2667410895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7938884347677231, "step": 22 }, { "completion_length": 520.4286041259766, "epoch": 0.08614232209737828, "grad_norm": 59.19898223876953, "kl": 0.0009493827819824219, "learning_rate": 1.8518518518518516e-07, "loss": 0.0, "reward": -0.27188840508461, "reward_std": 1.2708870768547058, "rewards/correctness_reward_func": 0.2366071566939354, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.788629487156868, "step": 23 }, { "completion_length": 524.7143096923828, "epoch": 0.0898876404494382, "grad_norm": 51.78929901123047, "kl": 0.008197784423828125, "learning_rate": 2.222222222222222e-07, "loss": 0.0003, "reward": -0.1769486702978611, "reward_std": 1.3113940060138702, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2667410895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7249397337436676, "step": 24 }, { "completion_length": 522.7857360839844, "epoch": 0.09363295880149813, "grad_norm": 46.790557861328125, "kl": 0.06230926513671875, "learning_rate": 2.5925925925925923e-07, "loss": 0.0025, "reward": -0.05646652076393366, "reward_std": 1.2401413023471832, "rewards/correctness_reward_func": 0.3750000223517418, "rewards/int_reward_func": 0.2689732238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7004397809505463, "step": 25 }, { "completion_length": 519.3571624755859, "epoch": 0.09737827715355805, "grad_norm": 37.130306243896484, "kl": 0.0727081298828125, "learning_rate": 2.962962962962963e-07, "loss": 0.0029, "reward": -0.14675001706928015, "reward_std": 1.2979865074157715, "rewards/correctness_reward_func": 0.2991071492433548, "rewards/int_reward_func": 0.318080373108387, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7650536149740219, "step": 26 }, { "completion_length": 517.5000305175781, "epoch": 0.10112359550561797, "grad_norm": 39.826942443847656, "kl": 0.2943115234375, "learning_rate": 3.333333333333333e-07, "loss": 0.0118, "reward": -0.10758260171860456, "reward_std": 1.2460315823554993, "rewards/correctness_reward_func": 0.3080357201397419, "rewards/int_reward_func": 0.2734375074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6890558004379272, "step": 27 }, { "epoch": 0.10112359550561797, "eval_completion_length": 533.4601748817845, "eval_kl": 0.3318269428453947, "eval_loss": 0.013290046714246273, "eval_reward": -0.19538318767751517, "eval_reward_std": 1.2432113622364245, "eval_rewards/correctness_reward_func": 0.20874061106066955, "eval_rewards/int_reward_func": 0.27404449911494005, "eval_rewards/soft_format_reward_func": 0.00016447369156307297, "eval_rewards/strict_format_reward_func": 4.6992483303735133e-05, "eval_rewards/xmlcount_reward_func": -0.6783797549573999, "eval_runtime": 2477.4681, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.038, "step": 27 }, { "completion_length": 519.5000152587891, "epoch": 0.10486891385767791, "grad_norm": 36.47107696533203, "kl": 0.3983154296875, "learning_rate": 3.703703703703703e-07, "loss": 0.0159, "reward": -0.08669420145452023, "reward_std": 1.1603919565677643, "rewards/correctness_reward_func": 0.24107144493609667, "rewards/int_reward_func": 0.2812500149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6090156137943268, "step": 28 }, { "completion_length": 524.8214416503906, "epoch": 0.10861423220973783, "grad_norm": 30.607803344726562, "kl": 0.40972900390625, "learning_rate": 4.0740740740740737e-07, "loss": 0.0164, "reward": -0.06404018122702837, "reward_std": 1.2442965805530548, "rewards/correctness_reward_func": 0.3526785969734192, "rewards/int_reward_func": 0.2656250074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6823437958955765, "step": 29 }, { "completion_length": 525.4643096923828, "epoch": 0.11235955056179775, "grad_norm": 15.785079956054688, "kl": 0.9742431640625, "learning_rate": 4.444444444444444e-07, "loss": 0.039, "reward": -0.12213393254205585, "reward_std": 1.2745769321918488, "rewards/correctness_reward_func": 0.263392873108387, "rewards/int_reward_func": 0.2767857275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6623125225305557, "step": 30 }, { "completion_length": 519.0714416503906, "epoch": 0.11610486891385768, "grad_norm": 12.257494926452637, "kl": 0.83203125, "learning_rate": 4.814814814814814e-07, "loss": 0.0333, "reward": -0.03814508765935898, "reward_std": 1.1726315319538116, "rewards/correctness_reward_func": 0.299107164144516, "rewards/int_reward_func": 0.2700892947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6073415130376816, "step": 31 }, { "completion_length": 531.1428985595703, "epoch": 0.1198501872659176, "grad_norm": 13.690109252929688, "kl": 1.0924072265625, "learning_rate": 5.185185185185185e-07, "loss": 0.0437, "reward": -0.08791965199634433, "reward_std": 1.1981682777404785, "rewards/correctness_reward_func": 0.2410714440047741, "rewards/int_reward_func": 0.3247768059372902, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.653767928481102, "step": 32 }, { "completion_length": 527.7500152587891, "epoch": 0.12359550561797752, "grad_norm": 10.83491325378418, "kl": 0.9154052734375, "learning_rate": 5.555555555555555e-07, "loss": 0.0366, "reward": -0.1102946475148201, "reward_std": 1.2648592591285706, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2656250074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6571696400642395, "step": 33 }, { "completion_length": 520.0357513427734, "epoch": 0.12734082397003746, "grad_norm": 5.259093284606934, "kl": 1.219482421875, "learning_rate": 5.925925925925926e-07, "loss": 0.0488, "reward": -0.1045870566740632, "reward_std": 1.2835467457771301, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6815960258245468, "step": 34 }, { "completion_length": 536.4286041259766, "epoch": 0.13108614232209737, "grad_norm": 4.761948108673096, "kl": 1.23583984375, "learning_rate": 6.296296296296296e-07, "loss": 0.0494, "reward": -0.1267544706352055, "reward_std": 1.181850403547287, "rewards/correctness_reward_func": 0.2723214328289032, "rewards/int_reward_func": 0.2823660895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6814420074224472, "step": 35 }, { "completion_length": 515.1428833007812, "epoch": 0.1348314606741573, "grad_norm": 4.87550687789917, "kl": 1.398681640625, "learning_rate": 6.666666666666666e-07, "loss": 0.0559, "reward": -0.21099777147173882, "reward_std": 1.4128418564796448, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.3035714402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7690335363149643, "step": 36 }, { "completion_length": 521.6786041259766, "epoch": 0.13857677902621723, "grad_norm": 4.369849681854248, "kl": 1.700439453125, "learning_rate": 7.037037037037037e-07, "loss": 0.068, "reward": -0.0781383914873004, "reward_std": 1.3968295454978943, "rewards/correctness_reward_func": 0.3125000149011612, "rewards/int_reward_func": 0.279017873108387, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6707723736763, "step": 37 }, { "completion_length": 525.8214569091797, "epoch": 0.14232209737827714, "grad_norm": 4.071728706359863, "kl": 1.715087890625, "learning_rate": 7.407407407407406e-07, "loss": 0.0686, "reward": -0.18714285362511873, "reward_std": 1.3205263316631317, "rewards/correctness_reward_func": 0.2366071566939354, "rewards/int_reward_func": 0.3046875074505806, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.729553610086441, "step": 38 }, { "completion_length": 516.8214569091797, "epoch": 0.14606741573033707, "grad_norm": 4.24601411819458, "kl": 1.782470703125, "learning_rate": 7.777777777777778e-07, "loss": 0.0713, "reward": -0.1367053649155423, "reward_std": 1.3963970839977264, "rewards/correctness_reward_func": 0.3392857350409031, "rewards/int_reward_func": 0.2879464365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7639375329017639, "step": 39 }, { "completion_length": 522.3214721679688, "epoch": 0.149812734082397, "grad_norm": 17.463598251342773, "kl": Infinity, "learning_rate": 8.148148148148147e-07, "loss": 0.1737, "reward": 0.012379469349980354, "reward_std": 1.293159008026123, "rewards/correctness_reward_func": 0.3883928656578064, "rewards/int_reward_func": 0.2857142984867096, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6628438010811806, "step": 40 }, { "completion_length": 519.8214569091797, "epoch": 0.15355805243445692, "grad_norm": 17.463598251342773, "kl": Infinity, "learning_rate": 8.148148148148147e-07, "loss": 0.1806, "reward": -0.09995536971837282, "reward_std": 1.3616773188114166, "rewards/correctness_reward_func": 0.3214285857975483, "rewards/int_reward_func": 0.3035714402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7260714471340179, "step": 41 }, { "completion_length": 518.4643249511719, "epoch": 0.15730337078651685, "grad_norm": 13.264471054077148, "kl": Infinity, "learning_rate": 8.518518518518518e-07, "loss": 0.1821, "reward": -0.08204018184915185, "reward_std": 1.192356139421463, "rewards/correctness_reward_func": 0.2857143022119999, "rewards/int_reward_func": 0.3169642984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6847188025712967, "step": 42 }, { "completion_length": 522.6071624755859, "epoch": 0.16104868913857678, "grad_norm": 12.370172500610352, "kl": Infinity, "learning_rate": 8.888888888888888e-07, "loss": 0.1696, "reward": -0.16021204832941294, "reward_std": 1.268011212348938, "rewards/correctness_reward_func": 0.3080357313156128, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.718247801065445, "step": 43 }, { "completion_length": 518.0000152587891, "epoch": 0.1647940074906367, "grad_norm": 4.8801093101501465, "kl": 3.1953125, "learning_rate": 9.259259259259259e-07, "loss": 0.1278, "reward": -0.21614731661975384, "reward_std": 1.3435205817222595, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2700892947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7462812960147858, "step": 44 }, { "completion_length": 520.1786041259766, "epoch": 0.16853932584269662, "grad_norm": 3.4923696517944336, "kl": 2.48486328125, "learning_rate": 9.629629629629628e-07, "loss": 0.0994, "reward": -0.06906696478836238, "reward_std": 1.3961973786354065, "rewards/correctness_reward_func": 0.3839285895228386, "rewards/int_reward_func": 0.302455373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7554509043693542, "step": 45 }, { "completion_length": 527.8571624755859, "epoch": 0.17228464419475656, "grad_norm": 3.2191402912139893, "kl": 2.7216796875, "learning_rate": 1e-06, "loss": 0.1089, "reward": -0.0912076011300087, "reward_std": 1.3214216530323029, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.306919664144516, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6849575936794281, "step": 46 }, { "completion_length": 539.8928985595703, "epoch": 0.1760299625468165, "grad_norm": 4.932059288024902, "kl": 3.138671875, "learning_rate": 9.999571637870034e-07, "loss": 0.1255, "reward": -0.14713840186595917, "reward_std": 1.3085481524467468, "rewards/correctness_reward_func": 0.3794642984867096, "rewards/int_reward_func": 0.2857142984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8123169839382172, "step": 47 }, { "completion_length": 520.5357360839844, "epoch": 0.1797752808988764, "grad_norm": 2.9191691875457764, "kl": 3.26220703125, "learning_rate": 9.998286624877785e-07, "loss": 0.1305, "reward": -0.13023214414715767, "reward_std": 1.3650483787059784, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.2734375149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7027768045663834, "step": 48 }, { "completion_length": 523.0714569091797, "epoch": 0.18352059925093633, "grad_norm": 3.485138177871704, "kl": 3.185546875, "learning_rate": 9.996145181203615e-07, "loss": 0.1274, "reward": -0.1737388470210135, "reward_std": 1.2116670310497284, "rewards/correctness_reward_func": 0.2321428693830967, "rewards/int_reward_func": 0.2801339477300644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6860156804323196, "step": 49 }, { "completion_length": 522.857177734375, "epoch": 0.18726591760299627, "grad_norm": 3.4678406715393066, "kl": 3.61083984375, "learning_rate": 9.99314767377287e-07, "loss": 0.1445, "reward": 0.0006718746153637767, "reward_std": 1.2830963432788849, "rewards/correctness_reward_func": 0.3169642984867096, "rewards/int_reward_func": 0.2912946492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6075870618224144, "step": 50 }, { "completion_length": 520.607177734375, "epoch": 0.19101123595505617, "grad_norm": 3.153886556625366, "kl": 3.3359375, "learning_rate": 9.989294616193017e-07, "loss": 0.1334, "reward": -0.0694642961025238, "reward_std": 1.3241977095603943, "rewards/correctness_reward_func": 0.2812500074505806, "rewards/int_reward_func": 0.3091517984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6609821990132332, "step": 51 }, { "completion_length": 526.0357360839844, "epoch": 0.1947565543071161, "grad_norm": 3.5470564365386963, "kl": 3.62744140625, "learning_rate": 9.98458666866564e-07, "loss": 0.1451, "reward": -0.14027010113932192, "reward_std": 1.2881506383419037, "rewards/correctness_reward_func": 0.3660714402794838, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7741986811161041, "step": 52 }, { "completion_length": 538.3571624755859, "epoch": 0.19850187265917604, "grad_norm": 3.714517831802368, "kl": 3.9716796875, "learning_rate": 9.979024637873308e-07, "loss": 0.1589, "reward": -0.1821674220263958, "reward_std": 1.214670181274414, "rewards/correctness_reward_func": 0.3303571678698063, "rewards/int_reward_func": 0.2812500074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7937746047973633, "step": 53 }, { "completion_length": 521.3214569091797, "epoch": 0.20224719101123595, "grad_norm": 2.559316396713257, "kl": 3.90087890625, "learning_rate": 9.972609476841365e-07, "loss": 0.156, "reward": -0.08583930134773254, "reward_std": 1.2739556729793549, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.2734375074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6137410700321198, "step": 54 }, { "epoch": 0.20224719101123595, "eval_completion_length": 529.9684467516447, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.1981811836655987, "eval_reward_std": 1.2300641655921936, "eval_rewards/correctness_reward_func": 0.214567679245221, "eval_rewards/int_reward_func": 0.2699248248024991, "eval_rewards/soft_format_reward_func": 9.398496660747027e-05, "eval_rewards/strict_format_reward_func": 9.398496660747027e-05, "eval_rewards/xmlcount_reward_func": -0.6828616565779636, "eval_runtime": 2467.139, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.039, "step": 54 }, { "completion_length": 521.0357360839844, "epoch": 0.20599250936329588, "grad_norm": 7.153426170349121, "kl": 5.3447265625, "learning_rate": 9.965342284774631e-07, "loss": 0.2137, "reward": -0.24599106796085835, "reward_std": 1.428410142660141, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.3013393059372902, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8029107004404068, "step": 55 }, { "completion_length": 518.3928833007812, "epoch": 0.20973782771535582, "grad_norm": 3.757683515548706, "kl": 4.50537109375, "learning_rate": 9.957224306869053e-07, "loss": 0.1802, "reward": -0.1148928627371788, "reward_std": 1.3824162483215332, "rewards/correctness_reward_func": 0.3214285895228386, "rewards/int_reward_func": 0.2756696604192257, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7119911015033722, "step": 56 }, { "completion_length": 533.7500305175781, "epoch": 0.21348314606741572, "grad_norm": 5.380308628082275, "kl": 5.0009765625, "learning_rate": 9.948256934098351e-07, "loss": 0.2001, "reward": -0.008803579956293106, "reward_std": 1.2614071667194366, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.297991082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6338035762310028, "step": 57 }, { "completion_length": 534.8571624755859, "epoch": 0.21722846441947566, "grad_norm": 5.718860626220703, "kl": 5.1064453125, "learning_rate": 9.938441702975689e-07, "loss": 0.2042, "reward": -0.03438171138986945, "reward_std": 1.3422514498233795, "rewards/correctness_reward_func": 0.3214285895228386, "rewards/int_reward_func": 0.2901785895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6459888815879822, "step": 58 }, { "completion_length": 518.9643249511719, "epoch": 0.2209737827715356, "grad_norm": 3.3873369693756104, "kl": 4.580078125, "learning_rate": 9.927780295290389e-07, "loss": 0.1832, "reward": -0.13019196595996618, "reward_std": 1.2826339900493622, "rewards/correctness_reward_func": 0.2455357275903225, "rewards/int_reward_func": 0.2868303693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.662558063864708, "step": 59 }, { "completion_length": 536.8928833007812, "epoch": 0.2247191011235955, "grad_norm": 2.9956631660461426, "kl": 4.05908203125, "learning_rate": 9.916274537819773e-07, "loss": 0.1624, "reward": -0.17484374437481165, "reward_std": 1.2421143352985382, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6647991389036179, "step": 60 }, { "completion_length": 531.4286041259766, "epoch": 0.22846441947565543, "grad_norm": 7.34979772567749, "kl": 5.10546875, "learning_rate": 9.90392640201615e-07, "loss": 0.2042, "reward": -0.07384821801679209, "reward_std": 1.2932099103927612, "rewards/correctness_reward_func": 0.2901785895228386, "rewards/int_reward_func": 0.3046875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6698303818702698, "step": 61 }, { "completion_length": 514.9643096923828, "epoch": 0.23220973782771537, "grad_norm": 5.556492328643799, "kl": 4.26611328125, "learning_rate": 9.890738003669027e-07, "loss": 0.1706, "reward": -0.13382366113364697, "reward_std": 1.174208790063858, "rewards/correctness_reward_func": 0.20982143748551607, "rewards/int_reward_func": 0.2845982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6282433420419693, "step": 62 }, { "completion_length": 515.9643249511719, "epoch": 0.23595505617977527, "grad_norm": 3.788120985031128, "kl": 3.6513671875, "learning_rate": 9.876711602542563e-07, "loss": 0.146, "reward": -0.04057143209502101, "reward_std": 1.3794358968734741, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.294642873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6711518168449402, "step": 63 }, { "completion_length": 517.1786041259766, "epoch": 0.2397003745318352, "grad_norm": 4.478960037231445, "kl": 3.5478515625, "learning_rate": 9.861849601988383e-07, "loss": 0.1419, "reward": -0.10594867006875575, "reward_std": 1.1321979463100433, "rewards/correctness_reward_func": 0.2723214402794838, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6584040820598602, "step": 64 }, { "completion_length": 522.9643096923828, "epoch": 0.24344569288389514, "grad_norm": 3.61104416847229, "kl": 3.99169921875, "learning_rate": 9.846154548533772e-07, "loss": 0.1596, "reward": -0.13633259385824203, "reward_std": 1.269362598657608, "rewards/correctness_reward_func": 0.2723214477300644, "rewards/int_reward_func": 0.3013392984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.709993377327919, "step": 65 }, { "completion_length": 522.0000152587891, "epoch": 0.24719101123595505, "grad_norm": 8.631311416625977, "kl": 5.59375, "learning_rate": 9.82962913144534e-07, "loss": 0.2237, "reward": -0.053970987908542156, "reward_std": 1.3836995363235474, "rewards/correctness_reward_func": 0.4151785969734192, "rewards/int_reward_func": 0.2935267984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7626763731241226, "step": 66 }, { "completion_length": 518.2500152587891, "epoch": 0.250936329588015, "grad_norm": 12.603729248046875, "kl": 6.03515625, "learning_rate": 9.812276182268236e-07, "loss": 0.2414, "reward": -0.23663169657811522, "reward_std": 1.226292833685875, "rewards/correctness_reward_func": 0.2098214402794838, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7109620720148087, "step": 67 }, { "completion_length": 518.8214721679688, "epoch": 0.2546816479400749, "grad_norm": 4.896903991699219, "kl": 4.85107421875, "learning_rate": 9.794098674340966e-07, "loss": 0.194, "reward": -0.036457577865803614, "reward_std": 1.3130376636981964, "rewards/correctness_reward_func": 0.3526785895228386, "rewards/int_reward_func": 0.2834821492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6726183444261551, "step": 68 }, { "completion_length": 517.1428833007812, "epoch": 0.25842696629213485, "grad_norm": 2.8336970806121826, "kl": 4.263671875, "learning_rate": 9.775099722285934e-07, "loss": 0.1706, "reward": -0.20734822936356068, "reward_std": 1.3524066507816315, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.3002232313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7665000259876251, "step": 69 }, { "completion_length": 517.3214569091797, "epoch": 0.26217228464419473, "grad_norm": 3.12200665473938, "kl": 4.041015625, "learning_rate": 9.755282581475767e-07, "loss": 0.1616, "reward": 0.04342857655137777, "reward_std": 1.1121753752231598, "rewards/correctness_reward_func": 0.3035714365541935, "rewards/int_reward_func": 0.2790178656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.539160743355751, "step": 70 }, { "completion_length": 524.8928833007812, "epoch": 0.26591760299625467, "grad_norm": 2.3125550746917725, "kl": 3.697265625, "learning_rate": 9.73465064747553e-07, "loss": 0.1479, "reward": -0.2864709943532944, "reward_std": 1.2171598970890045, "rewards/correctness_reward_func": 0.15178572107106447, "rewards/int_reward_func": 0.2477678656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6860245764255524, "step": 71 }, { "completion_length": 526.4285888671875, "epoch": 0.2696629213483146, "grad_norm": 2.5901665687561035, "kl": 4.2919921875, "learning_rate": 9.713207455460892e-07, "loss": 0.1717, "reward": -0.0840669646859169, "reward_std": 1.3054583966732025, "rewards/correctness_reward_func": 0.3571428842842579, "rewards/int_reward_func": 0.305803582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7470134347677231, "step": 72 }, { "completion_length": 527.3571624755859, "epoch": 0.27340823970037453, "grad_norm": 2.6217572689056396, "kl": 4.8828125, "learning_rate": 9.69095667961242e-07, "loss": 0.1953, "reward": -0.07796428725123405, "reward_std": 1.1255577206611633, "rewards/correctness_reward_func": 0.2812500111758709, "rewards/int_reward_func": 0.2879464402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6471607089042664, "step": 73 }, { "completion_length": 526.9286041259766, "epoch": 0.27715355805243447, "grad_norm": 11.475034713745117, "kl": Infinity, "learning_rate": 9.667902132486008e-07, "loss": 0.2543, "reward": -0.06999553460627794, "reward_std": 1.2003649473190308, "rewards/correctness_reward_func": 0.294642873108387, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6537009179592133, "step": 74 }, { "completion_length": 517.8571624755859, "epoch": 0.2808988764044944, "grad_norm": 11.475034713745117, "kl": Infinity, "learning_rate": 9.667902132486008e-07, "loss": 0.2206, "reward": -0.1703772358596325, "reward_std": 1.2204445004463196, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.2421875074505806, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6993951201438904, "step": 75 }, { "completion_length": 522.0714569091797, "epoch": 0.2846441947565543, "grad_norm": 2.270301580429077, "kl": 5.0947265625, "learning_rate": 9.644047764359621e-07, "loss": 0.2038, "reward": -0.13376785721629858, "reward_std": 1.3083040714263916, "rewards/correctness_reward_func": 0.2901785895228386, "rewards/int_reward_func": 0.2790178656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7029643207788467, "step": 76 }, { "completion_length": 523.9286041259766, "epoch": 0.2883895131086142, "grad_norm": 3.0469226837158203, "kl": 4.9599609375, "learning_rate": 9.619397662556433e-07, "loss": 0.1984, "reward": -0.1042410780210048, "reward_std": 1.188673734664917, "rewards/correctness_reward_func": 0.2410714440047741, "rewards/int_reward_func": 0.2611607313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6064732521772385, "step": 77 }, { "completion_length": 521.5714569091797, "epoch": 0.29213483146067415, "grad_norm": 2.852745294570923, "kl": 4.5966796875, "learning_rate": 9.593956050744492e-07, "loss": 0.1839, "reward": -0.15841741440817714, "reward_std": 1.2980459928512573, "rewards/correctness_reward_func": 0.2142857275903225, "rewards/int_reward_func": 0.291294664144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6639978140592575, "step": 78 }, { "completion_length": 525.6786041259766, "epoch": 0.2958801498127341, "grad_norm": 4.637290000915527, "kl": 4.21337890625, "learning_rate": 9.567727288213004e-07, "loss": 0.1685, "reward": -0.19912277371622622, "reward_std": 1.2618965804576874, "rewards/correctness_reward_func": 0.2633928656578064, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7225602865219116, "step": 79 }, { "completion_length": 522.5357360839844, "epoch": 0.299625468164794, "grad_norm": 3.2194876670837402, "kl": 3.90625, "learning_rate": 9.540715869125407e-07, "loss": 0.1563, "reward": -0.020919647999107838, "reward_std": 1.222971111536026, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.277901791036129, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.584535762667656, "step": 80 }, { "completion_length": 529.8214416503906, "epoch": 0.30337078651685395, "grad_norm": 2.404968500137329, "kl": 3.91064453125, "learning_rate": 9.512926421749303e-07, "loss": 0.1564, "reward": -0.19220760464668274, "reward_std": 1.3635277450084686, "rewards/correctness_reward_func": 0.2812500111758709, "rewards/int_reward_func": 0.2845982313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7580558508634567, "step": 81 }, { "epoch": 0.30337078651685395, "eval_completion_length": 537.6496517783717, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.2110714678873161, "eval_reward_std": 1.246040644143757, "eval_rewards/correctness_reward_func": 0.21428572410031368, "eval_rewards/int_reward_func": 0.27151473525323366, "eval_rewards/soft_format_reward_func": 7.048872495560269e-05, "eval_rewards/strict_format_reward_func": 0.00016447369156307297, "eval_rewards/xmlcount_reward_func": -0.697106892497916, "eval_runtime": 2496.3791, "eval_samples_per_second": 0.528, "eval_steps_per_second": 0.038, "step": 81 }, { "completion_length": 525.0357513427734, "epoch": 0.30711610486891383, "grad_norm": 2.384713888168335, "kl": 5.0546875, "learning_rate": 9.484363707663441e-07, "loss": 0.2022, "reward": -0.17487278208136559, "reward_std": 1.2226494252681732, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6826852709054947, "step": 82 }, { "completion_length": 523.6785888671875, "epoch": 0.31086142322097376, "grad_norm": 4.885804653167725, "kl": 5.853515625, "learning_rate": 9.455032620941839e-07, "loss": 0.2341, "reward": -0.16699554910883307, "reward_std": 1.3849684596061707, "rewards/correctness_reward_func": 0.3125000074505806, "rewards/int_reward_func": 0.3002232238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7797188013792038, "step": 83 }, { "completion_length": 519.7500305175781, "epoch": 0.3146067415730337, "grad_norm": 5.602491855621338, "kl": 5.1337890625, "learning_rate": 9.424938187315209e-07, "loss": 0.2054, "reward": -0.13761384692043066, "reward_std": 1.31485316157341, "rewards/correctness_reward_func": 0.3437500149011612, "rewards/int_reward_func": 0.2533482275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.734712079167366, "step": 84 }, { "completion_length": 528.607177734375, "epoch": 0.31835205992509363, "grad_norm": 3.0608065128326416, "kl": 4.798828125, "learning_rate": 9.394085563309826e-07, "loss": 0.1919, "reward": -0.3015223369002342, "reward_std": 1.3573426604270935, "rewards/correctness_reward_func": 0.20089286798611283, "rewards/int_reward_func": 0.2756696529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.779200941324234, "step": 85 }, { "completion_length": 520.4286041259766, "epoch": 0.32209737827715357, "grad_norm": 3.1792619228363037, "kl": 4.05029296875, "learning_rate": 9.362480035363985e-07, "loss": 0.162, "reward": -0.09921875572763383, "reward_std": 1.2572968900203705, "rewards/correctness_reward_func": 0.2276785783469677, "rewards/int_reward_func": 0.302455373108387, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6304687634110451, "step": 86 }, { "completion_length": 524.8928833007812, "epoch": 0.3258426966292135, "grad_norm": 2.641871690750122, "kl": 4.3486328125, "learning_rate": 9.330127018922193e-07, "loss": 0.1739, "reward": -0.1489598285406828, "reward_std": 1.2619026154279709, "rewards/correctness_reward_func": 0.2812500223517418, "rewards/int_reward_func": 0.282366082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7125759124755859, "step": 87 }, { "completion_length": 515.8571624755859, "epoch": 0.3295880149812734, "grad_norm": 2.3113279342651367, "kl": 4.58251953125, "learning_rate": 9.297032057507264e-07, "loss": 0.1833, "reward": -0.15345759969204664, "reward_std": 1.2697344720363617, "rewards/correctness_reward_func": 0.330357164144516, "rewards/int_reward_func": 0.2444196529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7282344102859497, "step": 88 }, { "completion_length": 522.4643096923828, "epoch": 0.3333333333333333, "grad_norm": 2.758471727371216, "kl": 5.03125, "learning_rate": 9.26320082177046e-07, "loss": 0.2012, "reward": -0.03455581283196807, "reward_std": 1.2184255123138428, "rewards/correctness_reward_func": 0.3839285969734192, "rewards/int_reward_func": 0.3046875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0022321429569274187, "rewards/xmlcount_reward_func": -0.7254040837287903, "step": 89 }, { "completion_length": 515.5000305175781, "epoch": 0.33707865168539325, "grad_norm": 2.942793130874634, "kl": 4.84375, "learning_rate": 9.228639108519866e-07, "loss": 0.1937, "reward": -0.12176339235156775, "reward_std": 1.2805460095405579, "rewards/correctness_reward_func": 0.227678582072258, "rewards/int_reward_func": 0.282366082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6329241469502449, "step": 90 }, { "completion_length": 518.8571624755859, "epoch": 0.3408239700374532, "grad_norm": 2.8008244037628174, "kl": 5.3193359375, "learning_rate": 9.19335283972712e-07, "loss": 0.2127, "reward": -0.05108258547261357, "reward_std": 1.26492041349411, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.2745535857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6604576334357262, "step": 91 }, { "completion_length": 517.5714569091797, "epoch": 0.3445692883895131, "grad_norm": 3.959707498550415, "kl": 5.1396484375, "learning_rate": 9.157348061512726e-07, "loss": 0.2055, "reward": -0.3041093908250332, "reward_std": 1.284685343503952, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.279017873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8331272155046463, "step": 92 }, { "completion_length": 525.4643096923828, "epoch": 0.34831460674157305, "grad_norm": 2.2119061946868896, "kl": 4.5244140625, "learning_rate": 9.120630943110077e-07, "loss": 0.181, "reward": -0.16022099647670984, "reward_std": 1.251663863658905, "rewards/correctness_reward_func": 0.2589285895228386, "rewards/int_reward_func": 0.2700892984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6903549581766129, "step": 93 }, { "completion_length": 520.1071929931641, "epoch": 0.352059925093633, "grad_norm": 2.263819456100464, "kl": 4.20166015625, "learning_rate": 9.083207775808394e-07, "loss": 0.1681, "reward": -0.20089509710669518, "reward_std": 1.3503172099590302, "rewards/correctness_reward_func": 0.2901785857975483, "rewards/int_reward_func": 0.2723214328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7645111978054047, "step": 94 }, { "completion_length": 516.607177734375, "epoch": 0.35580524344569286, "grad_norm": 2.402242422103882, "kl": 4.662109375, "learning_rate": 9.045084971874737e-07, "loss": 0.1865, "reward": -0.2237209901213646, "reward_std": 1.2929619252681732, "rewards/correctness_reward_func": 0.2857143022119999, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7784085273742676, "step": 95 }, { "completion_length": 524.4285888671875, "epoch": 0.3595505617977528, "grad_norm": 2.300976276397705, "kl": 5.189453125, "learning_rate": 9.006269063455302e-07, "loss": 0.2076, "reward": -0.05375669337809086, "reward_std": 1.2477286458015442, "rewards/correctness_reward_func": 0.3080357313156128, "rewards/int_reward_func": 0.2979910857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6597835123538971, "step": 96 }, { "completion_length": 525.0357360839844, "epoch": 0.36329588014981273, "grad_norm": 2.4251925945281982, "kl": 4.642578125, "learning_rate": 8.966766701456176e-07, "loss": 0.1857, "reward": -0.1464910740032792, "reward_std": 1.3847712874412537, "rewards/correctness_reward_func": 0.3526785895228386, "rewards/int_reward_func": 0.294642873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7938125729560852, "step": 97 }, { "completion_length": 525.0000305175781, "epoch": 0.36704119850187267, "grad_norm": 6.032417297363281, "kl": 4.716796875, "learning_rate": 8.926584654403724e-07, "loss": 0.1886, "reward": -0.1682098340243101, "reward_std": 1.2604454159736633, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.2968750149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7686562836170197, "step": 98 }, { "completion_length": 516.9643249511719, "epoch": 0.3707865168539326, "grad_norm": 2.159947633743286, "kl": 4.19580078125, "learning_rate": 8.885729807284854e-07, "loss": 0.1678, "reward": -0.15086161345243454, "reward_std": 1.2245171070098877, "rewards/correctness_reward_func": 0.227678582072258, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6586741358041763, "step": 99 }, { "completion_length": 524.7857360839844, "epoch": 0.37453183520599254, "grad_norm": 2.146036148071289, "kl": 4.57275390625, "learning_rate": 8.844209160367298e-07, "loss": 0.1829, "reward": -0.24398661218583584, "reward_std": 1.3106769621372223, "rewards/correctness_reward_func": 0.2901785895228386, "rewards/int_reward_func": 0.2968750149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.831040233373642, "step": 100 }, { "completion_length": 516.7857360839844, "epoch": 0.3782771535580524, "grad_norm": 2.4187943935394287, "kl": 5.0068359375, "learning_rate": 8.802029828000155e-07, "loss": 0.2003, "reward": -0.10016518598422408, "reward_std": 1.3009901344776154, "rewards/correctness_reward_func": 0.3035714440047741, "rewards/int_reward_func": 0.279017873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6827544867992401, "step": 101 }, { "completion_length": 516.1428833007812, "epoch": 0.38202247191011235, "grad_norm": 3.7174782752990723, "kl": 5.791015625, "learning_rate": 8.759199037394886e-07, "loss": 0.2317, "reward": -0.10657813027501106, "reward_std": 1.2493732273578644, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.275669664144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6813549548387527, "step": 102 }, { "completion_length": 520.2143096923828, "epoch": 0.3857677902621723, "grad_norm": 9.145467758178711, "kl": Infinity, "learning_rate": 8.71572412738697e-07, "loss": 0.2629, "reward": -0.08905134350061417, "reward_std": 1.1978721618652344, "rewards/correctness_reward_func": 0.2678571529686451, "rewards/int_reward_func": 0.2578125111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6147210150957108, "step": 103 }, { "completion_length": 514.3928833007812, "epoch": 0.3895131086142322, "grad_norm": 5.3565168380737305, "kl": 5.62890625, "learning_rate": 8.671612547178427e-07, "loss": 0.2251, "reward": -0.10154464282095432, "reward_std": 1.2342998683452606, "rewards/correctness_reward_func": 0.263392873108387, "rewards/int_reward_func": 0.2879464402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.652883917093277, "step": 104 }, { "completion_length": 527.7857360839844, "epoch": 0.39325842696629215, "grad_norm": 2.9622256755828857, "kl": 5.0244140625, "learning_rate": 8.626871855061437e-07, "loss": 0.201, "reward": -0.029102683067321777, "reward_std": 1.1962931752204895, "rewards/correctness_reward_func": 0.2723214402794838, "rewards/int_reward_func": 0.2779017984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5793259218335152, "step": 105 }, { "completion_length": 530.6071624755859, "epoch": 0.3970037453183521, "grad_norm": 2.1288747787475586, "kl": 4.4072265625, "learning_rate": 8.581509717123272e-07, "loss": 0.1763, "reward": -0.10718304198235273, "reward_std": 1.2426259815692902, "rewards/correctness_reward_func": 0.3571428805589676, "rewards/int_reward_func": 0.3035714402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.767897367477417, "step": 106 }, { "completion_length": 529.8571624755859, "epoch": 0.40074906367041196, "grad_norm": 2.16975474357605, "kl": 4.44921875, "learning_rate": 8.535533905932737e-07, "loss": 0.178, "reward": -0.08131696470081806, "reward_std": 1.2708694338798523, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.2968750149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7040848881006241, "step": 107 }, { "completion_length": 519.9643096923828, "epoch": 0.4044943820224719, "grad_norm": 2.3932907581329346, "kl": 4.525390625, "learning_rate": 8.488952299208401e-07, "loss": 0.181, "reward": -0.09825893118977547, "reward_std": 1.1982493996620178, "rewards/correctness_reward_func": 0.2723214477300644, "rewards/int_reward_func": 0.2845982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6551786065101624, "step": 108 }, { "epoch": 0.4044943820224719, "eval_completion_length": 531.9955142372532, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.18952424128258014, "eval_reward_std": 1.2504213703306097, "eval_rewards/correctness_reward_func": 0.2154135453073602, "eval_rewards/int_reward_func": 0.27948779385340844, "eval_rewards/soft_format_reward_func": 7.048872495560269e-05, "eval_rewards/strict_format_reward_func": 9.398496660747027e-05, "eval_rewards/xmlcount_reward_func": -0.6845900519898064, "eval_runtime": 2483.5009, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.038, "step": 108 }, { "completion_length": 519.8214569091797, "epoch": 0.40823970037453183, "grad_norm": 2.8793561458587646, "kl": 4.71240234375, "learning_rate": 8.441772878468769e-07, "loss": 0.1885, "reward": -0.26077233999967575, "reward_std": 1.3895916938781738, "rewards/correctness_reward_func": 0.2901785857975483, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8188081234693527, "step": 109 }, { "completion_length": 527.8928833007812, "epoch": 0.41198501872659177, "grad_norm": 2.0183770656585693, "kl": 4.36474609375, "learning_rate": 8.394003727664709e-07, "loss": 0.1746, "reward": -0.0459575979039073, "reward_std": 1.1931805312633514, "rewards/correctness_reward_func": 0.3437500149011612, "rewards/int_reward_func": 0.290178582072258, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.681002289056778, "step": 110 }, { "completion_length": 518.0714416503906, "epoch": 0.4157303370786517, "grad_norm": 2.79196834564209, "kl": 5.20703125, "learning_rate": 8.34565303179429e-07, "loss": 0.2083, "reward": 0.009531241841614246, "reward_std": 1.1548882126808167, "rewards/correctness_reward_func": 0.27678573317825794, "rewards/int_reward_func": 0.2901785783469677, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5585491135716438, "step": 111 }, { "completion_length": 526.0714569091797, "epoch": 0.41947565543071164, "grad_norm": 5.046426773071289, "kl": 5.162109375, "learning_rate": 8.296729075500343e-07, "loss": 0.2065, "reward": -0.1916651837527752, "reward_std": 1.2752774059772491, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.314732164144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7742545157670975, "step": 112 }, { "completion_length": 540.5714416503906, "epoch": 0.4232209737827715, "grad_norm": 2.5626397132873535, "kl": 4.7333984375, "learning_rate": 8.247240241650917e-07, "loss": 0.1893, "reward": -0.046348221600055695, "reward_std": 1.2709387838840485, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.2723214365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6534911245107651, "step": 113 }, { "completion_length": 518.4643096923828, "epoch": 0.42696629213483145, "grad_norm": 2.389897584915161, "kl": 4.4033203125, "learning_rate": 8.197195009902923e-07, "loss": 0.1761, "reward": -0.08783036330714822, "reward_std": 1.316321700811386, "rewards/correctness_reward_func": 0.4107142984867096, "rewards/int_reward_func": 0.3024553656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8010000437498093, "step": 114 }, { "completion_length": 522.5000305175781, "epoch": 0.4307116104868914, "grad_norm": 2.389897584915161, "kl": Infinity, "learning_rate": 8.197195009902923e-07, "loss": 0.1755, "reward": -0.08958705811528489, "reward_std": 1.2088698744773865, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6799888461828232, "step": 115 }, { "completion_length": 515.6071624755859, "epoch": 0.4344569288389513, "grad_norm": 1.8981910943984985, "kl": 4.796875, "learning_rate": 8.146601955249187e-07, "loss": 0.1919, "reward": -0.08243527729064226, "reward_std": 1.3043743073940277, "rewards/correctness_reward_func": 0.2946428656578064, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6482834964990616, "step": 116 }, { "completion_length": 522.7500305175781, "epoch": 0.43820224719101125, "grad_norm": 2.014021873474121, "kl": 4.29296875, "learning_rate": 8.095469746549171e-07, "loss": 0.1717, "reward": -0.06306472327560186, "reward_std": 1.194327026605606, "rewards/correctness_reward_func": 0.2633928656578064, "rewards/int_reward_func": 0.2857142947614193, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6132879704236984, "step": 117 }, { "completion_length": 530.6428833007812, "epoch": 0.4419475655430712, "grad_norm": 2.255415439605713, "kl": 5.0439453125, "learning_rate": 8.043807145043603e-07, "loss": 0.2018, "reward": -0.22622546181082726, "reward_std": 1.2605921626091003, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6893951445817947, "step": 118 }, { "completion_length": 524.8571624755859, "epoch": 0.44569288389513106, "grad_norm": 2.743229627609253, "kl": 5.2587890625, "learning_rate": 7.991623002853294e-07, "loss": 0.2104, "reward": -0.18948884680867195, "reward_std": 1.2621938586235046, "rewards/correctness_reward_func": 0.2142857238650322, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6716317236423492, "step": 119 }, { "completion_length": 517.5000305175781, "epoch": 0.449438202247191, "grad_norm": 3.164485454559326, "kl": 5.5810546875, "learning_rate": 7.938926261462365e-07, "loss": 0.2232, "reward": -0.21733705699443817, "reward_std": 1.231595516204834, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.3058035895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.741890624165535, "step": 120 }, { "completion_length": 530.857177734375, "epoch": 0.45318352059925093, "grad_norm": 3.164485454559326, "kl": Infinity, "learning_rate": 7.938926261462365e-07, "loss": 0.234, "reward": -0.19310491904616356, "reward_std": 1.3157844841480255, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7321674823760986, "step": 121 }, { "completion_length": 517.7143249511719, "epoch": 0.45692883895131087, "grad_norm": 2.4260902404785156, "kl": 5.0986328125, "learning_rate": 7.885725950186168e-07, "loss": 0.2039, "reward": -0.11468750424683094, "reward_std": 1.3093117475509644, "rewards/correctness_reward_func": 0.3392857350409031, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7374553978443146, "step": 122 }, { "completion_length": 522.8571624755859, "epoch": 0.4606741573033708, "grad_norm": 1.6100739240646362, "kl": 4.80859375, "learning_rate": 7.832031184624164e-07, "loss": 0.1923, "reward": -0.1980089396238327, "reward_std": 1.338472455739975, "rewards/correctness_reward_func": 0.2723214402794838, "rewards/int_reward_func": 0.2790178656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7504643201828003, "step": 123 }, { "completion_length": 526.2857360839844, "epoch": 0.46441947565543074, "grad_norm": 1.831390380859375, "kl": 4.9580078125, "learning_rate": 7.777851165098011e-07, "loss": 0.1983, "reward": -0.10124107450246811, "reward_std": 1.3591813445091248, "rewards/correctness_reward_func": 0.3169643022119999, "rewards/int_reward_func": 0.2857143059372902, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7039196640253067, "step": 124 }, { "completion_length": 525.1428833007812, "epoch": 0.4681647940074906, "grad_norm": 1.9850410223007202, "kl": 4.2109375, "learning_rate": 7.723195175075135e-07, "loss": 0.1684, "reward": -0.10687501262873411, "reward_std": 1.1036955416202545, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.2500000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.624732181429863, "step": 125 }, { "completion_length": 515.8571624755859, "epoch": 0.47191011235955055, "grad_norm": 2.7980728149414062, "kl": 4.34130859375, "learning_rate": 7.668072579578058e-07, "loss": 0.1737, "reward": -0.22420312091708183, "reward_std": 1.3303392231464386, "rewards/correctness_reward_func": 0.24107144214212894, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7565692067146301, "step": 126 }, { "completion_length": 515.3928833007812, "epoch": 0.4756554307116105, "grad_norm": 1.5911880731582642, "kl": 5.1962890625, "learning_rate": 7.612492823579744e-07, "loss": 0.2078, "reward": -0.11461607180535793, "reward_std": 1.3034924268722534, "rewards/correctness_reward_func": 0.3303571566939354, "rewards/int_reward_func": 0.2968750074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7418482303619385, "step": 127 }, { "completion_length": 523.8928680419922, "epoch": 0.4794007490636704, "grad_norm": 2.3929038047790527, "kl": 5.15625, "learning_rate": 7.556465430385259e-07, "loss": 0.2062, "reward": -0.04959822096861899, "reward_std": 1.3580511510372162, "rewards/correctness_reward_func": 0.3482143059372902, "rewards/int_reward_func": 0.2957589440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6935714930295944, "step": 128 }, { "completion_length": 514.5357513427734, "epoch": 0.48314606741573035, "grad_norm": 2.6407032012939453, "kl": 4.9072265625, "learning_rate": 7.5e-07, "loss": 0.1963, "reward": -0.11345758475363255, "reward_std": 1.2759953737258911, "rewards/correctness_reward_func": 0.2946428656578064, "rewards/int_reward_func": 0.2845982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6926987171173096, "step": 129 }, { "completion_length": 515.6428985595703, "epoch": 0.4868913857677903, "grad_norm": 1.820491909980774, "kl": 4.42724609375, "learning_rate": 7.443106207484775e-07, "loss": 0.1771, "reward": -0.08515847939997911, "reward_std": 1.2134381830692291, "rewards/correctness_reward_func": 0.2946428693830967, "rewards/int_reward_func": 0.2656250074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6454263776540756, "step": 130 }, { "completion_length": 518.107177734375, "epoch": 0.49063670411985016, "grad_norm": 3.0650475025177, "kl": 4.8212890625, "learning_rate": 7.385793801298042e-07, "loss": 0.1929, "reward": -0.18281920308072586, "reward_std": 1.3228217363357544, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.294642873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.731926366686821, "step": 131 }, { "completion_length": 516.1786041259766, "epoch": 0.4943820224719101, "grad_norm": 2.8371877670288086, "kl": 5.048828125, "learning_rate": 7.328072601625557e-07, "loss": 0.202, "reward": -0.03922321368008852, "reward_std": 1.0432914346456528, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.540339283645153, "step": 132 }, { "completion_length": 530.3571624755859, "epoch": 0.49812734082397003, "grad_norm": 2.9099578857421875, "kl": 4.8974609375, "learning_rate": 7.269952498697734e-07, "loss": 0.1959, "reward": -0.1908794827759266, "reward_std": 1.1966271996498108, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2767857238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6417723447084427, "step": 133 }, { "completion_length": 537.1786193847656, "epoch": 0.50187265917603, "grad_norm": 1.9121432304382324, "kl": 4.748046875, "learning_rate": 7.211443451095006e-07, "loss": 0.1899, "reward": -0.07099777227267623, "reward_std": 1.2925509214401245, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.282366082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6524710208177567, "step": 134 }, { "completion_length": 531.8214569091797, "epoch": 0.5056179775280899, "grad_norm": 2.4540979862213135, "kl": 5.0126953125, "learning_rate": 7.152555484041475e-07, "loss": 0.2005, "reward": -0.19015179947018623, "reward_std": 1.273418515920639, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.2823660895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7225178927183151, "step": 135 }, { "epoch": 0.5056179775280899, "eval_completion_length": 534.5113062808388, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.1988269823652349, "eval_reward_std": 1.2561254689567967, "eval_rewards/correctness_reward_func": 0.2170739448384235, "eval_rewards/int_reward_func": 0.2772791479763232, "eval_rewards/soft_format_reward_func": 4.6992483303735133e-05, "eval_rewards/strict_format_reward_func": 2.3496241651867567e-05, "eval_rewards/xmlcount_reward_func": -0.6932505579371201, "eval_runtime": 2478.2667, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.038, "step": 135 }, { "completion_length": 517.3571472167969, "epoch": 0.5093632958801498, "grad_norm": 2.260101079940796, "kl": 4.990234375, "learning_rate": 7.09329868768714e-07, "loss": 0.1996, "reward": -0.14810045702324715, "reward_std": 1.2247705459594727, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.2812500149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6715379506349564, "step": 136 }, { "completion_length": 520.4286041259766, "epoch": 0.5131086142322098, "grad_norm": 1.816656231880188, "kl": 5.271484375, "learning_rate": 7.033683215379002e-07, "loss": 0.2109, "reward": -0.2372276969254017, "reward_std": 1.2999887466430664, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.2745535895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7628973424434662, "step": 137 }, { "completion_length": 516.1428833007812, "epoch": 0.5168539325842697, "grad_norm": 3.60597825050354, "kl": 5.40625, "learning_rate": 6.973719281921336e-07, "loss": 0.2163, "reward": -0.1252053566277027, "reward_std": 1.2438539862632751, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.2812500111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7100268006324768, "step": 138 }, { "completion_length": 518.8928833007812, "epoch": 0.5205992509363296, "grad_norm": 2.287635326385498, "kl": 5.2548828125, "learning_rate": 6.913417161825449e-07, "loss": 0.2102, "reward": -0.1885022409260273, "reward_std": 1.2898488640785217, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2879464402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7588147670030594, "step": 139 }, { "completion_length": 516.5357360839844, "epoch": 0.5243445692883895, "grad_norm": 1.85598623752594, "kl": 4.8173828125, "learning_rate": 6.852787187549181e-07, "loss": 0.1926, "reward": -0.08186161960475147, "reward_std": 1.207483857870102, "rewards/correctness_reward_func": 0.2946428693830967, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6432455480098724, "step": 140 }, { "completion_length": 519.7500305175781, "epoch": 0.5280898876404494, "grad_norm": 2.6306166648864746, "kl": 4.4951171875, "learning_rate": 6.7918397477265e-07, "loss": 0.1798, "reward": -0.12313839327543974, "reward_std": 1.155171424150467, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2656250149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6119777262210846, "step": 141 }, { "completion_length": 516.3928833007812, "epoch": 0.5318352059925093, "grad_norm": 2.6810405254364014, "kl": 5.0771484375, "learning_rate": 6.730585285387465e-07, "loss": 0.2031, "reward": -0.022080346941947937, "reward_std": 1.2572471797466278, "rewards/correctness_reward_func": 0.3660714402794838, "rewards/int_reward_func": 0.2979910895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6861428916454315, "step": 142 }, { "completion_length": 522.3214569091797, "epoch": 0.5355805243445693, "grad_norm": 3.570694923400879, "kl": 4.92724609375, "learning_rate": 6.669034296168854e-07, "loss": 0.1971, "reward": -0.18164288252592087, "reward_std": 1.2775076925754547, "rewards/correctness_reward_func": 0.3035714477300644, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7519554197788239, "step": 143 }, { "completion_length": 533.3571624755859, "epoch": 0.5393258426966292, "grad_norm": 1.905169129371643, "kl": 4.7041015625, "learning_rate": 6.607197326515807e-07, "loss": 0.1881, "reward": -0.1941227694042027, "reward_std": 1.394089788198471, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.2600446492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7800603359937668, "step": 144 }, { "completion_length": 533.7500152587891, "epoch": 0.5430711610486891, "grad_norm": 1.7941057682037354, "kl": 4.5029296875, "learning_rate": 6.545084971874736e-07, "loss": 0.1801, "reward": -0.1683393083512783, "reward_std": 1.3101599216461182, "rewards/correctness_reward_func": 0.28571430779993534, "rewards/int_reward_func": 0.274553582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.728607177734375, "step": 145 }, { "completion_length": 521.607177734375, "epoch": 0.5468164794007491, "grad_norm": 2.9071526527404785, "kl": 4.875, "learning_rate": 6.482707874877854e-07, "loss": 0.1949, "reward": -0.04411383857950568, "reward_std": 1.3005799651145935, "rewards/correctness_reward_func": 0.3169642984867096, "rewards/int_reward_func": 0.3035714440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6646495461463928, "step": 146 }, { "completion_length": 544.9643096923828, "epoch": 0.550561797752809, "grad_norm": 2.726607322692871, "kl": Infinity, "learning_rate": 6.420076723519614e-07, "loss": 0.22, "reward": -0.0854665283113718, "reward_std": 1.307901293039322, "rewards/correctness_reward_func": 0.401785746216774, "rewards/int_reward_func": 0.277901791036129, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7662701457738876, "step": 147 }, { "completion_length": 532.1428833007812, "epoch": 0.5543071161048689, "grad_norm": 2.9170711040496826, "kl": 5.365234375, "learning_rate": 6.357202249325371e-07, "loss": 0.2146, "reward": -0.1308973114937544, "reward_std": 1.3350401818752289, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.2790178656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7134866267442703, "step": 148 }, { "completion_length": 518.2857360839844, "epoch": 0.5580524344569289, "grad_norm": 2.2603960037231445, "kl": 5.1748046875, "learning_rate": 6.294095225512604e-07, "loss": 0.207, "reward": -0.18937501218169928, "reward_std": 1.3142652213573456, "rewards/correctness_reward_func": 0.2767857238650322, "rewards/int_reward_func": 0.305803582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7719643265008926, "step": 149 }, { "completion_length": 531.0000457763672, "epoch": 0.5617977528089888, "grad_norm": 2.677884578704834, "kl": 5.2392578125, "learning_rate": 6.230766465144965e-07, "loss": 0.2096, "reward": -0.1788214324042201, "reward_std": 1.2578088641166687, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2667410857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.69221431016922, "step": 150 }, { "completion_length": 523.357177734375, "epoch": 0.5655430711610487, "grad_norm": 2.8838837146759033, "kl": 5.416015625, "learning_rate": 6.167226819279527e-07, "loss": 0.2166, "reward": -0.17594197671860456, "reward_std": 1.438678652048111, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.305803582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8076384514570236, "step": 151 }, { "completion_length": 528.3571624755859, "epoch": 0.5692883895131086, "grad_norm": 4.809299945831299, "kl": 5.3623046875, "learning_rate": 6.103487175107507e-07, "loss": 0.2145, "reward": -0.18464063061401248, "reward_std": 1.3211232721805573, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.314732164144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8029441684484482, "step": 152 }, { "completion_length": 536.7500457763672, "epoch": 0.5730337078651685, "grad_norm": 1.6935267448425293, "kl": 4.90625, "learning_rate": 6.039558454088795e-07, "loss": 0.1962, "reward": -0.1592745715752244, "reward_std": 1.2062239050865173, "rewards/correctness_reward_func": 0.28125001955777407, "rewards/int_reward_func": 0.3002232238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7407478094100952, "step": 153 }, { "completion_length": 515.2500152587891, "epoch": 0.5767790262172284, "grad_norm": 2.8796603679656982, "kl": 4.6162109375, "learning_rate": 5.975451610080642e-07, "loss": 0.1846, "reward": -0.14308037795126438, "reward_std": 1.2807807624340057, "rewards/correctness_reward_func": 0.2767857238650322, "rewards/int_reward_func": 0.2656250149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6854911148548126, "step": 154 }, { "completion_length": 518.0714569091797, "epoch": 0.5805243445692884, "grad_norm": 2.875056505203247, "kl": 4.8203125, "learning_rate": 5.911177627460738e-07, "loss": 0.1928, "reward": -0.14478794857859612, "reward_std": 1.3489331901073456, "rewards/correctness_reward_func": 0.316964291036129, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7485826313495636, "step": 155 }, { "completion_length": 526.2143096923828, "epoch": 0.5842696629213483, "grad_norm": 2.2693309783935547, "kl": 5.466796875, "learning_rate": 5.846747519245122e-07, "loss": 0.2187, "reward": -0.18839732877677307, "reward_std": 1.3681853413581848, "rewards/correctness_reward_func": 0.3437500223517418, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8189777284860611, "step": 156 }, { "completion_length": 518.7143096923828, "epoch": 0.5880149812734082, "grad_norm": 3.1548948287963867, "kl": 5.4326171875, "learning_rate": 5.782172325201155e-07, "loss": 0.2173, "reward": -0.11439956957474351, "reward_std": 1.17717644572258, "rewards/correctness_reward_func": 0.294642873108387, "rewards/int_reward_func": 0.2767857238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6858281642198563, "step": 157 }, { "completion_length": 528.7143096923828, "epoch": 0.5917602996254682, "grad_norm": 2.895555019378662, "kl": 5.6181640625, "learning_rate": 5.717463109955895e-07, "loss": 0.2247, "reward": -0.08654686715453863, "reward_std": 1.2630547881126404, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6345379799604416, "step": 158 }, { "completion_length": 519.7500305175781, "epoch": 0.5955056179775281, "grad_norm": 2.325378894805908, "kl": 5.177734375, "learning_rate": 5.652630961100258e-07, "loss": 0.2071, "reward": -0.05345983011648059, "reward_std": 1.290926218032837, "rewards/correctness_reward_func": 0.4107143133878708, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7476563155651093, "step": 159 }, { "completion_length": 518.357177734375, "epoch": 0.599250936329588, "grad_norm": 2.012712240219116, "kl": 5.064453125, "learning_rate": 5.587686987289189e-07, "loss": 0.2026, "reward": -0.20887724682688713, "reward_std": 1.2406348288059235, "rewards/correctness_reward_func": 0.227678582072258, "rewards/int_reward_func": 0.2767857164144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.713341549038887, "step": 160 }, { "completion_length": 525.3928833007812, "epoch": 0.602996254681648, "grad_norm": 2.0301644802093506, "kl": 4.5810546875, "learning_rate": 5.522642316338268e-07, "loss": 0.1832, "reward": -0.15497322753071785, "reward_std": 1.2159148454666138, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2723214402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.651625007390976, "step": 161 }, { "completion_length": 525.3571472167969, "epoch": 0.6067415730337079, "grad_norm": 2.222111463546753, "kl": 4.67431640625, "learning_rate": 5.457508093317013e-07, "loss": 0.1869, "reward": -0.09700893424451351, "reward_std": 1.1505461633205414, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5568303763866425, "step": 162 }, { "epoch": 0.6067415730337079, "eval_completion_length": 531.0331080386513, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.19856291947102075, "eval_reward_std": 1.2412375688552857, "eval_rewards/correctness_reward_func": 0.2193609140421215, "eval_rewards/int_reward_func": 0.27259556368777627, "eval_rewards/soft_format_reward_func": 0.00018796993321494053, "eval_rewards/strict_format_reward_func": 0.00011748120825933783, "eval_rewards/xmlcount_reward_func": -0.6908248430804202, "eval_runtime": 2481.1198, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.038, "step": 162 }, { "completion_length": 521.6428680419922, "epoch": 0.6104868913857678, "grad_norm": 1.9306232929229736, "kl": 4.5283203125, "learning_rate": 5.392295478639225e-07, "loss": 0.1812, "reward": -0.2653035670518875, "reward_std": 1.287449061870575, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7720000445842743, "step": 163 }, { "completion_length": 520.1428833007812, "epoch": 0.6142322097378277, "grad_norm": 2.5203397274017334, "kl": 5.091796875, "learning_rate": 5.327015646150716e-07, "loss": 0.2037, "reward": 0.042263399343937635, "reward_std": 1.2937269806861877, "rewards/correctness_reward_func": 0.3437500149011612, "rewards/int_reward_func": 0.2924107313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.593897320330143, "step": 164 }, { "completion_length": 519.8214569091797, "epoch": 0.6179775280898876, "grad_norm": 1.5657882690429688, "kl": 4.630859375, "learning_rate": 5.26167978121472e-07, "loss": 0.1852, "reward": -0.22304466180503368, "reward_std": 1.2659655511379242, "rewards/correctness_reward_func": 0.2633928768336773, "rewards/int_reward_func": 0.2767857238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7632232308387756, "step": 165 }, { "completion_length": 524.0357360839844, "epoch": 0.6217228464419475, "grad_norm": 2.093134880065918, "kl": 4.900390625, "learning_rate": 5.196299078795343e-07, "loss": 0.196, "reward": -0.27018305100500584, "reward_std": 1.325235664844513, "rewards/correctness_reward_func": 0.2723214365541935, "rewards/int_reward_func": 0.2845982313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8271027356386185, "step": 166 }, { "completion_length": 524.6428833007812, "epoch": 0.6254681647940075, "grad_norm": 2.1078877449035645, "kl": 5.291015625, "learning_rate": 5.130884741539366e-07, "loss": 0.2116, "reward": -0.12524107471108437, "reward_std": 1.3548560738563538, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.3191964402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.743544690310955, "step": 167 }, { "completion_length": 520.4643096923828, "epoch": 0.6292134831460674, "grad_norm": 3.086167812347412, "kl": 5.5380859375, "learning_rate": 5.065447977856722e-07, "loss": 0.2215, "reward": -0.21059376862831414, "reward_std": 1.2520826160907745, "rewards/correctness_reward_func": 0.21428572107106447, "rewards/int_reward_func": 0.2656250111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6905045062303543, "step": 168 }, { "completion_length": 526.7143096923828, "epoch": 0.6329588014981273, "grad_norm": 1.7510619163513184, "kl": 5.1826171875, "learning_rate": 5e-07, "loss": 0.2073, "reward": -0.1397678554058075, "reward_std": 1.1866637468338013, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2734375111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6364196613430977, "step": 169 }, { "completion_length": 514.8571624755859, "epoch": 0.6367041198501873, "grad_norm": 2.065687894821167, "kl": 4.8115234375, "learning_rate": 4.934552022143279e-07, "loss": 0.1925, "reward": -0.3015558011829853, "reward_std": 1.4401615262031555, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.2801339477300644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8807969093322754, "step": 170 }, { "completion_length": 517.0357208251953, "epoch": 0.6404494382022472, "grad_norm": 2.2018256187438965, "kl": 4.5322265625, "learning_rate": 4.869115258460634e-07, "loss": 0.1813, "reward": -0.06244420446455479, "reward_std": 1.273977816104889, "rewards/correctness_reward_func": 0.294642873108387, "rewards/int_reward_func": 0.2756696529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.632756732404232, "step": 171 }, { "completion_length": 531.3214569091797, "epoch": 0.6441947565543071, "grad_norm": 2.4541852474212646, "kl": 4.6171875, "learning_rate": 4.803700921204658e-07, "loss": 0.1847, "reward": 0.0766785740852356, "reward_std": 1.2054692655801773, "rewards/correctness_reward_func": 0.3526785895228386, "rewards/int_reward_func": 0.2991071566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5751071497797966, "step": 172 }, { "completion_length": 526.0714569091797, "epoch": 0.6479400749063671, "grad_norm": 2.2811288833618164, "kl": 4.4521484375, "learning_rate": 4.7383202187852804e-07, "loss": 0.1781, "reward": -0.08707589283585548, "reward_std": 1.3148585855960846, "rewards/correctness_reward_func": 0.3705357387661934, "rewards/int_reward_func": 0.3046875074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.762299120426178, "step": 173 }, { "completion_length": 517.8214416503906, "epoch": 0.651685393258427, "grad_norm": 5.915732383728027, "kl": 5.205078125, "learning_rate": 4.672984353849284e-07, "loss": 0.2081, "reward": -0.024636156857013702, "reward_std": 1.2718493938446045, "rewards/correctness_reward_func": 0.4107143096625805, "rewards/int_reward_func": 0.3125000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7478504627943039, "step": 174 }, { "completion_length": 517.0714569091797, "epoch": 0.6554307116104869, "grad_norm": 2.853475332260132, "kl": 5.4931640625, "learning_rate": 4.6077045213607755e-07, "loss": 0.2197, "reward": -0.31054020673036575, "reward_std": 1.394624948501587, "rewards/correctness_reward_func": 0.2455357201397419, "rewards/int_reward_func": 0.2979910895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8540670275688171, "step": 175 }, { "completion_length": 525.5714569091797, "epoch": 0.6591760299625468, "grad_norm": 1.7146692276000977, "kl": 4.740234375, "learning_rate": 4.542491906682988e-07, "loss": 0.1896, "reward": -0.13947322126477957, "reward_std": 1.1648866534233093, "rewards/correctness_reward_func": 0.2053571566939354, "rewards/int_reward_func": 0.2566964328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6015268266201019, "step": 176 }, { "completion_length": 521.2143249511719, "epoch": 0.6629213483146067, "grad_norm": 1.6346399784088135, "kl": 4.8623046875, "learning_rate": 4.477357683661733e-07, "loss": 0.1945, "reward": -0.0936138320248574, "reward_std": 1.2343957126140594, "rewards/correctness_reward_func": 0.3392857313156128, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7197299301624298, "step": 177 }, { "completion_length": 522.5000305175781, "epoch": 0.6666666666666666, "grad_norm": 1.850631594657898, "kl": 5.2626953125, "learning_rate": 4.412313012710812e-07, "loss": 0.2105, "reward": -0.10290402628015727, "reward_std": 1.2204317450523376, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2578125037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5616094022989273, "step": 178 }, { "completion_length": 534.1785888671875, "epoch": 0.6704119850187266, "grad_norm": 2.1959705352783203, "kl": 5.53125, "learning_rate": 4.347369038899743e-07, "loss": 0.2212, "reward": -0.37369197979569435, "reward_std": 1.324585199356079, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.2622767984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8413259387016296, "step": 179 }, { "completion_length": 520.8571624755859, "epoch": 0.6741573033707865, "grad_norm": 1.6929233074188232, "kl": 5.0546875, "learning_rate": 4.2825368900441037e-07, "loss": 0.2022, "reward": -0.1256919801235199, "reward_std": 1.4035832583904266, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.2979910895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7585045099258423, "step": 180 }, { "completion_length": 528.6786041259766, "epoch": 0.6779026217228464, "grad_norm": 1.5895494222640991, "kl": 4.9208984375, "learning_rate": 4.2178276747988444e-07, "loss": 0.1968, "reward": -0.05114509118720889, "reward_std": 1.2217027842998505, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.2935267984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.679493322968483, "step": 181 }, { "completion_length": 527.107177734375, "epoch": 0.6816479400749064, "grad_norm": 1.8839842081069946, "kl": 4.43359375, "learning_rate": 4.153252480754877e-07, "loss": 0.1774, "reward": -0.16289733722805977, "reward_std": 1.2423360645771027, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2879464440047741, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7332098335027695, "step": 182 }, { "completion_length": 516.9285888671875, "epoch": 0.6853932584269663, "grad_norm": 2.7923595905303955, "kl": 4.537109375, "learning_rate": 4.0888223725392624e-07, "loss": 0.1815, "reward": -0.19774777255952358, "reward_std": 1.229084461927414, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2689732201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6899353265762329, "step": 183 }, { "completion_length": 522.357177734375, "epoch": 0.6891385767790262, "grad_norm": 1.9454573392868042, "kl": 4.69140625, "learning_rate": 4.0245483899193586e-07, "loss": 0.1877, "reward": -0.15699554327875376, "reward_std": 1.1800599992275238, "rewards/correctness_reward_func": 0.2633928693830967, "rewards/int_reward_func": 0.3147321566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7351205945014954, "step": 184 }, { "completion_length": 533.1786041259766, "epoch": 0.6928838951310862, "grad_norm": 1.6554526090621948, "kl": 4.5224609375, "learning_rate": 3.960441545911204e-07, "loss": 0.181, "reward": -0.07045312505215406, "reward_std": 1.3237174451351166, "rewards/correctness_reward_func": 0.3660714402794838, "rewards/int_reward_func": 0.290178582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7267031669616699, "step": 185 }, { "completion_length": 518.2500305175781, "epoch": 0.6966292134831461, "grad_norm": 2.1547718048095703, "kl": 4.7314453125, "learning_rate": 3.896512824892495e-07, "loss": 0.1893, "reward": -0.05010267719626427, "reward_std": 1.0627744495868683, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5936295166611671, "step": 186 }, { "completion_length": 520.6428833007812, "epoch": 0.700374531835206, "grad_norm": 2.404644250869751, "kl": 4.72265625, "learning_rate": 3.8327731807204744e-07, "loss": 0.1889, "reward": -0.00044643133878707886, "reward_std": 1.2751690447330475, "rewards/correctness_reward_func": 0.3660714402794838, "rewards/int_reward_func": 0.2991071566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6656250506639481, "step": 187 }, { "completion_length": 527.7143096923828, "epoch": 0.704119850187266, "grad_norm": 2.1762359142303467, "kl": 4.845703125, "learning_rate": 3.7692335348550346e-07, "loss": 0.1938, "reward": -0.019415173679590225, "reward_std": 1.4023870527744293, "rewards/correctness_reward_func": 0.4375000074505806, "rewards/int_reward_func": 0.3013392984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7582544833421707, "step": 188 }, { "completion_length": 516.1428833007812, "epoch": 0.7078651685393258, "grad_norm": 1.9605624675750732, "kl": 4.767578125, "learning_rate": 3.7059047744873955e-07, "loss": 0.1907, "reward": -0.28092633839696646, "reward_std": 1.2700144350528717, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2857142984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8121763616800308, "step": 189 }, { "epoch": 0.7078651685393258, "eval_completion_length": 534.5684467516447, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.19735878194976403, "eval_reward_std": 1.251351103029753, "eval_rewards/correctness_reward_func": 0.2117481308548074, "eval_rewards/int_reward_func": 0.2752584706795843, "eval_rewards/soft_format_reward_func": 4.6992483303735133e-05, "eval_rewards/strict_format_reward_func": 7.048872495560269e-05, "eval_rewards/xmlcount_reward_func": -0.6844828577418076, "eval_runtime": 2493.3052, "eval_samples_per_second": 0.529, "eval_steps_per_second": 0.038, "step": 189 }, { "completion_length": 519.7143096923828, "epoch": 0.7116104868913857, "grad_norm": 2.0175857543945312, "kl": 4.990234375, "learning_rate": 3.642797750674629e-07, "loss": 0.1996, "reward": -0.04893304128199816, "reward_std": 1.230804204940796, "rewards/correctness_reward_func": 0.3258928693830967, "rewards/int_reward_func": 0.3058035895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6806294620037079, "step": 190 }, { "completion_length": 524.7857360839844, "epoch": 0.7153558052434457, "grad_norm": 1.9237942695617676, "kl": 5.12890625, "learning_rate": 3.5799232764803867e-07, "loss": 0.2052, "reward": -0.05077901855111122, "reward_std": 1.1569238305091858, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2767857275903225, "rewards/soft_format_reward_func": 0.0022321429569274187, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5530111789703369, "step": 191 }, { "completion_length": 515.8928833007812, "epoch": 0.7191011235955056, "grad_norm": 2.954789876937866, "kl": 5.5703125, "learning_rate": 3.517292125122145e-07, "loss": 0.2228, "reward": -0.22044420626480132, "reward_std": 1.381562888622284, "rewards/correctness_reward_func": 0.366071455180645, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8577210158109665, "step": 192 }, { "completion_length": 518.7143096923828, "epoch": 0.7228464419475655, "grad_norm": 2.220942735671997, "kl": 5.3818359375, "learning_rate": 3.454915028125263e-07, "loss": 0.2153, "reward": -0.010651785880327225, "reward_std": 1.300764262676239, "rewards/correctness_reward_func": 0.3526785895228386, "rewards/int_reward_func": 0.302455373108387, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6669018268585205, "step": 193 }, { "completion_length": 523.6071624755859, "epoch": 0.7265917602996255, "grad_norm": 1.9517362117767334, "kl": 4.9345703125, "learning_rate": 3.392802673484193e-07, "loss": 0.1974, "reward": -0.18442187644541264, "reward_std": 1.2702946960926056, "rewards/correctness_reward_func": 0.2723214440047741, "rewards/int_reward_func": 0.2700893022119999, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7268326282501221, "step": 194 }, { "completion_length": 544.4643249511719, "epoch": 0.7303370786516854, "grad_norm": 2.2297322750091553, "kl": 4.875, "learning_rate": 3.330965703831146e-07, "loss": 0.195, "reward": -0.08661830611526966, "reward_std": 1.2692251205444336, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.2901785895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7116183340549469, "step": 195 }, { "completion_length": 521.3928833007812, "epoch": 0.7340823970037453, "grad_norm": 3.919473648071289, "kl": 5.24609375, "learning_rate": 3.269414714612534e-07, "loss": 0.2098, "reward": -0.2262209877371788, "reward_std": 1.326324224472046, "rewards/correctness_reward_func": 0.196428582072258, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6726495623588562, "step": 196 }, { "completion_length": 517.5714569091797, "epoch": 0.7378277153558053, "grad_norm": 2.9488518238067627, "kl": 5.2353515625, "learning_rate": 3.2081602522734985e-07, "loss": 0.2094, "reward": -0.3113616332411766, "reward_std": 1.211115837097168, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7834598571062088, "step": 197 }, { "completion_length": 515.5000152587891, "epoch": 0.7415730337078652, "grad_norm": 6.636002063751221, "kl": 4.9560546875, "learning_rate": 3.147212812450818e-07, "loss": 0.1982, "reward": -0.09657144173979759, "reward_std": 1.3694335520267487, "rewards/correctness_reward_func": 0.4151785895228386, "rewards/int_reward_func": 0.3158482313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8275982290506363, "step": 198 }, { "completion_length": 532.4643096923828, "epoch": 0.7453183520599251, "grad_norm": 2.0196456909179688, "kl": 4.9970703125, "learning_rate": 3.086582838174551e-07, "loss": 0.1998, "reward": -0.273979929741472, "reward_std": 1.3600831031799316, "rewards/correctness_reward_func": 0.2455357201397419, "rewards/int_reward_func": 0.2611607275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7806763797998428, "step": 199 }, { "completion_length": 532.4286041259766, "epoch": 0.7490636704119851, "grad_norm": 1.798177719116211, "kl": 4.6240234375, "learning_rate": 3.026280718078664e-07, "loss": 0.185, "reward": -0.26691294088959694, "reward_std": 1.3088684678077698, "rewards/correctness_reward_func": 0.2500000111758709, "rewards/int_reward_func": 0.2633928656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7803058326244354, "step": 200 }, { "completion_length": 516.8928985595703, "epoch": 0.7528089887640449, "grad_norm": 1.977175235748291, "kl": 5.1142578125, "learning_rate": 2.9663167846209996e-07, "loss": 0.2046, "reward": -0.11764062941074371, "reward_std": 1.2654018104076385, "rewards/correctness_reward_func": 0.2455357275903225, "rewards/int_reward_func": 0.2734375149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6366139054298401, "step": 201 }, { "completion_length": 516.0357360839844, "epoch": 0.7565543071161048, "grad_norm": 2.0420053005218506, "kl": 4.185546875, "learning_rate": 2.906701312312861e-07, "loss": 0.1674, "reward": -0.1846384033560753, "reward_std": 1.2015181183815002, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.2723214402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7248169928789139, "step": 202 }, { "completion_length": 527.607177734375, "epoch": 0.7602996254681648, "grad_norm": 2.0275211334228516, "kl": 4.865234375, "learning_rate": 2.847444515958523e-07, "loss": 0.1946, "reward": -0.0009955335408449173, "reward_std": 1.2537928223609924, "rewards/correctness_reward_func": 0.3794643059372902, "rewards/int_reward_func": 0.2979910746216774, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6795670092105865, "step": 203 }, { "completion_length": 532.7143096923828, "epoch": 0.7640449438202247, "grad_norm": 1.5860823392868042, "kl": 4.7626953125, "learning_rate": 2.7885565489049946e-07, "loss": 0.1905, "reward": -0.25017189234495163, "reward_std": 1.346979558467865, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.822716549038887, "step": 204 }, { "completion_length": 514.9286041259766, "epoch": 0.7677902621722846, "grad_norm": 1.611587405204773, "kl": 4.9775390625, "learning_rate": 2.730047501302266e-07, "loss": 0.1991, "reward": -0.009238844271749258, "reward_std": 1.2853399515151978, "rewards/correctness_reward_func": 0.3660714477300644, "rewards/int_reward_func": 0.2667410895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6420513689517975, "step": 205 }, { "completion_length": 517.7500305175781, "epoch": 0.7715355805243446, "grad_norm": 2.1895291805267334, "kl": 5.3173828125, "learning_rate": 2.671927398374443e-07, "loss": 0.2127, "reward": -0.02679464779794216, "reward_std": 1.2472732365131378, "rewards/correctness_reward_func": 0.3526785895228386, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6707678735256195, "step": 206 }, { "completion_length": 524.6071624755859, "epoch": 0.7752808988764045, "grad_norm": 2.1895291805267334, "kl": Infinity, "learning_rate": 2.671927398374443e-07, "loss": 0.2269, "reward": -0.11460491642355919, "reward_std": 1.2216467559337616, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2823660895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6782210320234299, "step": 207 }, { "completion_length": 564.3928680419922, "epoch": 0.7790262172284644, "grad_norm": 1.4329153299331665, "kl": 5.193359375, "learning_rate": 2.6142061987019574e-07, "loss": 0.2077, "reward": -0.15532590076327324, "reward_std": 1.2209243476390839, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6742991209030151, "step": 208 }, { "completion_length": 535.3571624755859, "epoch": 0.7827715355805244, "grad_norm": 2.543517827987671, "kl": 5.3876953125, "learning_rate": 2.556893792515227e-07, "loss": 0.2155, "reward": -0.10556028177961707, "reward_std": 1.288367360830307, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.297991082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7071228325366974, "step": 209 }, { "completion_length": 525.5714416503906, "epoch": 0.7865168539325843, "grad_norm": 2.5897624492645264, "kl": 5.2900390625, "learning_rate": 2.500000000000001e-07, "loss": 0.2116, "reward": -0.1855446556583047, "reward_std": 1.2734140753746033, "rewards/correctness_reward_func": 0.2276785783469677, "rewards/int_reward_func": 0.2890625074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7022857666015625, "step": 210 }, { "completion_length": 516.7142944335938, "epoch": 0.7902621722846442, "grad_norm": 4.0968732833862305, "kl": Infinity, "learning_rate": 2.44353456961474e-07, "loss": 0.2274, "reward": -0.24795536883175373, "reward_std": 1.2348434031009674, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7423750311136246, "step": 211 }, { "completion_length": 529.4643096923828, "epoch": 0.7940074906367042, "grad_norm": 1.8898731470108032, "kl": 4.9951171875, "learning_rate": 2.387507176420256e-07, "loss": 0.1998, "reward": -0.2701317183673382, "reward_std": 1.333281546831131, "rewards/correctness_reward_func": 0.1741071566939354, "rewards/int_reward_func": 0.2533482201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.6987031251192093, "step": 212 }, { "completion_length": 522.2857360839844, "epoch": 0.797752808988764, "grad_norm": 1.5559784173965454, "kl": 4.8896484375, "learning_rate": 2.3319274204219424e-07, "loss": 0.1956, "reward": -0.15001339837908745, "reward_std": 1.1915283203125, "rewards/correctness_reward_func": 0.2232143022119999, "rewards/int_reward_func": 0.2845982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6578259021043777, "step": 213 }, { "completion_length": 523.5357513427734, "epoch": 0.8014981273408239, "grad_norm": 1.6422719955444336, "kl": 4.6875, "learning_rate": 2.2768048249248644e-07, "loss": 0.1875, "reward": -0.14155804365873337, "reward_std": 1.2657607197761536, "rewards/correctness_reward_func": 0.3080357387661934, "rewards/int_reward_func": 0.2578125111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7074063122272491, "step": 214 }, { "completion_length": 541.3928833007812, "epoch": 0.8052434456928839, "grad_norm": 1.9782863855361938, "kl": 4.970703125, "learning_rate": 2.2221488349019902e-07, "loss": 0.1988, "reward": -0.03004240826703608, "reward_std": 1.232962042093277, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.2823660746216774, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6383013874292374, "step": 215 }, { "completion_length": 519.107177734375, "epoch": 0.8089887640449438, "grad_norm": 1.8962812423706055, "kl": 5.2587890625, "learning_rate": 2.167968815375837e-07, "loss": 0.2104, "reward": -0.05005356844048947, "reward_std": 1.2248910963535309, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.3013392984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6772857159376144, "step": 216 }, { "epoch": 0.8089887640449438, "eval_completion_length": 529.0346140008223, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.20390458432712444, "eval_reward_std": 1.2542201882914492, "eval_rewards/correctness_reward_func": 0.21278196471302133, "eval_rewards/int_reward_func": 0.2732456263742949, "eval_rewards/soft_format_reward_func": 4.6992483303735133e-05, "eval_rewards/strict_format_reward_func": 0.00018796993321494053, "eval_rewards/xmlcount_reward_func": -0.6901671490386913, "eval_runtime": 2401.7698, "eval_samples_per_second": 0.549, "eval_steps_per_second": 0.04, "step": 216 }, { "completion_length": 530.4643249511719, "epoch": 0.8127340823970037, "grad_norm": 1.53620183467865, "kl": 5.0556640625, "learning_rate": 2.1142740498138324e-07, "loss": 0.2023, "reward": -0.06143304280703887, "reward_std": 1.3513036668300629, "rewards/correctness_reward_func": 0.3348214402794838, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6853170022368431, "step": 217 }, { "completion_length": 521.9286041259766, "epoch": 0.8164794007490637, "grad_norm": 1.607338786125183, "kl": 4.9990234375, "learning_rate": 2.0610737385376348e-07, "loss": 0.1999, "reward": -0.2871718890964985, "reward_std": 1.2107819616794586, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2265625111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7146272361278534, "step": 218 }, { "completion_length": 523.6428833007812, "epoch": 0.8202247191011236, "grad_norm": 3.4551496505737305, "kl": 5.6904296875, "learning_rate": 2.0083769971467045e-07, "loss": 0.2276, "reward": -0.09260490955784917, "reward_std": 1.119924932718277, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.2801339328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6272031292319298, "step": 219 }, { "completion_length": 521.1428833007812, "epoch": 0.8239700374531835, "grad_norm": 1.8328137397766113, "kl": 4.896484375, "learning_rate": 1.9561928549563966e-07, "loss": 0.1958, "reward": -0.06695759034482762, "reward_std": 1.2224089801311493, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.3069196566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6774486601352692, "step": 220 }, { "completion_length": 519.1071624755859, "epoch": 0.8277153558052435, "grad_norm": 2.1587209701538086, "kl": 5.431640625, "learning_rate": 1.9045302534508295e-07, "loss": 0.2173, "reward": -0.10098438337445259, "reward_std": 1.4179500043392181, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6980826258659363, "step": 221 }, { "completion_length": 515.1428833007812, "epoch": 0.8314606741573034, "grad_norm": 2.184795618057251, "kl": 5.4384765625, "learning_rate": 1.8533980447508135e-07, "loss": 0.2176, "reward": -0.19461161736398935, "reward_std": 1.3318405449390411, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2633928693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7169330567121506, "step": 222 }, { "completion_length": 520.8928985595703, "epoch": 0.8352059925093633, "grad_norm": 1.745571494102478, "kl": 5.236328125, "learning_rate": 1.8028049900970767e-07, "loss": 0.2094, "reward": -0.16562947491183877, "reward_std": 1.4269579648971558, "rewards/correctness_reward_func": 0.3125000111758709, "rewards/int_reward_func": 0.2723214402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7504509389400482, "step": 223 }, { "completion_length": 521.5, "epoch": 0.8389513108614233, "grad_norm": 1.687516689300537, "kl": 4.83203125, "learning_rate": 1.7527597583490823e-07, "loss": 0.1933, "reward": -0.18002455797977746, "reward_std": 1.2918389737606049, "rewards/correctness_reward_func": 0.2366071492433548, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6766763776540756, "step": 224 }, { "completion_length": 533.3571472167969, "epoch": 0.8426966292134831, "grad_norm": 1.9606373310089111, "kl": 4.9736328125, "learning_rate": 1.7032709244996556e-07, "loss": 0.1989, "reward": -0.16109822131693363, "reward_std": 1.2961285710334778, "rewards/correctness_reward_func": 0.2633928656578064, "rewards/int_reward_func": 0.2555803582072258, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6811875551939011, "step": 225 }, { "completion_length": 524.8928680419922, "epoch": 0.846441947565543, "grad_norm": 1.9700647592544556, "kl": 5.3662109375, "learning_rate": 1.6543469682057104e-07, "loss": 0.2147, "reward": -0.21540626091882586, "reward_std": 1.4037460684776306, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2477678656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7221027463674545, "step": 226 }, { "completion_length": 527.5357208251953, "epoch": 0.850187265917603, "grad_norm": 1.6679848432540894, "kl": 4.9638671875, "learning_rate": 1.605996272335291e-07, "loss": 0.1985, "reward": -0.08088393486104906, "reward_std": 1.2692644596099854, "rewards/correctness_reward_func": 0.3839285895228386, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7482946962118149, "step": 227 }, { "completion_length": 519.2857360839844, "epoch": 0.8539325842696629, "grad_norm": 1.6768486499786377, "kl": 5.38671875, "learning_rate": 1.5582271215312293e-07, "loss": 0.2154, "reward": -0.09632367175072432, "reward_std": 1.2886937856674194, "rewards/correctness_reward_func": 0.2901785895228386, "rewards/int_reward_func": 0.2756696492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6621718853712082, "step": 228 }, { "completion_length": 515.8571624755859, "epoch": 0.8576779026217228, "grad_norm": 1.442134976387024, "kl": 4.7998046875, "learning_rate": 1.5110477007916e-07, "loss": 0.192, "reward": -0.2380736656486988, "reward_std": 1.2216798663139343, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7057076245546341, "step": 229 }, { "completion_length": 538.9643096923828, "epoch": 0.8614232209737828, "grad_norm": 2.484066963195801, "kl": 4.9287109375, "learning_rate": 1.4644660940672627e-07, "loss": 0.1971, "reward": -0.11506248824298382, "reward_std": 1.112929493188858, "rewards/correctness_reward_func": 0.2767857238650322, "rewards/int_reward_func": 0.274553582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6664018034934998, "step": 230 }, { "completion_length": 516.0357208251953, "epoch": 0.8651685393258427, "grad_norm": 2.529921770095825, "kl": 5.560546875, "learning_rate": 1.4184902828767286e-07, "loss": 0.2224, "reward": -0.07746875449083745, "reward_std": 1.3153048753738403, "rewards/correctness_reward_func": 0.299107164144516, "rewards/int_reward_func": 0.3013392984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6779152303934097, "step": 231 }, { "completion_length": 517.7500152587891, "epoch": 0.8689138576779026, "grad_norm": 2.0569961071014404, "kl": 5.0791015625, "learning_rate": 1.3731281449385628e-07, "loss": 0.2032, "reward": -0.2263192073442042, "reward_std": 1.2717112004756927, "rewards/correctness_reward_func": 0.2500000111758709, "rewards/int_reward_func": 0.2756696566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.751988872885704, "step": 232 }, { "completion_length": 516.8214569091797, "epoch": 0.8726591760299626, "grad_norm": 1.546913743019104, "kl": 4.626953125, "learning_rate": 1.3283874528215733e-07, "loss": 0.185, "reward": -0.0691049168817699, "reward_std": 1.3089016377925873, "rewards/correctness_reward_func": 0.3035714402794838, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.660622775554657, "step": 233 }, { "completion_length": 515.107177734375, "epoch": 0.8764044943820225, "grad_norm": 3.8171000480651855, "kl": 5.3759765625, "learning_rate": 1.284275872613028e-07, "loss": 0.215, "reward": -0.04519642610102892, "reward_std": 1.1741289049386978, "rewards/correctness_reward_func": 0.3169643059372902, "rewards/int_reward_func": 0.3203125223517418, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6824732571840286, "step": 234 }, { "completion_length": 522.7500152587891, "epoch": 0.8801498127340824, "grad_norm": 1.9495872259140015, "kl": 4.9345703125, "learning_rate": 1.2408009626051135e-07, "loss": 0.1974, "reward": -0.058752238750457764, "reward_std": 1.273261696100235, "rewards/correctness_reward_func": 0.3214285895228386, "rewards/int_reward_func": 0.3046875074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6848683282732964, "step": 235 }, { "completion_length": 518.7500305175781, "epoch": 0.8838951310861424, "grad_norm": 1.8630801439285278, "kl": 5.265625, "learning_rate": 1.1979701719998454e-07, "loss": 0.2106, "reward": -0.20418081060051918, "reward_std": 1.4417231380939484, "rewards/correctness_reward_func": 0.3392857387661934, "rewards/int_reward_func": 0.3147321566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8581986874341965, "step": 236 }, { "completion_length": 532.1071624755859, "epoch": 0.8876404494382022, "grad_norm": 1.293250560760498, "kl": 4.8095703125, "learning_rate": 1.1557908396327026e-07, "loss": 0.1924, "reward": -0.08567634131759405, "reward_std": 1.2106836140155792, "rewards/correctness_reward_func": 0.2991071566939354, "rewards/int_reward_func": 0.2712053693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.655988872051239, "step": 237 }, { "completion_length": 526.3928833007812, "epoch": 0.8913857677902621, "grad_norm": 2.8587074279785156, "kl": 5.3740234375, "learning_rate": 1.1142701927151454e-07, "loss": 0.215, "reward": -0.1162053607404232, "reward_std": 1.1973454058170319, "rewards/correctness_reward_func": 0.2589285857975483, "rewards/int_reward_func": 0.2968750149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6720089465379715, "step": 238 }, { "completion_length": 517.2500305175781, "epoch": 0.8951310861423221, "grad_norm": 1.921934962272644, "kl": 4.974609375, "learning_rate": 1.0734153455962763e-07, "loss": 0.199, "reward": -0.16254911944270134, "reward_std": 1.2848596572875977, "rewards/correctness_reward_func": 0.3125000223517418, "rewards/int_reward_func": 0.2946428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.769691988825798, "step": 239 }, { "completion_length": 517.1071624755859, "epoch": 0.898876404494382, "grad_norm": 2.530561685562134, "kl": 5.380859375, "learning_rate": 1.0332332985438247e-07, "loss": 0.2153, "reward": -0.12155580706894398, "reward_std": 1.2704256772994995, "rewards/correctness_reward_func": 0.3214285895228386, "rewards/int_reward_func": 0.3046875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7476719170808792, "step": 240 }, { "completion_length": 519.7143249511719, "epoch": 0.9026217228464419, "grad_norm": 2.2845098972320557, "kl": 4.5966796875, "learning_rate": 9.937309365446971e-08, "loss": 0.1839, "reward": 0.07114956201985478, "reward_std": 1.152601718902588, "rewards/correctness_reward_func": 0.3660714477300644, "rewards/int_reward_func": 0.2968750074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5917969048023224, "step": 241 }, { "completion_length": 520.1786041259766, "epoch": 0.9063670411985019, "grad_norm": 2.1960318088531494, "kl": 5.0009765625, "learning_rate": 9.549150281252632e-08, "loss": 0.2, "reward": -0.03549107629805803, "reward_std": 1.2035468220710754, "rewards/correctness_reward_func": 0.2723214402794838, "rewards/int_reward_func": 0.2812500074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5890625417232513, "step": 242 }, { "completion_length": 527.7143096923828, "epoch": 0.9101123595505618, "grad_norm": 2.1960318088531494, "kl": Infinity, "learning_rate": 9.549150281252632e-08, "loss": 0.2017, "reward": -0.17003126139752567, "reward_std": 1.2424619793891907, "rewards/correctness_reward_func": 0.21875000931322575, "rewards/int_reward_func": 0.2912946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6800759434700012, "step": 243 }, { "epoch": 0.9101123595505618, "eval_completion_length": 537.0225817228618, "eval_kl": Infinity, "eval_loss": Infinity, "eval_reward": -0.19749192681751754, "eval_reward_std": 1.2450736566593772, "eval_rewards/correctness_reward_func": 0.21118422009442983, "eval_rewards/int_reward_func": 0.2718750116072203, "eval_rewards/soft_format_reward_func": 9.398496660747027e-05, "eval_rewards/strict_format_reward_func": 7.048872495560269e-05, "eval_rewards/xmlcount_reward_func": -0.6807156267919039, "eval_runtime": 2424.2347, "eval_samples_per_second": 0.544, "eval_steps_per_second": 0.039, "step": 243 }, { "completion_length": 515.9643096923828, "epoch": 0.9138576779026217, "grad_norm": 1.9732158184051514, "kl": 4.8046875, "learning_rate": 9.167922241916054e-08, "loss": 0.1922, "reward": -0.22770760208368301, "reward_std": 1.2481779158115387, "rewards/correctness_reward_func": 0.2366071566939354, "rewards/int_reward_func": 0.2712053768336773, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7355201095342636, "step": 244 }, { "completion_length": 531.1071624755859, "epoch": 0.9176029962546817, "grad_norm": 1.794622540473938, "kl": 5.0888671875, "learning_rate": 8.793690568899215e-08, "loss": 0.2035, "reward": -0.12151339650154114, "reward_std": 1.2451187074184418, "rewards/correctness_reward_func": 0.3303571678698063, "rewards/int_reward_func": 0.2845982275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7364687621593475, "step": 245 }, { "completion_length": 529.2143096923828, "epoch": 0.9213483146067416, "grad_norm": 3.917875051498413, "kl": 5.42578125, "learning_rate": 8.426519384872732e-08, "loss": 0.217, "reward": -0.20378349348902702, "reward_std": 1.318730652332306, "rewards/correctness_reward_func": 0.258928582072258, "rewards/int_reward_func": 0.2700892984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7328013777732849, "step": 246 }, { "completion_length": 527.3571624755859, "epoch": 0.9250936329588015, "grad_norm": 1.6186972856521606, "kl": 4.9921875, "learning_rate": 8.066471602728803e-08, "loss": 0.1997, "reward": -0.17118750512599945, "reward_std": 1.3723264038562775, "rewards/correctness_reward_func": 0.2366071566939354, "rewards/int_reward_func": 0.2845982313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6923928707838058, "step": 247 }, { "completion_length": 521.7143096923828, "epoch": 0.9288389513108615, "grad_norm": 1.6485563516616821, "kl": 4.7802734375, "learning_rate": 7.71360891480134e-08, "loss": 0.1912, "reward": -0.2930625192821026, "reward_std": 1.3668555617332458, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2723214402794838, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7897143363952637, "step": 248 }, { "completion_length": 528.6071624755859, "epoch": 0.9325842696629213, "grad_norm": 2.551079034805298, "kl": 5.6396484375, "learning_rate": 7.36799178229539e-08, "loss": 0.2256, "reward": -0.09009598195552826, "reward_std": 1.4010609686374664, "rewards/correctness_reward_func": 0.3348214402794838, "rewards/int_reward_func": 0.2991071566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7240245938301086, "step": 249 }, { "completion_length": 530.5714569091797, "epoch": 0.9363295880149812, "grad_norm": 1.623828649520874, "kl": 5.0751953125, "learning_rate": 7.029679424927365e-08, "loss": 0.203, "reward": -0.2168437521904707, "reward_std": 1.3051793575286865, "rewards/correctness_reward_func": 0.2767857238650322, "rewards/int_reward_func": 0.313616082072258, "rewards/soft_format_reward_func": 0.0022321429569274187, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8094777017831802, "step": 250 }, { "completion_length": 529.1428985595703, "epoch": 0.9400749063670412, "grad_norm": 2.090834379196167, "kl": 5.0615234375, "learning_rate": 6.698729810778064e-08, "loss": 0.2024, "reward": -0.35498661547899246, "reward_std": 1.3341231644153595, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.282366082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8203884065151215, "step": 251 }, { "completion_length": 541.7143096923828, "epoch": 0.9438202247191011, "grad_norm": 1.5824670791625977, "kl": 5.072265625, "learning_rate": 6.375199646360141e-08, "loss": 0.2029, "reward": -0.16221206076443195, "reward_std": 1.3483150601387024, "rewards/correctness_reward_func": 0.2678571566939354, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7146674692630768, "step": 252 }, { "completion_length": 521.357177734375, "epoch": 0.947565543071161, "grad_norm": 2.750608205795288, "kl": 5.5068359375, "learning_rate": 6.059144366901736e-08, "loss": 0.2202, "reward": -0.1930067086359486, "reward_std": 1.1880230009555817, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2611607313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6997031569480896, "step": 253 }, { "completion_length": 519.3214569091797, "epoch": 0.951310861423221, "grad_norm": 1.7069954872131348, "kl": 5.3154296875, "learning_rate": 5.750618126847912e-08, "loss": 0.2126, "reward": -0.04345982416998595, "reward_std": 1.2727106213569641, "rewards/correctness_reward_func": 0.3214285857975483, "rewards/int_reward_func": 0.2890625149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6539509445428848, "step": 254 }, { "completion_length": 517.6786041259766, "epoch": 0.9550561797752809, "grad_norm": 1.5457656383514404, "kl": 4.9775390625, "learning_rate": 5.44967379058161e-08, "loss": 0.1991, "reward": -0.09097097720950842, "reward_std": 1.3295612633228302, "rewards/correctness_reward_func": 0.3348214477300644, "rewards/int_reward_func": 0.302455373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7282477915287018, "step": 255 }, { "completion_length": 529.6428833007812, "epoch": 0.9588014981273408, "grad_norm": 3.4232678413391113, "kl": Infinity, "learning_rate": 5.156362923365587e-08, "loss": 0.2322, "reward": -0.2118973322212696, "reward_std": 1.3026173412799835, "rewards/correctness_reward_func": 0.2455357275903225, "rewards/int_reward_func": 0.2901785857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.7487277090549469, "step": 256 }, { "completion_length": 526.2500305175781, "epoch": 0.9625468164794008, "grad_norm": 1.854992389678955, "kl": 4.8408203125, "learning_rate": 4.870735782506979e-08, "loss": 0.1936, "reward": -0.1605692015727982, "reward_std": 1.3640064895153046, "rewards/correctness_reward_func": 0.2901785783469677, "rewards/int_reward_func": 0.2812500149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7319977879524231, "step": 257 }, { "completion_length": 516.5357208251953, "epoch": 0.9662921348314607, "grad_norm": 4.538949489593506, "kl": 5.3310546875, "learning_rate": 4.592841308745932e-08, "loss": 0.2132, "reward": -0.15284822694957256, "reward_std": 1.2363348603248596, "rewards/correctness_reward_func": 0.2500000111758709, "rewards/int_reward_func": 0.3058035895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7086518108844757, "step": 258 }, { "completion_length": 514.5714569091797, "epoch": 0.9700374531835206, "grad_norm": 1.904927372932434, "kl": 4.98828125, "learning_rate": 4.322727117869951e-08, "loss": 0.1995, "reward": -0.04242856800556183, "reward_std": 1.3000076413154602, "rewards/correctness_reward_func": 0.3526785932481289, "rewards/int_reward_func": 0.2845982313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6797053962945938, "step": 259 }, { "completion_length": 528.6071472167969, "epoch": 0.9737827715355806, "grad_norm": 1.8089706897735596, "kl": 5.0556640625, "learning_rate": 4.06043949255509e-08, "loss": 0.2022, "reward": -0.1290022386237979, "reward_std": 1.231601744890213, "rewards/correctness_reward_func": 0.2901785895228386, "rewards/int_reward_func": 0.2700893022119999, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.689270094037056, "step": 260 }, { "completion_length": 517.4643096923828, "epoch": 0.9775280898876404, "grad_norm": 2.000513792037964, "kl": 5.0791015625, "learning_rate": 3.806023374435663e-08, "loss": 0.2031, "reward": -0.10831697471439838, "reward_std": 1.2132568657398224, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.2924107238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.650727704167366, "step": 261 }, { "completion_length": 527.6428833007812, "epoch": 0.9812734082397003, "grad_norm": 1.606877088546753, "kl": 5.140625, "learning_rate": 3.559522356403788e-08, "loss": 0.2056, "reward": -0.1139330342411995, "reward_std": 1.2683680653572083, "rewards/correctness_reward_func": 0.325892873108387, "rewards/int_reward_func": 0.3113839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7512098550796509, "step": 262 }, { "completion_length": 526.2143249511719, "epoch": 0.9850187265917603, "grad_norm": 1.3300163745880127, "kl": 4.8916015625, "learning_rate": 3.3209786751399184e-08, "loss": 0.1956, "reward": -0.23550447449088097, "reward_std": 1.2275131940841675, "rewards/correctness_reward_func": 0.16964286379516125, "rewards/int_reward_func": 0.2332589365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6384062841534615, "step": 263 }, { "completion_length": 526.7143096923828, "epoch": 0.9887640449438202, "grad_norm": 5.495514392852783, "kl": 5.5947265625, "learning_rate": 3.0904332038757974e-08, "loss": 0.2238, "reward": -0.03097990620881319, "reward_std": 1.2771217823028564, "rewards/correctness_reward_func": 0.3035714477300644, "rewards/int_reward_func": 0.3125000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6470513641834259, "step": 264 }, { "completion_length": 521.1428680419922, "epoch": 0.9925093632958801, "grad_norm": 1.869722604751587, "kl": 5.494140625, "learning_rate": 2.8679254453910785e-08, "loss": 0.2197, "reward": -0.08227232204808388, "reward_std": 1.3968145847320557, "rewards/correctness_reward_func": 0.4017857387661934, "rewards/int_reward_func": 0.3158482313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7999062538146973, "step": 265 }, { "completion_length": 534.107177734375, "epoch": 0.9962546816479401, "grad_norm": 1.7442686557769775, "kl": 4.9443359375, "learning_rate": 2.653493525244721e-08, "loss": 0.1978, "reward": -0.18733705952763557, "reward_std": 1.244984209537506, "rewards/correctness_reward_func": 0.2410714440047741, "rewards/int_reward_func": 0.271205373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6996138542890549, "step": 266 }, { "completion_length": 517.25, "epoch": 1.0, "grad_norm": 1.4125308990478516, "kl": 4.6435546875, "learning_rate": 2.4471741852423233e-08, "loss": 0.1817, "reward": -0.36143749207258224, "reward_std": 1.4109329879283905, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5801874995231628, "step": 267 }, { "epoch": 1.0, "step": 267, "total_flos": 0.0, "train_loss": 0.16846829135939004, "train_runtime": 42505.033, "train_samples_per_second": 0.176, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 54, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }