diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24193 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9989658738366081, + "eval_steps": 500, + "global_step": 966, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 172.6666717529297, + "completions/mean_terminated_length": 172.6666717529297, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.001034126163391934, + "grad_norm": 5.071864592095285, + "kl": 0.3359375, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 78552.0, + "reward": 0.9166666865348816, + "reward_std": 0.40627965331077576, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 165.5416717529297, + "completions/mean_terminated_length": 165.5416717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.002068252326783868, + "grad_norm": 4.059461930492566, + "kl": 0.0036773681640625, + "learning_rate": 9.999973613218312e-07, + "loss": 0.0001, + "num_tokens": 161365.0, + "reward": 0.8333333730697632, + "reward_std": 0.4685417115688324, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 188.95834350585938, + "completions/mean_terminated_length": 188.95834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.0031023784901758012, + "grad_norm": 3.468953003174482, + "kl": 0.0027923583984375, + "learning_rate": 9.999894453151758e-07, + "loss": 0.0001, + "num_tokens": 239820.0, + "reward": 0.6666666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.004136504653567736, + "grad_norm": 3.413335444191301, + "kl": 0.002685546875, + "learning_rate": 9.999762520635849e-07, + "loss": 0.0001, + "num_tokens": 318868.0, + "reward": 1.0833333730697632, + "reward_std": 0.25634264945983887, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.371054083108902, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 209.45834350585938, + "completions/mean_terminated_length": 209.45834350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.005170630816959669, + "grad_norm": 3.8321189551506687, + "kl": 0.0026397705078125, + "learning_rate": 9.99957781706309e-07, + "loss": 0.0001, + "num_tokens": 402655.0, + "reward": 1.027777910232544, + "reward_std": 0.3323635756969452, + "rewards/reasoning_reward/mean": 1.0277777910232544, + "rewards/reasoning_reward/std": 0.5443310141563416, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 181.6666717529297, + "completions/mean_terminated_length": 181.6666717529297, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.0062047569803516025, + "grad_norm": 4.016570280789986, + "kl": 0.00323486328125, + "learning_rate": 9.999340344382978e-07, + "loss": 0.0001, + "num_tokens": 484391.0, + "reward": 1.3958333730697632, + "reward_std": 0.3663109242916107, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.642332136631012, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 213.08334350585938, + "completions/mean_terminated_length": 213.08334350585938, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.007238883143743537, + "grad_norm": 2.9810718864115326, + "kl": 0.0038604736328125, + "learning_rate": 9.99905010510197e-07, + "loss": 0.0002, + "num_tokens": 572137.0, + "reward": 0.7083333730697632, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5089774131774902, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 168.58334350585938, + "completions/mean_terminated_length": 168.58334350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.008273009307135471, + "grad_norm": 0.033223094376599045, + "kl": 0.0020904541015625, + "learning_rate": 9.998707102283457e-07, + "loss": 0.0001, + "num_tokens": 650255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 166.4166717529297, + "completions/mean_terminated_length": 166.4166717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.009307135470527405, + "grad_norm": 2.0403595074712557, + "kl": 0.0030975341796875, + "learning_rate": 9.998311339547733e-07, + "loss": 0.0001, + "num_tokens": 734225.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 185.1666717529297, + "completions/mean_terminated_length": 185.1666717529297, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.010341261633919338, + "grad_norm": 3.389792798216633, + "kl": 0.0031890869140625, + "learning_rate": 9.997862821071964e-07, + "loss": 0.0001, + "num_tokens": 819653.0, + "reward": 0.875, + "reward_std": 0.40812820196151733, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 186.95834350585938, + "completions/mean_terminated_length": 186.95834350585938, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.011375387797311272, + "grad_norm": 2.331094314362588, + "kl": 0.0029296875, + "learning_rate": 9.997361551590132e-07, + "loss": 0.0001, + "num_tokens": 904148.0, + "reward": 0.5, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 147.2916717529297, + "completions/mean_terminated_length": 147.2916717529297, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.012409513960703205, + "grad_norm": 3.413766004232087, + "kl": 0.0036163330078125, + "learning_rate": 9.996807536392989e-07, + "loss": 0.0001, + "num_tokens": 984211.0, + "reward": 0.8541666865348816, + "reward_std": 0.2587745785713196, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 145.625, + "completions/mean_terminated_length": 145.625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.01344364012409514, + "grad_norm": 3.9659481687377354, + "kl": 0.0033416748046875, + "learning_rate": 9.996200781328011e-07, + "loss": 0.0001, + "num_tokens": 1067738.0, + "reward": 0.9375, + "reward_std": 0.41282182931900024, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 172.9166717529297, + "completions/mean_terminated_length": 172.9166717529297, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.014477766287487074, + "grad_norm": 2.6924838670848303, + "kl": 0.0023956298828125, + "learning_rate": 9.99554129279932e-07, + "loss": 0.0001, + "num_tokens": 1146640.0, + "reward": 0.8333333730697632, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 164.9166717529297, + "completions/mean_terminated_length": 164.9166717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.015511892450879007, + "grad_norm": 4.1278369792222325, + "kl": 0.0029144287109375, + "learning_rate": 9.99482907776763e-07, + "loss": 0.0001, + "num_tokens": 1224398.0, + "reward": 0.9583333730697632, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 170.6666717529297, + "completions/mean_terminated_length": 170.6666717529297, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.016546018614270942, + "grad_norm": 0.03303182322940017, + "kl": 0.002471923828125, + "learning_rate": 9.994064143750165e-07, + "loss": 0.0001, + "num_tokens": 1303678.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 185.20834350585938, + "completions/mean_terminated_length": 185.20834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.017580144777662874, + "grad_norm": 4.190633270648795, + "kl": 0.003997802734375, + "learning_rate": 9.993246498820587e-07, + "loss": 0.0002, + "num_tokens": 1387411.0, + "reward": 0.875, + "reward_std": 0.47920867800712585, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5943574905395508, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 138.75, + "completions/mean_terminated_length": 138.75, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.01861427094105481, + "grad_norm": 4.124744220407965, + "kl": 0.0032958984375, + "learning_rate": 9.992376151608897e-07, + "loss": 0.0001, + "num_tokens": 1465757.0, + "reward": 0.8333333730697632, + "reward_std": 0.4198887050151825, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7469745874404907, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 122.29167175292969, + "completions/mean_terminated_length": 122.29167175292969, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.01964839710444674, + "grad_norm": 4.2126320398251105, + "kl": 0.0028839111328125, + "learning_rate": 9.991453111301365e-07, + "loss": 0.0001, + "num_tokens": 1546284.0, + "reward": 0.3333333432674408, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.3333333432674408, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 169.5, + "completions/mean_terminated_length": 169.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.020682523267838676, + "grad_norm": 3.9035749186278075, + "kl": 0.00433349609375, + "learning_rate": 9.990477387640415e-07, + "loss": 0.0002, + "num_tokens": 1629112.0, + "reward": 0.7083333730697632, + "reward_std": 0.43810173869132996, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4871538281440735, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 183.33334350585938, + "completions/mean_terminated_length": 183.33334350585938, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.02171664943123061, + "grad_norm": 2.7742036215600043, + "kl": 0.0037078857421875, + "learning_rate": 9.989448990924528e-07, + "loss": 0.0001, + "num_tokens": 1706600.0, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 179.125, + "completions/mean_terminated_length": 179.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.022750775594622543, + "grad_norm": 2.9495492352769728, + "kl": 0.007080078125, + "learning_rate": 9.988367932008138e-07, + "loss": 0.0003, + "num_tokens": 1791203.0, + "reward": 0.9375, + "reward_std": 0.30551642179489136, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.37044334411621094, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 208.70834350585938, + "completions/mean_terminated_length": 208.70834350585938, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.023784901758014478, + "grad_norm": 3.890766517596311, + "kl": 0.00811767578125, + "learning_rate": 9.98723422230151e-07, + "loss": 0.0003, + "num_tokens": 1871036.0, + "reward": 1.1666667461395264, + "reward_std": 0.40397143363952637, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.6197241544723511, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 161.375, + "completions/mean_terminated_length": 161.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.02481902792140641, + "grad_norm": 3.3269487482285345, + "kl": 0.0029144287109375, + "learning_rate": 9.986047873770624e-07, + "loss": 0.0001, + "num_tokens": 1949213.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 146.1666717529297, + "completions/mean_terminated_length": 146.1666717529297, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.025853154084798345, + "grad_norm": 3.9477140851301993, + "kl": 0.0059814453125, + "learning_rate": 9.98480889893705e-07, + "loss": 0.0002, + "num_tokens": 2028025.0, + "reward": 0.9791666865348816, + "reward_std": 0.5605560541152954, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.6507381796836853, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 219.9166717529297, + "completions/mean_terminated_length": 219.9166717529297, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.02688728024819028, + "grad_norm": 4.080956404876102, + "kl": 0.00860595703125, + "learning_rate": 9.98351731087781e-07, + "loss": 0.0003, + "num_tokens": 2107687.0, + "reward": 0.875, + "reward_std": 0.46288391947746277, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5757792592048645, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 163.5416717529297, + "completions/mean_terminated_length": 163.5416717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.027921406411582212, + "grad_norm": 3.5156776593492745, + "kl": 0.0048828125, + "learning_rate": 9.982173123225243e-07, + "loss": 0.0002, + "num_tokens": 2186948.0, + "reward": 0.8541666865348816, + "reward_std": 0.40529346466064453, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.4995468854904175, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 121.75, + "completions/mean_terminated_length": 121.75, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.028955532574974147, + "grad_norm": 2.437538390860398, + "kl": 0.0029449462890625, + "learning_rate": 9.980776350166867e-07, + "loss": 0.0001, + "num_tokens": 2267470.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 151.7916717529297, + "completions/mean_terminated_length": 151.7916717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.02998965873836608, + "grad_norm": 3.9269223056078095, + "kl": 0.004730224609375, + "learning_rate": 9.979327006445216e-07, + "loss": 0.0002, + "num_tokens": 2344097.0, + "reward": 0.4583333432674408, + "reward_std": 0.5049939155578613, + "rewards/reasoning_reward/mean": 0.4583333432674408, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 186.20834350585938, + "completions/mean_terminated_length": 186.20834350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.031023784901758014, + "grad_norm": 3.386462451637892, + "kl": 0.004547119140625, + "learning_rate": 9.977825107357702e-07, + "loss": 0.0002, + "num_tokens": 2425350.0, + "reward": 0.75, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 178.6666717529297, + "completions/mean_terminated_length": 178.6666717529297, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.03205791106514995, + "grad_norm": 3.3979916126388687, + "kl": 0.01055908203125, + "learning_rate": 9.976270668756433e-07, + "loss": 0.0004, + "num_tokens": 2510054.0, + "reward": 1.0277777910232544, + "reward_std": 0.3074157238006592, + "rewards/reasoning_reward/mean": 1.0277777910232544, + "rewards/reasoning_reward/std": 0.4627858102321625, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 155.7916717529297, + "completions/mean_terminated_length": 155.7916717529297, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.033092037228541885, + "grad_norm": 2.8184150341115375, + "kl": 0.004058837890625, + "learning_rate": 9.974663707048065e-07, + "loss": 0.0002, + "num_tokens": 2588921.0, + "reward": 0.6666666865348816, + "reward_std": 0.35634833574295044, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 154.70834350585938, + "completions/mean_terminated_length": 154.70834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.03412616339193381, + "grad_norm": 2.828806479597615, + "kl": 0.005157470703125, + "learning_rate": 9.973004239193618e-07, + "loss": 0.0002, + "num_tokens": 2671218.0, + "reward": 1.1319444179534912, + "reward_std": 0.13749298453330994, + "rewards/reasoning_reward/mean": 1.1319444179534912, + "rewards/reasoning_reward/std": 0.3474515974521637, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 204.0, + "completions/mean_terminated_length": 204.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.03516028955532575, + "grad_norm": 3.0486725384758566, + "kl": 0.005401611328125, + "learning_rate": 9.971292282708296e-07, + "loss": 0.0002, + "num_tokens": 2755530.0, + "reward": 0.8819444179534912, + "reward_std": 0.3624235987663269, + "rewards/reasoning_reward/mean": 0.8819444179534912, + "rewards/reasoning_reward/std": 0.6226000189781189, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 190.70834350585938, + "completions/mean_terminated_length": 190.70834350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.03619441571871768, + "grad_norm": 2.8675194738593675, + "kl": 0.01416015625, + "learning_rate": 9.969527855661307e-07, + "loss": 0.0006, + "num_tokens": 2845651.0, + "reward": 1.3333333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.434057354927063, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 167.6666717529297, + "completions/mean_terminated_length": 167.6666717529297, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.03722854188210962, + "grad_norm": 2.8365389717558775, + "kl": 0.0047607421875, + "learning_rate": 9.967710976675674e-07, + "loss": 0.0002, + "num_tokens": 2924235.0, + "reward": 0.4166666865348816, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.4166666567325592, + "rewards/reasoning_reward/std": 0.5036101937294006, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 160.20834350585938, + "completions/mean_terminated_length": 160.20834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.038262668045501554, + "grad_norm": 3.3596940088485066, + "kl": 0.00372314453125, + "learning_rate": 9.965841664928032e-07, + "loss": 0.0001, + "num_tokens": 3004232.0, + "reward": 0.7916666865348816, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 256.79168701171875, + "completions/mean_terminated_length": 215.04348754882812, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.03929679420889348, + "grad_norm": 3.713051017958645, + "kl": 0.006988525390625, + "learning_rate": 9.963919940148428e-07, + "loss": 0.0003, + "num_tokens": 3095475.0, + "reward": 1.2569445371627808, + "reward_std": 0.36115893721580505, + "rewards/reasoning_reward/mean": 1.2569445371627808, + "rewards/reasoning_reward/std": 0.5223950147628784, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 203.5416717529297, + "completions/mean_terminated_length": 203.5416717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.04033092037228542, + "grad_norm": 4.058262730903875, + "kl": 0.01019287109375, + "learning_rate": 9.961945822620118e-07, + "loss": 0.0004, + "num_tokens": 3184992.0, + "reward": 1.375, + "reward_std": 0.6197125911712646, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.6954823136329651, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 255.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 177.83334350585938, + "completions/mean_terminated_length": 174.478271484375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.04136504653567735, + "grad_norm": 3.0112482413882797, + "kl": 0.0054931640625, + "learning_rate": 9.959919333179344e-07, + "loss": 0.0002, + "num_tokens": 3266916.0, + "reward": 0.8333333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 144.9166717529297, + "completions/mean_terminated_length": 144.9166717529297, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.04239917269906929, + "grad_norm": 3.571922005370386, + "kl": 0.006378173828125, + "learning_rate": 9.957840493215116e-07, + "loss": 0.0003, + "num_tokens": 3351314.0, + "reward": 1.0625, + "reward_std": 0.3972596824169159, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5578004121780396, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 174.2916717529297, + "completions/mean_terminated_length": 174.2916717529297, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.04343329886246122, + "grad_norm": 4.032955623530826, + "kl": 0.006561279296875, + "learning_rate": 9.955709324668997e-07, + "loss": 0.0003, + "num_tokens": 3429881.0, + "reward": 0.9166666865348816, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.8427009582519531, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 201.625, + "completions/mean_terminated_length": 201.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.04446742502585315, + "grad_norm": 3.509751245460736, + "kl": 0.0074462890625, + "learning_rate": 9.953525850034856e-07, + "loss": 0.0003, + "num_tokens": 3520448.0, + "reward": 0.8541666865348816, + "reward_std": 0.3871031701564789, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.6672325730323792, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 160.5, + "completions/mean_terminated_length": 160.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.045501551189245086, + "grad_norm": 3.847890323513219, + "kl": 0.006622314453125, + "learning_rate": 9.951290092358645e-07, + "loss": 0.0003, + "num_tokens": 3603924.0, + "reward": 0.9652777910232544, + "reward_std": 0.46292293071746826, + "rewards/reasoning_reward/mean": 0.9652777314186096, + "rewards/reasoning_reward/std": 0.7121769785881042, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 146.45834350585938, + "completions/mean_terminated_length": 146.45834350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.04653567735263702, + "grad_norm": 3.670364425188319, + "kl": 0.0072021484375, + "learning_rate": 9.949002075238139e-07, + "loss": 0.0003, + "num_tokens": 3684711.0, + "reward": 0.875, + "reward_std": 0.4082186222076416, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.39699962735176086, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 155.08334350585938, + "completions/mean_terminated_length": 155.08334350585938, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.047569803516028956, + "grad_norm": 3.726045501753792, + "kl": 0.0076904296875, + "learning_rate": 9.9466618228227e-07, + "loss": 0.0003, + "num_tokens": 3762713.0, + "reward": 0.8333333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 232.375, + "completions/mean_terminated_length": 232.375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.04860392967942089, + "grad_norm": 3.592630193734655, + "kl": 0.00811767578125, + "learning_rate": 9.944269359813026e-07, + "loss": 0.0003, + "num_tokens": 3840002.0, + "reward": 1.027777910232544, + "reward_std": 0.40778279304504395, + "rewards/reasoning_reward/mean": 1.0277777910232544, + "rewards/reasoning_reward/std": 0.5972292423248291, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 180.1666717529297, + "completions/mean_terminated_length": 180.1666717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.04963805584281282, + "grad_norm": 4.29924922955248, + "kl": 0.00848388671875, + "learning_rate": 9.941824711460871e-07, + "loss": 0.0003, + "num_tokens": 3922982.0, + "reward": 0.8333333730697632, + "reward_std": 0.48678088188171387, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7019641399383545, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 148.5, + "completions/mean_terminated_length": 148.5, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.050672182006204755, + "grad_norm": 2.8310868538516525, + "kl": 0.0054931640625, + "learning_rate": 9.9393279035688e-07, + "loss": 0.0002, + "num_tokens": 4001698.0, + "reward": 0.625, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 204.9166717529297, + "completions/mean_terminated_length": 204.9166717529297, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.05170630816959669, + "grad_norm": 3.525954172380883, + "kl": 0.0079345703125, + "learning_rate": 9.936778962489902e-07, + "loss": 0.0003, + "num_tokens": 4081344.0, + "reward": 0.4652777910232544, + "reward_std": 0.31936562061309814, + "rewards/reasoning_reward/mean": 0.4652777910232544, + "rewards/reasoning_reward/std": 0.442268967628479, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.052740434332988625, + "grad_norm": 4.351659425272402, + "kl": 0.01177978515625, + "learning_rate": 9.934177915127515e-07, + "loss": 0.0005, + "num_tokens": 4169460.0, + "reward": 1.125, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.8501917719841003, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 186.08334350585938, + "completions/mean_terminated_length": 186.08334350585938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.05377456049638056, + "grad_norm": 3.5125629537492675, + "kl": 0.00982666015625, + "learning_rate": 9.931524788934949e-07, + "loss": 0.0004, + "num_tokens": 4252750.0, + "reward": 0.5416666865348816, + "reward_std": 0.5090917348861694, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.7210599780082703, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 218.75, + "completions/mean_terminated_length": 218.75, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.05480868665977249, + "grad_norm": 3.193225562666632, + "kl": 0.01190185546875, + "learning_rate": 9.928819611915188e-07, + "loss": 0.0005, + "num_tokens": 4330064.0, + "reward": 1.0416667461395264, + "reward_std": 0.20693820714950562, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.7058246731758118, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.055842812823164424, + "grad_norm": 2.9953108717900974, + "kl": 0.01116943359375, + "learning_rate": 9.9260624126206e-07, + "loss": 0.0004, + "num_tokens": 4415991.0, + "reward": 1.1666667461395264, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 176.9166717529297, + "completions/mean_terminated_length": 176.9166717529297, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.05687693898655636, + "grad_norm": 3.5740247241093708, + "kl": 0.01312255859375, + "learning_rate": 9.923253220152627e-07, + "loss": 0.0005, + "num_tokens": 4498133.0, + "reward": 0.6458333730697632, + "reward_std": 0.3310800790786743, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.5610387921333313, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 216.0416717529297, + "completions/mean_terminated_length": 216.0416717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.057911065149948295, + "grad_norm": 4.117392785596258, + "kl": 0.01153564453125, + "learning_rate": 9.92039206416149e-07, + "loss": 0.0005, + "num_tokens": 4576190.0, + "reward": 0.4583333432674408, + "reward_std": 0.3574431836605072, + "rewards/reasoning_reward/mean": 0.4583333432674408, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 153.83334350585938, + "completions/mean_terminated_length": 153.83334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.05894519131334023, + "grad_norm": 4.837645018388818, + "kl": 0.00958251953125, + "learning_rate": 9.917478974845873e-07, + "loss": 0.0004, + "num_tokens": 4653658.0, + "reward": 0.5416666865348816, + "reward_std": 0.46288391947746277, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 221.6666717529297, + "completions/mean_terminated_length": 221.6666717529297, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05997931747673216, + "grad_norm": 3.7641592955112637, + "kl": 0.011962890625, + "learning_rate": 9.914513982952592e-07, + "loss": 0.0005, + "num_tokens": 4733554.0, + "reward": 0.5902777910232544, + "reward_std": 0.5594554543495178, + "rewards/reasoning_reward/mean": 0.5902777910232544, + "rewards/reasoning_reward/std": 0.5646151304244995, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.06101344364012409, + "grad_norm": 3.2142277856073393, + "kl": 0.01385498046875, + "learning_rate": 9.911497119776286e-07, + "loss": 0.0006, + "num_tokens": 4819339.0, + "reward": 1.1041667461395264, + "reward_std": 0.34730279445648193, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.43353530764579773, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 223.70834350585938, + "completions/mean_terminated_length": 223.70834350585938, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.06204756980351603, + "grad_norm": 3.6323983394895354, + "kl": 0.01446533203125, + "learning_rate": 9.908428417159078e-07, + "loss": 0.0006, + "num_tokens": 4902092.0, + "reward": 0.6666666865348816, + "reward_std": 0.4685417115688324, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.7613869905471802, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 210.08334350585938, + "completions/mean_terminated_length": 210.08334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.06308169596690796, + "grad_norm": 4.256974764936185, + "kl": 0.01409912109375, + "learning_rate": 9.905307907490242e-07, + "loss": 0.0006, + "num_tokens": 4982646.0, + "reward": 0.7986111044883728, + "reward_std": 0.7822409868240356, + "rewards/reasoning_reward/mean": 0.7986111044883728, + "rewards/reasoning_reward/std": 0.800028920173645, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 125.83333587646484, + "completions/mean_terminated_length": 125.83333587646484, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.0641158221302999, + "grad_norm": 3.6447804056269857, + "kl": 0.0086669921875, + "learning_rate": 9.902135623705864e-07, + "loss": 0.0003, + "num_tokens": 5061682.0, + "reward": 0.8333333730697632, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.06514994829369183, + "grad_norm": 2.2300859017750354, + "kl": 0.01177978515625, + "learning_rate": 9.898911599288483e-07, + "loss": 0.0005, + "num_tokens": 5146090.0, + "reward": 1.25, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.06618407445708377, + "grad_norm": 3.6675785111838213, + "kl": 0.0224609375, + "learning_rate": 9.895635868266754e-07, + "loss": 0.0009, + "num_tokens": 5224032.0, + "reward": 1.0833333730697632, + "reward_std": 0.3493061661720276, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.4340573847293854, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 168.0, + "completions/mean_terminated_length": 168.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.0672182006204757, + "grad_norm": 3.434155813310191, + "kl": 0.00909423828125, + "learning_rate": 9.892308465215079e-07, + "loss": 0.0004, + "num_tokens": 5303200.0, + "reward": 0.6666666865348816, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.06825232678386763, + "grad_norm": 2.512958656172137, + "kl": 0.01190185546875, + "learning_rate": 9.888929425253235e-07, + "loss": 0.0005, + "num_tokens": 5378760.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 164.5416717529297, + "completions/mean_terminated_length": 164.5416717529297, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.06928645294725956, + "grad_norm": 3.4490407712658686, + "kl": 0.0157470703125, + "learning_rate": 9.885498784046023e-07, + "loss": 0.0006, + "num_tokens": 5468669.0, + "reward": 1.2083333730697632, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 166.45834350585938, + "completions/mean_terminated_length": 166.45834350585938, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.0703205791106515, + "grad_norm": 3.4607132042836923, + "kl": 0.01129150390625, + "learning_rate": 9.882016577802873e-07, + "loss": 0.0005, + "num_tokens": 5547952.0, + "reward": 0.8472222089767456, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.8472221493721008, + "rewards/reasoning_reward/std": 0.7221757173538208, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 151.7916717529297, + "completions/mean_terminated_length": 151.7916717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.07135470527404343, + "grad_norm": 2.7650585026718284, + "kl": 0.00799560546875, + "learning_rate": 9.878482843277468e-07, + "loss": 0.0003, + "num_tokens": 5627683.0, + "reward": 0.75, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.07238883143743537, + "grad_norm": 4.449011636323865, + "kl": 0.01300048828125, + "learning_rate": 9.874897617767367e-07, + "loss": 0.0005, + "num_tokens": 5707485.0, + "reward": 0.9166666865348816, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 459.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 268.29168701171875, + "completions/mean_terminated_length": 260.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.0734229576008273, + "grad_norm": 3.5424047636885625, + "kl": 0.014892578125, + "learning_rate": 9.871260939113595e-07, + "loss": 0.0006, + "num_tokens": 5787164.0, + "reward": 1.2708333730697632, + "reward_std": 0.3915778398513794, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.48107290267944336, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 118.54167175292969, + "completions/mean_terminated_length": 118.54167175292969, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.07445708376421924, + "grad_norm": 3.340246386770097, + "kl": 0.00933837890625, + "learning_rate": 9.867572845700245e-07, + "loss": 0.0004, + "num_tokens": 5866153.0, + "reward": 0.7083333730697632, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 169.08334350585938, + "completions/mean_terminated_length": 169.08334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.07549120992761117, + "grad_norm": 4.82302140773541, + "kl": 0.01165771484375, + "learning_rate": 9.863833376454086e-07, + "loss": 0.0005, + "num_tokens": 5951243.0, + "reward": 0.8402777910232544, + "reward_std": 0.5060732960700989, + "rewards/reasoning_reward/mean": 0.8402777314186096, + "rewards/reasoning_reward/std": 0.5349594354629517, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 183.58334350585938, + "completions/mean_terminated_length": 183.58334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.07652533609100311, + "grad_norm": 2.6240055600050596, + "kl": 0.017578125, + "learning_rate": 9.86004257084414e-07, + "loss": 0.0007, + "num_tokens": 6027409.0, + "reward": 0.8333333730697632, + "reward_std": 0.24339044094085693, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.5247498154640198, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 150.0416717529297, + "completions/mean_terminated_length": 150.0416717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.07755946225439504, + "grad_norm": 3.592297508316741, + "kl": 0.01373291015625, + "learning_rate": 9.856200468881274e-07, + "loss": 0.0006, + "num_tokens": 6110490.0, + "reward": 1.0, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.7223151326179504, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 154.4166717529297, + "completions/mean_terminated_length": 154.4166717529297, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.07859358841778696, + "grad_norm": 9.783346222849755, + "kl": 0.01300048828125, + "learning_rate": 9.85230711111777e-07, + "loss": 0.0005, + "num_tokens": 6188476.0, + "reward": 0.75, + "reward_std": 0.5423438549041748, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 180.83334350585938, + "completions/mean_terminated_length": 180.83334350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.0796277145811789, + "grad_norm": 4.348565558470908, + "kl": 0.0186767578125, + "learning_rate": 9.848362538646898e-07, + "loss": 0.0007, + "num_tokens": 6272632.0, + "reward": 1.2083333730697632, + "reward_std": 0.33723291754722595, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 213.70834350585938, + "completions/mean_terminated_length": 213.70834350585938, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.08066184074457083, + "grad_norm": 3.2258066953920532, + "kl": 0.01544189453125, + "learning_rate": 9.844366793102487e-07, + "loss": 0.0006, + "num_tokens": 6361513.0, + "reward": 1.3958333730697632, + "reward_std": 0.13607725501060486, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.32900264859199524, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 173.08334350585938, + "completions/mean_terminated_length": 173.08334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.08169596690796277, + "grad_norm": 3.863377237898613, + "kl": 0.01416015625, + "learning_rate": 9.840319916658487e-07, + "loss": 0.0006, + "num_tokens": 6446875.0, + "reward": 0.868055522441864, + "reward_std": 0.4124408960342407, + "rewards/reasoning_reward/mean": 0.868055522441864, + "rewards/reasoning_reward/std": 0.5793948769569397, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 166.0416717529297, + "completions/mean_terminated_length": 166.0416717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.0827300930713547, + "grad_norm": 3.9380960605825535, + "kl": 0.0107421875, + "learning_rate": 9.836221952028512e-07, + "loss": 0.0004, + "num_tokens": 6526732.0, + "reward": 0.4166666865348816, + "reward_std": 0.46854168176651, + "rewards/reasoning_reward/mean": 0.4166666567325592, + "rewards/reasoning_reward/std": 0.5036101937294006, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 203.2916717529297, + "completions/mean_terminated_length": 203.2916717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.08376421923474664, + "grad_norm": 2.968433728304001, + "kl": 0.013427734375, + "learning_rate": 9.832072942465403e-07, + "loss": 0.0005, + "num_tokens": 6613659.0, + "reward": 1.0, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.8340576887130737, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.08479834539813857, + "grad_norm": 4.153184905003268, + "kl": 0.01348876953125, + "learning_rate": 9.827872931760762e-07, + "loss": 0.0005, + "num_tokens": 6692039.0, + "reward": 1.0416667461395264, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.7506036162376404, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.08583247156153051, + "grad_norm": 3.143774778506864, + "kl": 0.01519775390625, + "learning_rate": 9.823621964244499e-07, + "loss": 0.0006, + "num_tokens": 6776582.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.48900964856147766, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 163.75, + "completions/mean_terminated_length": 163.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.08686659772492245, + "grad_norm": 3.829881249796472, + "kl": 0.01422119140625, + "learning_rate": 9.81932008478435e-07, + "loss": 0.0006, + "num_tokens": 6853528.0, + "reward": 0.9583333730697632, + "reward_std": 0.4563409090042114, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.4871538281440735, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 198.33334350585938, + "completions/mean_terminated_length": 198.33334350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.08790072388831438, + "grad_norm": 4.080367426621312, + "kl": 0.01171875, + "learning_rate": 9.814967338785423e-07, + "loss": 0.0005, + "num_tokens": 6934072.0, + "reward": 0.8125, + "reward_std": 0.5672780275344849, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.6280721426010132, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 152.08334350585938, + "completions/mean_terminated_length": 152.08334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.0889348500517063, + "grad_norm": 2.3347185584192713, + "kl": 0.01202392578125, + "learning_rate": 9.810563772189695e-07, + "loss": 0.0005, + "num_tokens": 7012290.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 228.3478240966797, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.08996897621509824, + "grad_norm": 2.48983908009315, + "kl": 0.01373291015625, + "learning_rate": 9.806109431475548e-07, + "loss": 0.0006, + "num_tokens": 7092732.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 198.5416717529297, + "completions/mean_terminated_length": 198.5416717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.09100310237849017, + "grad_norm": 3.8019105243494167, + "kl": 0.01519775390625, + "learning_rate": 9.80160436365727e-07, + "loss": 0.0006, + "num_tokens": 7170505.0, + "reward": 0.4791666865348816, + "reward_std": 0.5551774501800537, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.5800893306732178, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 198.6666717529297, + "completions/mean_terminated_length": 198.6666717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.09203722854188211, + "grad_norm": 4.310499510741145, + "kl": 0.0174560546875, + "learning_rate": 9.797048616284557e-07, + "loss": 0.0007, + "num_tokens": 7256361.0, + "reward": 0.7986111044883728, + "reward_std": 0.5306869745254517, + "rewards/reasoning_reward/mean": 0.7986111044883728, + "rewards/reasoning_reward/std": 0.6984785795211792, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 169.45834350585938, + "completions/mean_terminated_length": 168.86956787109375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.09307135470527404, + "grad_norm": 3.256888913160928, + "kl": 0.01373291015625, + "learning_rate": 9.792442237442013e-07, + "loss": 0.0006, + "num_tokens": 7333220.0, + "reward": 0.75, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 246.45834350585938, + "completions/mean_terminated_length": 246.45834350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.09410548086866598, + "grad_norm": 4.176429595136656, + "kl": 0.0230712890625, + "learning_rate": 9.787785275748643e-07, + "loss": 0.0009, + "num_tokens": 7413559.0, + "reward": 1.2291667461395264, + "reward_std": 0.6453183889389038, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.7220014929771423, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 186.70834350585938, + "completions/mean_terminated_length": 182.6521759033203, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.09513960703205791, + "grad_norm": 4.0765925380672225, + "kl": 0.0146484375, + "learning_rate": 9.783077780357338e-07, + "loss": 0.0006, + "num_tokens": 7494080.0, + "reward": 0.680555522441864, + "reward_std": 0.5991425514221191, + "rewards/reasoning_reward/mean": 0.680555522441864, + "rewards/reasoning_reward/std": 0.6077531576156616, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.09617373319544985, + "grad_norm": 3.060802289470635, + "kl": 0.016845703125, + "learning_rate": 9.778319800954364e-07, + "loss": 0.0007, + "num_tokens": 7577778.0, + "reward": 0.9236111044883728, + "reward_std": 0.4386470317840576, + "rewards/reasoning_reward/mean": 0.9236111044883728, + "rewards/reasoning_reward/std": 0.533829391002655, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 172.45834350585938, + "completions/mean_terminated_length": 172.45834350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.09720785935884178, + "grad_norm": 5.03422567888706, + "kl": 0.01513671875, + "learning_rate": 9.773511387758821e-07, + "loss": 0.0006, + "num_tokens": 7656581.0, + "reward": 0.8541666865348816, + "reward_std": 0.4434394836425781, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.878229022026062, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 179.9166717529297, + "completions/mean_terminated_length": 179.9166717529297, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.09824198552223372, + "grad_norm": 3.3926662846097773, + "kl": 0.01263427734375, + "learning_rate": 9.768652591522133e-07, + "loss": 0.0005, + "num_tokens": 7735547.0, + "reward": 0.75, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 217.5416717529297, + "completions/mean_terminated_length": 217.5416717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.09927611168562564, + "grad_norm": 4.22083012756532, + "kl": 0.0191650390625, + "learning_rate": 9.763743463527496e-07, + "loss": 0.0008, + "num_tokens": 7819328.0, + "reward": 0.7083333730697632, + "reward_std": 0.45032867789268494, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5500329732894897, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.10031023784901758, + "grad_norm": 3.7501772543377743, + "kl": 0.0177001953125, + "learning_rate": 9.758784055589346e-07, + "loss": 0.0007, + "num_tokens": 7897641.0, + "reward": 0.625, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 172.25, + "completions/mean_terminated_length": 172.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.10134436401240951, + "grad_norm": 4.3294563512717215, + "kl": 0.0191650390625, + "learning_rate": 9.753774420052807e-07, + "loss": 0.0008, + "num_tokens": 7974623.0, + "reward": 0.8958333730697632, + "reward_std": 0.4929513931274414, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.5311834216117859, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 194.25, + "completions/mean_terminated_length": 194.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.10237849017580145, + "grad_norm": 4.1235008690948165, + "kl": 0.0220947265625, + "learning_rate": 9.748714609793147e-07, + "loss": 0.0009, + "num_tokens": 8054901.0, + "reward": 0.4166666865348816, + "reward_std": 0.5099153518676758, + "rewards/reasoning_reward/mean": 0.4166666567325592, + "rewards/reasoning_reward/std": 0.524749755859375, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 205.58334350585938, + "completions/mean_terminated_length": 205.58334350585938, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.10341261633919338, + "grad_norm": 3.113383684598118, + "kl": 0.01806640625, + "learning_rate": 9.743604678215205e-07, + "loss": 0.0007, + "num_tokens": 8136203.0, + "reward": 0.7777777910232544, + "reward_std": 0.3232055902481079, + "rewards/reasoning_reward/mean": 0.7777777314186096, + "rewards/reasoning_reward/std": 0.8145219683647156, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.10444674250258532, + "grad_norm": 3.7183447125419007, + "kl": 0.0211181640625, + "learning_rate": 9.738444679252843e-07, + "loss": 0.0008, + "num_tokens": 8215654.0, + "reward": 0.4791666865348816, + "reward_std": 0.4671442210674286, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.5413181781768799, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.10548086866597725, + "grad_norm": 3.1873320574218478, + "kl": 0.02392578125, + "learning_rate": 9.733234667368368e-07, + "loss": 0.001, + "num_tokens": 8304221.0, + "reward": 1.1388888359069824, + "reward_std": 0.24966806173324585, + "rewards/reasoning_reward/mean": 1.1388888359069824, + "rewards/reasoning_reward/std": 0.5353825092315674, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 253.0, + "completions/mean_terminated_length": 253.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.10651499482936919, + "grad_norm": 3.681506481032909, + "kl": 0.016357421875, + "learning_rate": 9.727974697551958e-07, + "loss": 0.0007, + "num_tokens": 8384285.0, + "reward": 0.4236111044883728, + "reward_std": 0.47574663162231445, + "rewards/reasoning_reward/mean": 0.4236110746860504, + "rewards/reasoning_reward/std": 0.49874040484428406, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 239.0416717529297, + "completions/mean_terminated_length": 239.0416717529297, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.10754912099276112, + "grad_norm": 2.7098677939554507, + "kl": 0.0281982421875, + "learning_rate": 9.722664825321082e-07, + "loss": 0.0011, + "num_tokens": 8481646.0, + "reward": 1.2916667461395264, + "reward_std": 0.1498909741640091, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.6004225611686707, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.10858324715615306, + "grad_norm": 3.7431786732254246, + "kl": 0.0157470703125, + "learning_rate": 9.717305106719916e-07, + "loss": 0.0006, + "num_tokens": 8560781.0, + "reward": 0.7083333730697632, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5500329732894897, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 186.20834350585938, + "completions/mean_terminated_length": 186.20834350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.10961737331954498, + "grad_norm": 3.620350204008683, + "kl": 0.033447265625, + "learning_rate": 9.71189559831875e-07, + "loss": 0.0013, + "num_tokens": 8650034.0, + "reward": 1.6875, + "reward_std": 0.33768826723098755, + "rewards/reasoning_reward/mean": 1.6875, + "rewards/reasoning_reward/std": 0.4618605971336365, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 187.70834350585938, + "completions/mean_terminated_length": 187.70834350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.11065149948293691, + "grad_norm": 2.325484346203483, + "kl": 0.0216064453125, + "learning_rate": 9.70643635721339e-07, + "loss": 0.0009, + "num_tokens": 8727995.0, + "reward": 1.0625, + "reward_std": 0.19795583188533783, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.3398369252681732, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 206.5416717529297, + "completions/mean_terminated_length": 206.5416717529297, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.11168562564632885, + "grad_norm": 3.929289600039794, + "kl": 0.0341796875, + "learning_rate": 9.70092744102456e-07, + "loss": 0.0014, + "num_tokens": 8817472.0, + "reward": 1.4305555820465088, + "reward_std": 0.32358893752098083, + "rewards/reasoning_reward/mean": 1.4305554628372192, + "rewards/reasoning_reward/std": 0.5222504734992981, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.11271975180972078, + "grad_norm": 3.692465779722894, + "kl": 0.0284423828125, + "learning_rate": 9.695368907897286e-07, + "loss": 0.0011, + "num_tokens": 8910545.0, + "reward": 1.3958333730697632, + "reward_std": 0.31444376707077026, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.5243180394172668, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 200.95834350585938, + "completions/mean_terminated_length": 200.95834350585938, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.11375387797311272, + "grad_norm": 3.73349102770315, + "kl": 0.037109375, + "learning_rate": 9.689760816500284e-07, + "loss": 0.0015, + "num_tokens": 9005792.0, + "reward": 1.1666667461395264, + "reward_std": 0.31443244218826294, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 228.95834350585938, + "completions/mean_terminated_length": 228.95834350585938, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.11478800413650465, + "grad_norm": 2.8820257320459106, + "kl": 0.0201416015625, + "learning_rate": 9.684103226025355e-07, + "loss": 0.0008, + "num_tokens": 9088655.0, + "reward": 0.875, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5565811395645142, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 204.58334350585938, + "completions/mean_terminated_length": 204.58334350585938, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.11582213029989659, + "grad_norm": 1.9153457674664132, + "kl": 0.0242919921875, + "learning_rate": 9.678396196186738e-07, + "loss": 0.001, + "num_tokens": 9172229.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 228.08334350585938, + "completions/mean_terminated_length": 228.08334350585938, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.11685625646328852, + "grad_norm": 4.091930192339665, + "kl": 0.0234375, + "learning_rate": 9.67263978722049e-07, + "loss": 0.0009, + "num_tokens": 9250847.0, + "reward": 0.7361111044883728, + "reward_std": 0.4770033359527588, + "rewards/reasoning_reward/mean": 0.7361111044883728, + "rewards/reasoning_reward/std": 0.581345796585083, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 174.7916717529297, + "completions/mean_terminated_length": 174.7916717529297, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.11789038262668046, + "grad_norm": 3.8333319654298315, + "kl": 0.0223388671875, + "learning_rate": 9.666834059883856e-07, + "loss": 0.0009, + "num_tokens": 9336570.0, + "reward": 0.7708333730697632, + "reward_std": 0.50337815284729, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 215.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.1189245087900724, + "grad_norm": 4.4406021802940545, + "kl": 0.0303955078125, + "learning_rate": 9.66097907545462e-07, + "loss": 0.0012, + "num_tokens": 9416634.0, + "reward": 0.8472222089767456, + "reward_std": 0.5322877764701843, + "rewards/reasoning_reward/mean": 0.8472221493721008, + "rewards/reasoning_reward/std": 0.5644814372062683, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 155.5416717529297, + "completions/mean_terminated_length": 155.5416717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.11995863495346432, + "grad_norm": 2.161252175635768, + "kl": 0.017822265625, + "learning_rate": 9.655074895730462e-07, + "loss": 0.0007, + "num_tokens": 9501367.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 185.33334350585938, + "completions/mean_terminated_length": 185.33334350585938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12099276111685625, + "grad_norm": 4.450081255781044, + "kl": 0.0238037109375, + "learning_rate": 9.649121583028299e-07, + "loss": 0.0009, + "num_tokens": 9580399.0, + "reward": 0.8125, + "reward_std": 0.5099677443504333, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.5067479610443115, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 211.25, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.12202688728024819, + "grad_norm": 3.213840485912054, + "kl": 0.025390625, + "learning_rate": 9.643119200183637e-07, + "loss": 0.001, + "num_tokens": 9669941.0, + "reward": 1.1458333730697632, + "reward_std": 0.24056154489517212, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.7144345045089722, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.12306101344364012, + "grad_norm": 3.122769680653565, + "kl": 0.0184326171875, + "learning_rate": 9.637067810549906e-07, + "loss": 0.0007, + "num_tokens": 9749513.0, + "reward": 1.0555555820465088, + "reward_std": 0.2238859385251999, + "rewards/reasoning_reward/mean": 1.0555554628372192, + "rewards/reasoning_reward/std": 0.31724458932876587, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 178.7916717529297, + "completions/mean_terminated_length": 178.7916717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.12409513960703206, + "grad_norm": 4.639227790783391, + "kl": 0.02978515625, + "learning_rate": 9.63096747799778e-07, + "loss": 0.0012, + "num_tokens": 9838452.0, + "reward": 1.4583333730697632, + "reward_std": 0.5078567266464233, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 131.83334350585938, + "completions/mean_terminated_length": 131.83334350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.125129265770424, + "grad_norm": 2.4639138077233618, + "kl": 0.0234375, + "learning_rate": 9.624818266914519e-07, + "loss": 0.0009, + "num_tokens": 9916816.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 232.9166717529297, + "completions/mean_terminated_length": 232.9166717529297, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.12616339193381593, + "grad_norm": 3.8580416088238216, + "kl": 0.0281982421875, + "learning_rate": 9.618620242203278e-07, + "loss": 0.0011, + "num_tokens": 10002838.0, + "reward": 1.1388888359069824, + "reward_std": 0.3499924838542938, + "rewards/reasoning_reward/mean": 1.1388888359069824, + "rewards/reasoning_reward/std": 0.5870310664176941, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 173.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.12719751809720786, + "grad_norm": 2.83715772505631, + "kl": 0.016845703125, + "learning_rate": 9.612373469282428e-07, + "loss": 0.0007, + "num_tokens": 10081507.0, + "reward": 0.7916666865348816, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.6580052971839905, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 144.33334350585938, + "completions/mean_terminated_length": 144.33334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1282316442605998, + "grad_norm": 1.8523241506048544, + "kl": 0.0299072265625, + "learning_rate": 9.606078014084863e-07, + "loss": 0.0012, + "num_tokens": 10165467.0, + "reward": 0.8333333730697632, + "reward_std": 0.19920477271080017, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 153.875, + "completions/mean_terminated_length": 153.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.12926577042399173, + "grad_norm": 3.493963599295082, + "kl": 0.03076171875, + "learning_rate": 9.5997339430573e-07, + "loss": 0.0012, + "num_tokens": 10241968.0, + "reward": 0.7083333730697632, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 153.33334350585938, + "completions/mean_terminated_length": 153.33334350585938, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.13029989658738367, + "grad_norm": 3.815467992926747, + "kl": 0.02294921875, + "learning_rate": 9.59334132315959e-07, + "loss": 0.0009, + "num_tokens": 10321768.0, + "reward": 0.7916666865348816, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 184.9166717529297, + "completions/mean_terminated_length": 184.9166717529297, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1313340227507756, + "grad_norm": 3.1667726238296843, + "kl": 0.020751953125, + "learning_rate": 9.586900221863996e-07, + "loss": 0.0008, + "num_tokens": 10400854.0, + "reward": 0.7083333730697632, + "reward_std": 0.6100153923034668, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.6064269542694092, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 164.0416717529297, + "completions/mean_terminated_length": 164.0416717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.13236814891416754, + "grad_norm": 3.1674735991804623, + "kl": 0.0181884765625, + "learning_rate": 9.580410707154494e-07, + "loss": 0.0007, + "num_tokens": 10481695.0, + "reward": 0.5625, + "reward_std": 0.28302299976348877, + "rewards/reasoning_reward/mean": 0.5625, + "rewards/reasoning_reward/std": 0.49590715765953064, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.13340227507755947, + "grad_norm": 2.4728156183528083, + "kl": 0.02294921875, + "learning_rate": 9.573872847526048e-07, + "loss": 0.0009, + "num_tokens": 10561455.0, + "reward": 1.0, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 178.875, + "completions/mean_terminated_length": 178.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.1344364012409514, + "grad_norm": 4.017961732422752, + "kl": 0.0291748046875, + "learning_rate": 9.567286711983885e-07, + "loss": 0.0012, + "num_tokens": 10641076.0, + "reward": 0.6736111044883728, + "reward_std": 0.45231467485427856, + "rewards/reasoning_reward/mean": 0.6736111044883728, + "rewards/reasoning_reward/std": 0.5948230028152466, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 200.9166717529297, + "completions/mean_terminated_length": 198.9130401611328, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.13547052740434332, + "grad_norm": 3.138708740315387, + "kl": 0.0189208984375, + "learning_rate": 9.560652370042771e-07, + "loss": 0.0008, + "num_tokens": 10719122.0, + "reward": 1.0, + "reward_std": 0.19500279426574707, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.2553769648075104, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 149.75, + "completions/mean_terminated_length": 149.75, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.13650465356773525, + "grad_norm": 3.840304727799911, + "kl": 0.0203857421875, + "learning_rate": 9.553969891726289e-07, + "loss": 0.0008, + "num_tokens": 10799388.0, + "reward": 1.0833333730697632, + "reward_std": 0.4198887050151825, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 159.83334350585938, + "completions/mean_terminated_length": 159.83334350585938, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1375387797311272, + "grad_norm": 1.7596178724801423, + "kl": 0.01904296875, + "learning_rate": 9.547239347566068e-07, + "loss": 0.0008, + "num_tokens": 10880672.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 188.58334350585938, + "completions/mean_terminated_length": 188.58334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.13857290589451912, + "grad_norm": 4.414440886337846, + "kl": 0.0269775390625, + "learning_rate": 9.540460808601069e-07, + "loss": 0.0011, + "num_tokens": 10958862.0, + "reward": 1.625, + "reward_std": 0.19801273941993713, + "rewards/reasoning_reward/mean": 1.625, + "rewards/reasoning_reward/std": 0.3686048984527588, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 137.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.13960703205791106, + "grad_norm": 0.18154064062767528, + "kl": 0.0322265625, + "learning_rate": 9.533634346376827e-07, + "loss": 0.0013, + "num_tokens": 11035034.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 126.16667175292969, + "completions/mean_terminated_length": 126.16667175292969, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.140641158221303, + "grad_norm": 4.281841546042445, + "kl": 0.01324462890625, + "learning_rate": 9.526760032944687e-07, + "loss": 0.0005, + "num_tokens": 11114766.0, + "reward": 0.5416666865348816, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.14167528438469493, + "grad_norm": 3.112436935614297, + "kl": 0.0211181640625, + "learning_rate": 9.519837940861051e-07, + "loss": 0.0008, + "num_tokens": 11198851.0, + "reward": 0.75, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 164.375, + "completions/mean_terminated_length": 164.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.14270941054808686, + "grad_norm": 2.1893308302361865, + "kl": 0.020751953125, + "learning_rate": 9.512868143186614e-07, + "loss": 0.0008, + "num_tokens": 11274508.0, + "reward": 0.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 148.4166717529297, + "completions/mean_terminated_length": 148.4166717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1437435367114788, + "grad_norm": 1.9943583574276684, + "kl": 0.03076171875, + "learning_rate": 9.505850713485588e-07, + "loss": 0.0012, + "num_tokens": 11355774.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 177.58334350585938, + "completions/mean_terminated_length": 177.58334350585938, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.14477766287487073, + "grad_norm": 3.019726108756687, + "kl": 0.0269775390625, + "learning_rate": 9.498785725824927e-07, + "loss": 0.0011, + "num_tokens": 11432412.0, + "reward": 0.7916666865348816, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 138.875, + "completions/mean_terminated_length": 138.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.14581178903826267, + "grad_norm": 3.4892636769097214, + "kl": 0.0234375, + "learning_rate": 9.491673254773544e-07, + "loss": 0.0009, + "num_tokens": 11508697.0, + "reward": 0.4166666865348816, + "reward_std": 0.40397143363952637, + "rewards/reasoning_reward/mean": 0.4166666567325592, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 186.9166717529297, + "completions/mean_terminated_length": 186.9166717529297, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.1468459152016546, + "grad_norm": 2.4711223309041235, + "kl": 0.0299072265625, + "learning_rate": 9.484513375401531e-07, + "loss": 0.0012, + "num_tokens": 11589767.0, + "reward": 0.8472222685813904, + "reward_std": 0.3876555860042572, + "rewards/reasoning_reward/mean": 0.8472222685813904, + "rewards/reasoning_reward/std": 0.6444048285484314, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 137.4166717529297, + "completions/mean_terminated_length": 137.4166717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.14788004136504654, + "grad_norm": 3.9836312904992175, + "kl": 0.0179443359375, + "learning_rate": 9.477306163279353e-07, + "loss": 0.0007, + "num_tokens": 11670673.0, + "reward": 1.0416667461395264, + "reward_std": 0.43810173869132996, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.6064269542694092, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.14891416752843847, + "grad_norm": 3.084825540437868, + "kl": 0.042724609375, + "learning_rate": 9.470051694477066e-07, + "loss": 0.0017, + "num_tokens": 11757628.0, + "reward": 0.868055522441864, + "reward_std": 0.2675723433494568, + "rewards/reasoning_reward/mean": 0.868055522441864, + "rewards/reasoning_reward/std": 0.6331791877746582, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 136.83334350585938, + "completions/mean_terminated_length": 136.83334350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.1499482936918304, + "grad_norm": 3.603155546437747, + "kl": 0.0255126953125, + "learning_rate": 9.462750045563502e-07, + "loss": 0.001, + "num_tokens": 11835120.0, + "reward": 0.7291666865348816, + "reward_std": 0.33108004927635193, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.48854634165763855, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.15098241985522234, + "grad_norm": 3.781720407335475, + "kl": 0.060302734375, + "learning_rate": 9.45540129360547e-07, + "loss": 0.0024, + "num_tokens": 11919284.0, + "reward": 1.3541667461395264, + "reward_std": 0.25392836332321167, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.4293363690376282, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 159.2916717529297, + "completions/mean_terminated_length": 159.2916717529297, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.15201654601861428, + "grad_norm": 3.8293089763902644, + "kl": 0.04296875, + "learning_rate": 9.448005516166934e-07, + "loss": 0.0017, + "num_tokens": 12002627.0, + "reward": 0.9583333730697632, + "reward_std": 0.45032867789268494, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.6064269542694092, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 245.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 172.5, + "completions/mean_terminated_length": 169.3478240966797, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.15305067218200621, + "grad_norm": 2.4437113547282188, + "kl": 0.0250244140625, + "learning_rate": 9.4405627913082e-07, + "loss": 0.001, + "num_tokens": 12082679.0, + "reward": 0.7916666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 141.375, + "completions/mean_terminated_length": 141.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.15408479834539815, + "grad_norm": 3.1876345769180663, + "kl": 0.036865234375, + "learning_rate": 9.433073197585089e-07, + "loss": 0.0015, + "num_tokens": 12166192.0, + "reward": 1.125, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 186.875, + "completions/mean_terminated_length": 185.478271484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.15511892450879008, + "grad_norm": 2.8956791515173244, + "kl": 0.0299072265625, + "learning_rate": 9.425536814048112e-07, + "loss": 0.0012, + "num_tokens": 12246877.0, + "reward": 1.1875, + "reward_std": 0.24052315950393677, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.4922405183315277, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 141.5, + "completions/mean_terminated_length": 141.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.15615305067218202, + "grad_norm": 3.578662637602958, + "kl": 0.038330078125, + "learning_rate": 9.417953720241633e-07, + "loss": 0.0015, + "num_tokens": 12331145.0, + "reward": 0.875, + "reward_std": 0.367926687002182, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 155.83334350585938, + "completions/mean_terminated_length": 155.83334350585938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.15718717683557393, + "grad_norm": 2.4928642366111045, + "kl": 0.040283203125, + "learning_rate": 9.410323996203026e-07, + "loss": 0.0016, + "num_tokens": 12413805.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 163.20834350585938, + "completions/mean_terminated_length": 163.20834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.15822130299896586, + "grad_norm": 2.0203710118248086, + "kl": 0.0213623046875, + "learning_rate": 9.402647722461838e-07, + "loss": 0.0009, + "num_tokens": 12485866.0, + "reward": 0.5, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 182.1666717529297, + "completions/mean_terminated_length": 182.1666717529297, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1592554291623578, + "grad_norm": 2.745126246673903, + "kl": 0.02880859375, + "learning_rate": 9.394924980038931e-07, + "loss": 0.0012, + "num_tokens": 12564518.0, + "reward": 1.0347223281860352, + "reward_std": 0.2400643527507782, + "rewards/reasoning_reward/mean": 1.0347222089767456, + "rewards/reasoning_reward/std": 0.3184320330619812, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 167.95834350585938, + "completions/mean_terminated_length": 167.95834350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.16028955532574973, + "grad_norm": 3.2611636461773656, + "kl": 0.01806640625, + "learning_rate": 9.387155850445634e-07, + "loss": 0.0007, + "num_tokens": 12646309.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 162.9166717529297, + "completions/mean_terminated_length": 162.9166717529297, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.16132368148914167, + "grad_norm": 3.5046780041435315, + "kl": 0.049072265625, + "learning_rate": 9.379340415682877e-07, + "loss": 0.002, + "num_tokens": 12728907.0, + "reward": 0.8333333730697632, + "reward_std": 0.3616904020309448, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7755316495895386, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 158.0416717529297, + "completions/mean_terminated_length": 158.0416717529297, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1623578076525336, + "grad_norm": 3.0934531885922616, + "kl": 0.049072265625, + "learning_rate": 9.371478758240327e-07, + "loss": 0.002, + "num_tokens": 12819820.0, + "reward": 1.2708333730697632, + "reward_std": 0.21322892606258392, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.6075461506843567, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 149.75, + "completions/mean_terminated_length": 149.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.16339193381592554, + "grad_norm": 0.2825124812867749, + "kl": 0.03076171875, + "learning_rate": 9.363570961095522e-07, + "loss": 0.0012, + "num_tokens": 12902054.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.16442605997931747, + "grad_norm": 3.4620590744937343, + "kl": 0.029296875, + "learning_rate": 9.355617107712988e-07, + "loss": 0.0012, + "num_tokens": 12981463.0, + "reward": 0.875, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 186.20834350585938, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.1654601861427094, + "grad_norm": 3.67117533558172, + "kl": 0.056396484375, + "learning_rate": 9.347617282043361e-07, + "loss": 0.0022, + "num_tokens": 13061260.0, + "reward": 1.0625, + "reward_std": 0.294627845287323, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.39870715141296387, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 157.25, + "completions/mean_terminated_length": 157.25, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.16649431230610134, + "grad_norm": 3.8432953762387436, + "kl": 0.033935546875, + "learning_rate": 9.339571568522502e-07, + "loss": 0.0014, + "num_tokens": 13139362.0, + "reward": 1.1041667461395264, + "reward_std": 0.3766257166862488, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.642332136631012, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.16752843846949328, + "grad_norm": 2.827018325830448, + "kl": 0.0224609375, + "learning_rate": 9.331480052070606e-07, + "loss": 0.0009, + "num_tokens": 13221897.0, + "reward": 1.0625, + "reward_std": 0.24185511469841003, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.3398369252681732, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 151.375, + "completions/mean_terminated_length": 151.375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.16856256463288521, + "grad_norm": 3.849615825816937, + "kl": 0.03857421875, + "learning_rate": 9.323342818091307e-07, + "loss": 0.0015, + "num_tokens": 13309802.0, + "reward": 0.8472222089767456, + "reward_std": 0.38511237502098083, + "rewards/reasoning_reward/mean": 0.8472221493721008, + "rewards/reasoning_reward/std": 0.8074482083320618, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 214.6666717529297, + "completions/mean_terminated_length": 214.6666717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.16959669079627715, + "grad_norm": 4.0101406070036925, + "kl": 0.02783203125, + "learning_rate": 9.315159952470765e-07, + "loss": 0.0011, + "num_tokens": 13391154.0, + "reward": 0.7083333730697632, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5299029350280762, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 175.1666717529297, + "completions/mean_terminated_length": 175.1666717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.17063081695966908, + "grad_norm": 3.5535731972038187, + "kl": 0.0289306640625, + "learning_rate": 9.306931541576783e-07, + "loss": 0.0012, + "num_tokens": 13476590.0, + "reward": 1.2291667461395264, + "reward_std": 0.4883233904838562, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 200.0, + "completions/mean_terminated_length": 200.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.17166494312306102, + "grad_norm": 2.8543563729941903, + "kl": 0.041259765625, + "learning_rate": 9.29865767225787e-07, + "loss": 0.0017, + "num_tokens": 13560134.0, + "reward": 0.9305555820465088, + "reward_std": 0.25030583143234253, + "rewards/reasoning_reward/mean": 0.9305555820465088, + "rewards/reasoning_reward/std": 0.7659995555877686, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 191.4166717529297, + "completions/mean_terminated_length": 191.4166717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.17269906928645296, + "grad_norm": 4.219399972626256, + "kl": 0.0240478515625, + "learning_rate": 9.29033843184234e-07, + "loss": 0.001, + "num_tokens": 13637928.0, + "reward": 0.6041666865348816, + "reward_std": 0.5158624053001404, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.551266610622406, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 137.1666717529297, + "completions/mean_terminated_length": 137.1666717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.1737331954498449, + "grad_norm": 4.058689025510401, + "kl": 0.038330078125, + "learning_rate": 9.281973908137385e-07, + "loss": 0.0015, + "num_tokens": 13723524.0, + "reward": 0.875, + "reward_std": 0.5222300291061401, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 148.0, + "completions/mean_terminated_length": 148.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.17476732161323683, + "grad_norm": 2.82936642039495, + "kl": 0.0220947265625, + "learning_rate": 9.273564189428149e-07, + "loss": 0.0009, + "num_tokens": 13809044.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 149.875, + "completions/mean_terminated_length": 149.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.17580144777662876, + "grad_norm": 3.7043384803322907, + "kl": 0.0245361328125, + "learning_rate": 9.265109364476798e-07, + "loss": 0.001, + "num_tokens": 13892353.0, + "reward": 0.75, + "reward_std": 0.41387641429901123, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 216.69566345214844, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1768355739400207, + "grad_norm": 3.0294206597876343, + "kl": 0.032470703125, + "learning_rate": 9.256609522521578e-07, + "loss": 0.0013, + "num_tokens": 13976627.0, + "reward": 0.75, + "reward_std": 0.3142080307006836, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.8135328888893127, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1778697001034126, + "grad_norm": 3.1395182912560826, + "kl": 0.03466796875, + "learning_rate": 9.248064753275881e-07, + "loss": 0.0014, + "num_tokens": 14057114.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 139.2916717529297, + "completions/mean_terminated_length": 139.2916717529297, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.17890382626680454, + "grad_norm": 2.2716748016429715, + "kl": 0.0263671875, + "learning_rate": 9.239475146927289e-07, + "loss": 0.0011, + "num_tokens": 14135657.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 224.5, + "completions/mean_terminated_length": 224.5, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.17993795243019647, + "grad_norm": 3.706550911084408, + "kl": 0.0322265625, + "learning_rate": 9.23084079413663e-07, + "loss": 0.0013, + "num_tokens": 14214229.0, + "reward": 1.0833333730697632, + "reward_std": 0.5276275873184204, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5472813844680786, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 160.45834350585938, + "completions/mean_terminated_length": 160.45834350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.1809720785935884, + "grad_norm": 3.8563106320766187, + "kl": 0.0223388671875, + "learning_rate": 9.222161786037017e-07, + "loss": 0.0009, + "num_tokens": 14292528.0, + "reward": 0.7083333730697632, + "reward_std": 0.36456555128097534, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.18200620475698034, + "grad_norm": 3.522749129891572, + "kl": 0.0235595703125, + "learning_rate": 9.213438214232887e-07, + "loss": 0.0009, + "num_tokens": 14369853.0, + "reward": 0.513888955116272, + "reward_std": 0.4159068465232849, + "rewards/reasoning_reward/mean": 0.5138888955116272, + "rewards/reasoning_reward/std": 0.44482171535491943, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 193.25, + "completions/mean_terminated_length": 189.18182373046875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.18304033092037228, + "grad_norm": 4.470234549353068, + "kl": 0.0299072265625, + "learning_rate": 9.204670170799034e-07, + "loss": 0.0012, + "num_tokens": 14446075.0, + "reward": 0.7222222089767456, + "reward_std": 0.4285862445831299, + "rewards/reasoning_reward/mean": 0.7222221493721008, + "rewards/reasoning_reward/std": 0.7234289646148682, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 218.83334350585938, + "completions/mean_terminated_length": 218.83334350585938, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.18407445708376421, + "grad_norm": 3.680761822106045, + "kl": 0.0284423828125, + "learning_rate": 9.195857748279636e-07, + "loss": 0.0011, + "num_tokens": 14524703.0, + "reward": 0.1666666716337204, + "reward_std": 0.3616904020309448, + "rewards/reasoning_reward/mean": 0.1666666716337204, + "rewards/reasoning_reward/std": 0.3806935250759125, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 144.33334350585938, + "completions/mean_terminated_length": 144.33334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.18510858324715615, + "grad_norm": 2.0002566843886043, + "kl": 0.0277099609375, + "learning_rate": 9.187001039687283e-07, + "loss": 0.0011, + "num_tokens": 14609903.0, + "reward": 1.0833333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 125.45833587646484, + "completions/mean_terminated_length": 125.45833587646484, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.18614270941054809, + "grad_norm": 3.220863102101657, + "kl": 0.0291748046875, + "learning_rate": 9.178100138501987e-07, + "loss": 0.0012, + "num_tokens": 14691482.0, + "reward": 1.0, + "reward_std": 0.24339044094085693, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.3611575663089752, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.18717683557394002, + "grad_norm": 1.9372953316829284, + "kl": 0.0308837890625, + "learning_rate": 9.169155138670202e-07, + "loss": 0.0012, + "num_tokens": 14773179.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 198.33334350585938, + "completions/mean_terminated_length": 198.33334350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.18821096173733196, + "grad_norm": 4.409408065166967, + "kl": 0.032958984375, + "learning_rate": 9.160166134603833e-07, + "loss": 0.0013, + "num_tokens": 14857035.0, + "reward": 1.0625, + "reward_std": 0.41124165058135986, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5379611253738403, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 168.20834350585938, + "completions/mean_terminated_length": 168.20834350585938, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.1892450879007239, + "grad_norm": 2.7356052191198317, + "kl": 0.032470703125, + "learning_rate": 9.151133221179236e-07, + "loss": 0.0013, + "num_tokens": 14934480.0, + "reward": 1.2083333730697632, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 230.5416717529297, + "completions/mean_terminated_length": 230.5416717529297, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.19027921406411583, + "grad_norm": 4.0459885315893445, + "kl": 0.0576171875, + "learning_rate": 9.142056493736214e-07, + "loss": 0.0023, + "num_tokens": 15024701.0, + "reward": 1.0, + "reward_std": 0.5209805369377136, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.7661308646202087, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 284.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 188.33334350585938, + "completions/mean_terminated_length": 184.17391967773438, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.19131334022750776, + "grad_norm": 3.393759487686452, + "kl": 0.042724609375, + "learning_rate": 9.13293604807702e-07, + "loss": 0.0017, + "num_tokens": 15119653.0, + "reward": 1.5833333730697632, + "reward_std": 0.34930619597435, + "rewards/reasoning_reward/mean": 1.5833333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 185.83334350585938, + "completions/mean_terminated_length": 185.83334350585938, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.1923474663908997, + "grad_norm": 1.8836374889884402, + "kl": 0.025390625, + "learning_rate": 9.123771980465336e-07, + "loss": 0.001, + "num_tokens": 15200161.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 211.5, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.19338159255429163, + "grad_norm": 3.7167415446965446, + "kl": 0.03173828125, + "learning_rate": 9.114564387625261e-07, + "loss": 0.0013, + "num_tokens": 15278277.0, + "reward": 0.4652777910232544, + "reward_std": 0.6122889518737793, + "rewards/reasoning_reward/mean": 0.4652777910232544, + "rewards/reasoning_reward/std": 0.6756266355514526, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 201.375, + "completions/mean_terminated_length": 201.375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.19441571871768357, + "grad_norm": 3.074846245023018, + "kl": 0.02587890625, + "learning_rate": 9.105313366740295e-07, + "loss": 0.001, + "num_tokens": 15357622.0, + "reward": 1.1527777910232544, + "reward_std": 0.2801976501941681, + "rewards/reasoning_reward/mean": 1.1527777910232544, + "rewards/reasoning_reward/std": 0.35412225127220154, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.1954498448810755, + "grad_norm": 0.15074496627824105, + "kl": 0.033935546875, + "learning_rate": 9.096019015452303e-07, + "loss": 0.0014, + "num_tokens": 15442109.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 113.41667175292969, + "completions/mean_terminated_length": 113.41667175292969, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.19648397104446744, + "grad_norm": 3.1874055691182623, + "kl": 0.0322265625, + "learning_rate": 9.086681431860492e-07, + "loss": 0.0013, + "num_tokens": 15525791.0, + "reward": 1.0, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.7801894545555115, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.19751809720785937, + "grad_norm": 3.24323335305027, + "kl": 0.0301513671875, + "learning_rate": 9.077300714520377e-07, + "loss": 0.0012, + "num_tokens": 15609755.0, + "reward": 1.1041667461395264, + "reward_std": 0.2965203523635864, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.19855222337125128, + "grad_norm": 3.5456355924138805, + "kl": 0.0306396484375, + "learning_rate": 9.067876962442732e-07, + "loss": 0.0012, + "num_tokens": 15696732.0, + "reward": 1.0, + "reward_std": 0.44887280464172363, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.7939992547035217, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 150.5416717529297, + "completions/mean_terminated_length": 150.5416717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.19958634953464321, + "grad_norm": 2.245179535057254, + "kl": 0.02734375, + "learning_rate": 9.058410275092553e-07, + "loss": 0.0011, + "num_tokens": 15777353.0, + "reward": 0.5833333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 180.625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.20062047569803515, + "grad_norm": 2.2025459230007964, + "kl": 0.033447265625, + "learning_rate": 9.048900752388004e-07, + "loss": 0.0013, + "num_tokens": 15862848.0, + "reward": 0.9166666865348816, + "reward_std": 0.29546841979026794, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.20165460186142709, + "grad_norm": 3.077504641125675, + "kl": 0.04296875, + "learning_rate": 9.039348494699366e-07, + "loss": 0.0017, + "num_tokens": 15945461.0, + "reward": 1.0416667461395264, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.5089774131774902, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 176.45834350585938, + "completions/mean_terminated_length": 176.45834350585938, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.20268872802481902, + "grad_norm": 1.72449286161736, + "kl": 0.0302734375, + "learning_rate": 9.029753602847974e-07, + "loss": 0.0012, + "num_tokens": 16026360.0, + "reward": 1.0208333730697632, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.10206207633018494, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 199.20834350585938, + "completions/mean_terminated_length": 199.20834350585938, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.20372285418821096, + "grad_norm": 3.9491196867976286, + "kl": 0.055908203125, + "learning_rate": 9.020116178105153e-07, + "loss": 0.0022, + "num_tokens": 16115637.0, + "reward": 1.1041667461395264, + "reward_std": 0.5151108503341675, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.7515081167221069, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2047569803516029, + "grad_norm": 2.8175351580004078, + "kl": 0.0289306640625, + "learning_rate": 9.010436322191155e-07, + "loss": 0.0012, + "num_tokens": 16199165.0, + "reward": 0.8472222089767456, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.8472221493721008, + "rewards/reasoning_reward/std": 0.7221757769584656, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 204.58334350585938, + "completions/mean_terminated_length": 204.58334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.20579110651499483, + "grad_norm": 4.291506948811305, + "kl": 0.039794921875, + "learning_rate": 9.000714137274077e-07, + "loss": 0.0016, + "num_tokens": 16277363.0, + "reward": 0.75, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 200.9166717529297, + "completions/mean_terminated_length": 200.9166717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.20682523267838676, + "grad_norm": 3.795582364631847, + "kl": 0.033447265625, + "learning_rate": 8.990949725968786e-07, + "loss": 0.0013, + "num_tokens": 16356617.0, + "reward": 1.1388888359069824, + "reward_std": 0.42590853571891785, + "rewards/reasoning_reward/mean": 1.1388888359069824, + "rewards/reasoning_reward/std": 0.5992480516433716, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 165.7916717529297, + "completions/mean_terminated_length": 165.7916717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.2078593588417787, + "grad_norm": 4.0373942611533495, + "kl": 0.0400390625, + "learning_rate": 8.981143191335839e-07, + "loss": 0.0016, + "num_tokens": 16441756.0, + "reward": 0.930555522441864, + "reward_std": 0.42342984676361084, + "rewards/reasoning_reward/mean": 0.930555522441864, + "rewards/reasoning_reward/std": 0.6843958497047424, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.20889348500517063, + "grad_norm": 4.339046231792284, + "kl": 0.042236328125, + "learning_rate": 8.971294636880391e-07, + "loss": 0.0017, + "num_tokens": 16530366.0, + "reward": 1.4375, + "reward_std": 0.4006626605987549, + "rewards/reasoning_reward/mean": 1.4375, + "rewards/reasoning_reward/std": 0.6753286719322205, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 155.375, + "completions/mean_terminated_length": 155.375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.20992761116856257, + "grad_norm": 2.8570580431999235, + "kl": 0.08154296875, + "learning_rate": 8.961404166551103e-07, + "loss": 0.0033, + "num_tokens": 16615319.0, + "reward": 1.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 150.0, + "completions/mean_terminated_length": 150.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.2109617373319545, + "grad_norm": 2.980300192320368, + "kl": 0.02880859375, + "learning_rate": 8.951471884739051e-07, + "loss": 0.0012, + "num_tokens": 16696703.0, + "reward": 1.1041667461395264, + "reward_std": 0.22466278076171875, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.36052998900413513, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 191.58334350585938, + "completions/mean_terminated_length": 191.58334350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.21199586349534644, + "grad_norm": 4.159029061904166, + "kl": 0.0439453125, + "learning_rate": 8.941497896276613e-07, + "loss": 0.0018, + "num_tokens": 16778829.0, + "reward": 0.8194445371627808, + "reward_std": 0.6607243418693542, + "rewards/reasoning_reward/mean": 0.819444477558136, + "rewards/reasoning_reward/std": 0.7892953753471375, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 219.9166717529297, + "completions/mean_terminated_length": 219.9166717529297, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.21302998965873837, + "grad_norm": 3.6827623626462063, + "kl": 0.06298828125, + "learning_rate": 8.931482306436373e-07, + "loss": 0.0025, + "num_tokens": 16863339.0, + "reward": 1.3958333730697632, + "reward_std": 0.3719491958618164, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.6015529036521912, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2140641158221303, + "grad_norm": 4.5771027363801, + "kl": 0.07666015625, + "learning_rate": 8.921425220930001e-07, + "loss": 0.0031, + "num_tokens": 16942642.0, + "reward": 0.7569444179534912, + "reward_std": 0.5365828275680542, + "rewards/reasoning_reward/mean": 0.7569444179534912, + "rewards/reasoning_reward/std": 0.5223950147628784, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 148.1666717529297, + "completions/mean_terminated_length": 148.1666717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.21509824198552224, + "grad_norm": 4.6067336602799935, + "kl": 0.072265625, + "learning_rate": 8.91132674590715e-07, + "loss": 0.0029, + "num_tokens": 17028814.0, + "reward": 0.9583333730697632, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.46430566906929016, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 161.0416717529297, + "completions/mean_terminated_length": 161.0416717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.21613236814891418, + "grad_norm": 3.888741740906291, + "kl": 0.07861328125, + "learning_rate": 8.901186987954319e-07, + "loss": 0.0031, + "num_tokens": 17112839.0, + "reward": 1.0833333730697632, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5835920572280884, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 190.7916717529297, + "completions/mean_terminated_length": 190.7916717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.2171664943123061, + "grad_norm": 3.5678811440994815, + "kl": 0.041259765625, + "learning_rate": 8.891006054093739e-07, + "loss": 0.0016, + "num_tokens": 17193130.0, + "reward": 0.9583333730697632, + "reward_std": 0.24800793826580048, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.6902530789375305, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 209.75, + "completions/mean_terminated_length": 209.75, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.21820062047569805, + "grad_norm": 3.3232953547887, + "kl": 0.046142578125, + "learning_rate": 8.880784051782243e-07, + "loss": 0.0018, + "num_tokens": 17271196.0, + "reward": 0.9791666865348816, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.2321528047323227, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 200.7916717529297, + "completions/mean_terminated_length": 200.7916717529297, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.21923474663908996, + "grad_norm": 3.17766425227838, + "kl": 0.033935546875, + "learning_rate": 8.870521088910129e-07, + "loss": 0.0014, + "num_tokens": 17348999.0, + "reward": 0.5208333730697632, + "reward_std": 0.30699092149734497, + "rewards/reasoning_reward/mean": 0.5208333134651184, + "rewards/reasoning_reward/std": 0.49954691529273987, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 192.70834350585938, + "completions/mean_terminated_length": 192.70834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.2202688728024819, + "grad_norm": 3.8551099627848244, + "kl": 0.04296875, + "learning_rate": 8.860217273800021e-07, + "loss": 0.0017, + "num_tokens": 17428608.0, + "reward": 0.9791666865348816, + "reward_std": 0.5464199781417847, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.5985338091850281, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 154.33334350585938, + "completions/mean_terminated_length": 154.33334350585938, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.22130299896587383, + "grad_norm": 3.824252202511437, + "kl": 0.07568359375, + "learning_rate": 8.849872715205725e-07, + "loss": 0.003, + "num_tokens": 17507808.0, + "reward": 1.3333333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.434057354927063, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 193.70834350585938, + "completions/mean_terminated_length": 193.70834350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.22233712512926576, + "grad_norm": 2.7022340614121916, + "kl": 0.033447265625, + "learning_rate": 8.839487522311086e-07, + "loss": 0.0013, + "num_tokens": 17598361.0, + "reward": 1.5, + "reward_std": 0.24966806173324585, + "rewards/reasoning_reward/mean": 1.5, + "rewards/reasoning_reward/std": 0.7071067690849304, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 186.70834350585938, + "completions/mean_terminated_length": 186.70834350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.2233712512926577, + "grad_norm": 2.845013705079817, + "kl": 0.043701171875, + "learning_rate": 8.829061804728834e-07, + "loss": 0.0017, + "num_tokens": 17687378.0, + "reward": 1.1666667461395264, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.4815434515476227, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 142.2916717529297, + "completions/mean_terminated_length": 142.2916717529297, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.22440537745604963, + "grad_norm": 3.704720807397464, + "kl": 0.029296875, + "learning_rate": 8.818595672499418e-07, + "loss": 0.0012, + "num_tokens": 17771825.0, + "reward": 0.763888955116272, + "reward_std": 0.32886335253715515, + "rewards/reasoning_reward/mean": 0.7638888955116272, + "rewards/reasoning_reward/std": 0.5152661204338074, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 204.58334350585938, + "completions/mean_terminated_length": 204.58334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.22543950361944157, + "grad_norm": 4.235645113018916, + "kl": 0.048828125, + "learning_rate": 8.808089236089857e-07, + "loss": 0.002, + "num_tokens": 17849271.0, + "reward": 1.2291667461395264, + "reward_std": 0.6813797950744629, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.7067864537239075, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 166.6666717529297, + "completions/mean_terminated_length": 166.6666717529297, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2264736297828335, + "grad_norm": 3.7267555026025962, + "kl": 0.055908203125, + "learning_rate": 8.797542606392572e-07, + "loss": 0.0022, + "num_tokens": 17934255.0, + "reward": 0.9791666865348816, + "reward_std": 0.4042079448699951, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.6507381200790405, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 165.7916717529297, + "completions/mean_terminated_length": 165.7916717529297, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.22750775594622544, + "grad_norm": 3.259723649327136, + "kl": 0.036865234375, + "learning_rate": 8.786955894724206e-07, + "loss": 0.0015, + "num_tokens": 18011010.0, + "reward": 0.5416666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 167.08334350585938, + "completions/mean_terminated_length": 167.08334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.22854188210961737, + "grad_norm": 3.2853923018575095, + "kl": 0.040771484375, + "learning_rate": 8.776329212824461e-07, + "loss": 0.0016, + "num_tokens": 18089092.0, + "reward": 1.1041667461395264, + "reward_std": 0.30551642179489136, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.7067864537239075, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 176.6666717529297, + "completions/mean_terminated_length": 176.6666717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.2295760082730093, + "grad_norm": 3.3736718186548527, + "kl": 0.03173828125, + "learning_rate": 8.765662672854908e-07, + "loss": 0.0013, + "num_tokens": 18173180.0, + "reward": 0.7916666865348816, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.9197904467582703, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 180.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.23061013443640124, + "grad_norm": 4.387150850841933, + "kl": 0.041748046875, + "learning_rate": 8.754956387397814e-07, + "loss": 0.0017, + "num_tokens": 18252827.0, + "reward": 0.7708333730697632, + "reward_std": 0.5432543754577637, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 179.6666717529297, + "completions/mean_terminated_length": 179.6666717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.23164426059979318, + "grad_norm": 3.7927764070085943, + "kl": 0.035888671875, + "learning_rate": 8.744210469454945e-07, + "loss": 0.0014, + "num_tokens": 18334267.0, + "reward": 0.8819444179534912, + "reward_std": 0.3051118850708008, + "rewards/reasoning_reward/mean": 0.8819444179534912, + "rewards/reasoning_reward/std": 0.5258513689041138, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 178.08334350585938, + "completions/mean_terminated_length": 178.08334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.2326783867631851, + "grad_norm": 2.0858422243920622, + "kl": 0.035400390625, + "learning_rate": 8.73342503244638e-07, + "loss": 0.0014, + "num_tokens": 18417093.0, + "reward": 1.1666667461395264, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 150.5416717529297, + "completions/mean_terminated_length": 150.5416717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.23371251292657705, + "grad_norm": 1.7365069787778553, + "kl": 0.0274658203125, + "learning_rate": 8.722600190209303e-07, + "loss": 0.0011, + "num_tokens": 18494866.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 172.58334350585938, + "completions/mean_terminated_length": 172.58334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.23474663908996898, + "grad_norm": 4.708061391148886, + "kl": 0.034912109375, + "learning_rate": 8.711736056996817e-07, + "loss": 0.0014, + "num_tokens": 18573464.0, + "reward": 0.8125, + "reward_std": 0.6011933088302612, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.7775728702545166, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 181.7916717529297, + "completions/mean_terminated_length": 181.7916717529297, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.23578076525336092, + "grad_norm": 3.0293873866544394, + "kl": 0.037353515625, + "learning_rate": 8.700832747476725e-07, + "loss": 0.0015, + "num_tokens": 18653739.0, + "reward": 0.5277777910232544, + "reward_std": 0.24982166290283203, + "rewards/reasoning_reward/mean": 0.5277777314186096, + "rewards/reasoning_reward/std": 0.7350221872329712, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 161.75, + "completions/mean_terminated_length": 161.75, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.23681489141675285, + "grad_norm": 2.7748190749466977, + "kl": 0.03173828125, + "learning_rate": 8.689890376730327e-07, + "loss": 0.0013, + "num_tokens": 18733685.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2378490175801448, + "grad_norm": 2.810129730379906, + "kl": 0.036865234375, + "learning_rate": 8.678909060251201e-07, + "loss": 0.0015, + "num_tokens": 18816811.0, + "reward": 0.9791666865348816, + "reward_std": 0.352710485458374, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.5413181781768799, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 140.125, + "completions/mean_terminated_length": 140.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.23888314374353672, + "grad_norm": 2.4272615508776347, + "kl": 0.038818359375, + "learning_rate": 8.667888913943988e-07, + "loss": 0.0016, + "num_tokens": 18893230.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 159.4166717529297, + "completions/mean_terminated_length": 159.4166717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.23991726990692863, + "grad_norm": 4.0221042286356985, + "kl": 0.0390625, + "learning_rate": 8.656830054123168e-07, + "loss": 0.0016, + "num_tokens": 18983048.0, + "reward": 1.125, + "reward_std": 0.4563485085964203, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.8501917719841003, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 206.45834350585938, + "completions/mean_terminated_length": 206.45834350585938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.24095139607032057, + "grad_norm": 4.213144645418847, + "kl": 0.04052734375, + "learning_rate": 8.645732597511825e-07, + "loss": 0.0016, + "num_tokens": 19060963.0, + "reward": 0.875, + "reward_std": 0.60628741979599, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.710939347743988, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 173.33334350585938, + "completions/mean_terminated_length": 173.33334350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2419855222337125, + "grad_norm": 3.05995072143482, + "kl": 0.048583984375, + "learning_rate": 8.634596661240428e-07, + "loss": 0.0019, + "num_tokens": 19137963.0, + "reward": 0.6666666865348816, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 217.0416717529297, + "completions/mean_terminated_length": 217.0416717529297, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.24301964839710444, + "grad_norm": 3.020436177796217, + "kl": 0.0296630859375, + "learning_rate": 8.623422362845582e-07, + "loss": 0.0012, + "num_tokens": 19215988.0, + "reward": 0.625, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 180.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.24405377456049637, + "grad_norm": 2.7937754256291765, + "kl": 0.0296630859375, + "learning_rate": 8.612209820268798e-07, + "loss": 0.0012, + "num_tokens": 19299191.0, + "reward": 1.0833333730697632, + "reward_std": 0.29546841979026794, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 217.875, + "completions/mean_terminated_length": 217.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.2450879007238883, + "grad_norm": 3.360745099795255, + "kl": 0.052490234375, + "learning_rate": 8.600959151855241e-07, + "loss": 0.0021, + "num_tokens": 19380468.0, + "reward": 0.875, + "reward_std": 0.2985045611858368, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5160468220710754, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 167.1666717529297, + "completions/mean_terminated_length": 167.1666717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.24612202688728024, + "grad_norm": 4.215940666816239, + "kl": 0.032470703125, + "learning_rate": 8.589670476352484e-07, + "loss": 0.0013, + "num_tokens": 19465384.0, + "reward": 1.0833333730697632, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.6197241544723511, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 226.20834350585938, + "completions/mean_terminated_length": 226.20834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.24715615305067218, + "grad_norm": 1.8917075772209162, + "kl": 0.041015625, + "learning_rate": 8.578343912909252e-07, + "loss": 0.0016, + "num_tokens": 19545613.0, + "reward": 1.0416667461395264, + "reward_std": 0.08266931027173996, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.14947572350502014, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 181.1666717529297, + "completions/mean_terminated_length": 181.1666717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.2481902792140641, + "grad_norm": 3.3661865153565125, + "kl": 0.037353515625, + "learning_rate": 8.566979581074168e-07, + "loss": 0.0015, + "num_tokens": 19631889.0, + "reward": 1.1666667461395264, + "reward_std": 0.24339044094085693, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5247498154640198, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 230.625, + "completions/mean_terminated_length": 230.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.24922440537745605, + "grad_norm": 28.79446249181958, + "kl": 0.67578125, + "learning_rate": 8.555577600794488e-07, + "loss": 0.0271, + "num_tokens": 19711856.0, + "reward": 1.0069445371627808, + "reward_std": 0.6009811162948608, + "rewards/reasoning_reward/mean": 1.0069445371627808, + "rewards/reasoning_reward/std": 0.7010675072669983, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.250258531540848, + "grad_norm": 2.003121090259284, + "kl": 0.037841796875, + "learning_rate": 8.54413809241484e-07, + "loss": 0.0015, + "num_tokens": 19791402.0, + "reward": 1.0833333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 143.375, + "completions/mean_terminated_length": 143.375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2512926577042399, + "grad_norm": 0.15042965975061226, + "kl": 0.032958984375, + "learning_rate": 8.53266117667595e-07, + "loss": 0.0013, + "num_tokens": 19871107.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 157.25, + "completions/mean_terminated_length": 157.25, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.25232678386763185, + "grad_norm": 0.13370684582648995, + "kl": 0.042236328125, + "learning_rate": 8.521146974713363e-07, + "loss": 0.0017, + "num_tokens": 19950505.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 142.6521759033203, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.2533609100310238, + "grad_norm": 2.465382515779942, + "kl": 0.0281982421875, + "learning_rate": 8.50959560805617e-07, + "loss": 0.0011, + "num_tokens": 20031090.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 161.33334350585938, + "completions/mean_terminated_length": 161.33334350585938, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.2543950361944157, + "grad_norm": 4.076548379222048, + "kl": 0.038330078125, + "learning_rate": 8.498007198625732e-07, + "loss": 0.0015, + "num_tokens": 20112338.0, + "reward": 1.1458333730697632, + "reward_std": 0.22466278076171875, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.40322521328926086, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 163.9166717529297, + "completions/mean_terminated_length": 163.9166717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.25542916235780766, + "grad_norm": 3.9449748133141647, + "kl": 0.051513671875, + "learning_rate": 8.486381868734378e-07, + "loss": 0.0021, + "num_tokens": 20196792.0, + "reward": 0.9791666865348816, + "reward_std": 0.4564814567565918, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.6833288669586182, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 127.91667175292969, + "completions/mean_terminated_length": 127.91667175292969, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2564632885211996, + "grad_norm": 4.846992572972704, + "kl": 0.040283203125, + "learning_rate": 8.474719741084126e-07, + "loss": 0.0016, + "num_tokens": 20279862.0, + "reward": 1.1666667461395264, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 180.70834350585938, + "completions/mean_terminated_length": 180.70834350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.25749741468459153, + "grad_norm": 5.015013743019392, + "kl": 0.06787109375, + "learning_rate": 8.463020938765384e-07, + "loss": 0.0027, + "num_tokens": 20363271.0, + "reward": 1.125, + "reward_std": 0.5078567266464233, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.8501917719841003, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 158.0416717529297, + "completions/mean_terminated_length": 158.0416717529297, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.25853154084798347, + "grad_norm": 3.3147927169947553, + "kl": 0.06982421875, + "learning_rate": 8.451285585255646e-07, + "loss": 0.0028, + "num_tokens": 20442512.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 133.83334350585938, + "completions/mean_terminated_length": 133.83334350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.2595656670113754, + "grad_norm": 1.9961818932049205, + "kl": 0.0390625, + "learning_rate": 8.439513804418196e-07, + "loss": 0.0016, + "num_tokens": 20523788.0, + "reward": 0.7916666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 148.6666717529297, + "completions/mean_terminated_length": 148.6666717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.26059979317476734, + "grad_norm": 3.886022313764354, + "kl": 0.0380859375, + "learning_rate": 8.4277057205008e-07, + "loss": 0.0015, + "num_tokens": 20606012.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 166.375, + "completions/mean_terminated_length": 166.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.26163391933815927, + "grad_norm": 3.611166967490902, + "kl": 0.07421875, + "learning_rate": 8.415861458134392e-07, + "loss": 0.003, + "num_tokens": 20694661.0, + "reward": 1.625, + "reward_std": 0.3933655619621277, + "rewards/reasoning_reward/mean": 1.625, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 188.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.2626680455015512, + "grad_norm": 3.9865927693728582, + "kl": 0.076171875, + "learning_rate": 8.403981142331758e-07, + "loss": 0.003, + "num_tokens": 20774098.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 133.125, + "completions/mean_terminated_length": 133.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.26370217166494314, + "grad_norm": 2.6781574727908297, + "kl": 0.0556640625, + "learning_rate": 8.392064898486215e-07, + "loss": 0.0022, + "num_tokens": 20853589.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 149.70834350585938, + "completions/mean_terminated_length": 149.70834350585938, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.2647362978283351, + "grad_norm": 5.965736009719672, + "kl": 0.07080078125, + "learning_rate": 8.380112852370296e-07, + "loss": 0.0028, + "num_tokens": 20936110.0, + "reward": 1.1805555820465088, + "reward_std": 0.36751919984817505, + "rewards/reasoning_reward/mean": 1.1805554628372192, + "rewards/reasoning_reward/std": 0.6117146611213684, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 153.5416717529297, + "completions/mean_terminated_length": 153.5416717529297, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.265770423991727, + "grad_norm": 2.4421141476780917, + "kl": 0.038818359375, + "learning_rate": 8.368125130134414e-07, + "loss": 0.0015, + "num_tokens": 21024379.0, + "reward": 0.9791666865348816, + "reward_std": 0.18766528367996216, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.31204676628112793, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 160.5416717529297, + "completions/mean_terminated_length": 160.5416717529297, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.26680455015511895, + "grad_norm": 3.149878852517023, + "kl": 0.0556640625, + "learning_rate": 8.356101858305528e-07, + "loss": 0.0022, + "num_tokens": 21110432.0, + "reward": 1.1875, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.7042186260223389, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 181.375, + "completions/mean_terminated_length": 181.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2678386763185109, + "grad_norm": 4.173006267735779, + "kl": 0.07080078125, + "learning_rate": 8.344043163785823e-07, + "loss": 0.0028, + "num_tokens": 21193913.0, + "reward": 0.9583333730697632, + "reward_std": 0.5727972984313965, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.8329709768295288, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 161.7916717529297, + "completions/mean_terminated_length": 161.7916717529297, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.2688728024819028, + "grad_norm": 3.930560040382512, + "kl": 0.07470703125, + "learning_rate": 8.331949173851354e-07, + "loss": 0.003, + "num_tokens": 21270796.0, + "reward": 0.75, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 137.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.26990692864529475, + "grad_norm": 4.470134236685902, + "kl": 0.06298828125, + "learning_rate": 8.319820016150706e-07, + "loss": 0.0025, + "num_tokens": 21353616.0, + "reward": 0.9583333730697632, + "reward_std": 0.39814266562461853, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 184.08334350585938, + "completions/mean_terminated_length": 184.08334350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.27094105480868663, + "grad_norm": 2.50371678079303, + "kl": 0.06298828125, + "learning_rate": 8.307655818703657e-07, + "loss": 0.0025, + "num_tokens": 21432634.0, + "reward": 1.0833333730697632, + "reward_std": 0.1259881556034088, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.24077169597148895, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 188.20834350585938, + "completions/mean_terminated_length": 188.20834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.27197518097207857, + "grad_norm": 2.890189899345422, + "kl": 0.0284423828125, + "learning_rate": 8.295456709899816e-07, + "loss": 0.0011, + "num_tokens": 21519879.0, + "reward": 0.6666666865348816, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.746974527835846, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 153.33334350585938, + "completions/mean_terminated_length": 153.33334350585938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2730093071354705, + "grad_norm": 4.254336132840291, + "kl": 0.0634765625, + "learning_rate": 8.283222818497269e-07, + "loss": 0.0025, + "num_tokens": 21599551.0, + "reward": 0.7083333730697632, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 142.125, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.27404343329886244, + "grad_norm": 4.319565951453572, + "kl": 0.060302734375, + "learning_rate": 8.270954273621228e-07, + "loss": 0.0024, + "num_tokens": 21682506.0, + "reward": 0.75, + "reward_std": 0.45846566557884216, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.7071067690849304, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 163.6666717529297, + "completions/mean_terminated_length": 163.6666717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.2750775594622544, + "grad_norm": 3.4635605581504056, + "kl": 0.05859375, + "learning_rate": 8.258651204762657e-07, + "loss": 0.0023, + "num_tokens": 21771346.0, + "reward": 1.5416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.6580052971839905, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 157.08334350585938, + "completions/mean_terminated_length": 157.08334350585938, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2761116856256463, + "grad_norm": 4.0415573834840535, + "kl": 0.052490234375, + "learning_rate": 8.24631374177691e-07, + "loss": 0.0021, + "num_tokens": 21850356.0, + "reward": 0.8472222089767456, + "reward_std": 0.4722983241081238, + "rewards/reasoning_reward/mean": 0.8472221493721008, + "rewards/reasoning_reward/std": 0.7612547874450684, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 120.41667175292969, + "completions/mean_terminated_length": 120.41667175292969, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.27714581178903824, + "grad_norm": 3.246047157021264, + "kl": 0.058349609375, + "learning_rate": 8.233942014882369e-07, + "loss": 0.0023, + "num_tokens": 21930830.0, + "reward": 0.6666666865348816, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.2781799379524302, + "grad_norm": 4.562799738250913, + "kl": 0.052490234375, + "learning_rate": 8.221536154659054e-07, + "loss": 0.0021, + "num_tokens": 22015157.0, + "reward": 1.125, + "reward_std": 0.5863928198814392, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.7408866882324219, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 553.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 163.33334350585938, + "completions/mean_terminated_length": 146.3913116455078, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.2792140641158221, + "grad_norm": 4.181549595465215, + "kl": 0.07568359375, + "learning_rate": 8.209096292047257e-07, + "loss": 0.003, + "num_tokens": 22101757.0, + "reward": 0.75, + "reward_std": 0.4446708858013153, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.675663948059082, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 165.08334350585938, + "completions/mean_terminated_length": 165.08334350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.28024819027921405, + "grad_norm": 7.933565453714079, + "kl": 0.2265625, + "learning_rate": 8.196622558346152e-07, + "loss": 0.0091, + "num_tokens": 22180183.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 129.70834350585938, + "completions/mean_terminated_length": 132.13043212890625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.281282316442606, + "grad_norm": 3.1643095072157363, + "kl": 0.07421875, + "learning_rate": 8.184115085212413e-07, + "loss": 0.003, + "num_tokens": 22256120.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 146.33334350585938, + "completions/mean_terminated_length": 146.33334350585938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2823164426059979, + "grad_norm": 3.819168262230962, + "kl": 0.062255859375, + "learning_rate": 8.171574004658828e-07, + "loss": 0.0025, + "num_tokens": 22344216.0, + "reward": 1.375, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 161.08334350585938, + "completions/mean_terminated_length": 161.08334350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.28335056876938985, + "grad_norm": 3.2604091114453984, + "kl": 0.058837890625, + "learning_rate": 8.158999449052898e-07, + "loss": 0.0024, + "num_tokens": 22419034.0, + "reward": 1.0625, + "reward_std": 0.13607725501060486, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.1689159870147705, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 148.45834350585938, + "completions/mean_terminated_length": 148.45834350585938, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.2843846949327818, + "grad_norm": 5.802068488952265, + "kl": 0.1806640625, + "learning_rate": 8.146391551115442e-07, + "loss": 0.0072, + "num_tokens": 22497061.0, + "reward": 1.2291667461395264, + "reward_std": 0.23144195973873138, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.4164854884147644, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 135.4166717529297, + "completions/mean_terminated_length": 135.4166717529297, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.2854188210961737, + "grad_norm": 2.5403712266985683, + "kl": 0.0615234375, + "learning_rate": 8.133750443919205e-07, + "loss": 0.0025, + "num_tokens": 22575655.0, + "reward": 1.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 164.0416717529297, + "completions/mean_terminated_length": 164.0416717529297, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.28645294725956566, + "grad_norm": 2.660324010030384, + "kl": 0.056884765625, + "learning_rate": 8.121076260887436e-07, + "loss": 0.0023, + "num_tokens": 22653648.0, + "reward": 0.5, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 155.70834350585938, + "completions/mean_terminated_length": 155.70834350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2874870734229576, + "grad_norm": 3.979951013030403, + "kl": 0.052978515625, + "learning_rate": 8.108369135792498e-07, + "loss": 0.0021, + "num_tokens": 22739169.0, + "reward": 0.8333333730697632, + "reward_std": 0.39324939250946045, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7755315899848938, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.28852119958634953, + "grad_norm": 2.664386460601889, + "kl": 0.07568359375, + "learning_rate": 8.095629202754447e-07, + "loss": 0.003, + "num_tokens": 22816052.0, + "reward": 0.6458333730697632, + "reward_std": 0.13908717036247253, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.5208514332771301, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.28955532574974147, + "grad_norm": 3.1543556872497907, + "kl": 0.060791015625, + "learning_rate": 8.082856596239613e-07, + "loss": 0.0024, + "num_tokens": 22893649.0, + "reward": 0.5, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 147.75, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.2905894519131334, + "grad_norm": 3.3080012627332933, + "kl": 0.07275390625, + "learning_rate": 8.070051451059188e-07, + "loss": 0.0029, + "num_tokens": 22971419.0, + "reward": 0.7708333730697632, + "reward_std": 0.1849137246608734, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 143.83334350585938, + "completions/mean_terminated_length": 143.83334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.29162357807652534, + "grad_norm": 1.9572479604131818, + "kl": 0.0556640625, + "learning_rate": 8.057213902367801e-07, + "loss": 0.0022, + "num_tokens": 23048719.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 176.20834350585938, + "completions/mean_terminated_length": 176.20834350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.29265770423991727, + "grad_norm": 2.3492148432005755, + "kl": 0.0556640625, + "learning_rate": 8.044344085662092e-07, + "loss": 0.0022, + "num_tokens": 23127580.0, + "reward": 0.125, + "reward_std": 0.24800793826580048, + "rewards/reasoning_reward/mean": 0.125, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 178.75, + "completions/mean_terminated_length": 178.75, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.2936918304033092, + "grad_norm": 3.9640676648988027, + "kl": 0.076171875, + "learning_rate": 8.031442136779271e-07, + "loss": 0.0031, + "num_tokens": 23210798.0, + "reward": 1.0625, + "reward_std": 0.3596132695674896, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.7798991799354553, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 184.0416717529297, + "completions/mean_terminated_length": 184.0416717529297, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.29472595656670114, + "grad_norm": 4.300616429230677, + "kl": 0.057861328125, + "learning_rate": 8.018508191895712e-07, + "loss": 0.0023, + "num_tokens": 23294247.0, + "reward": 0.9791666865348816, + "reward_std": 0.40489405393600464, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.6507381200790405, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 190.375, + "completions/mean_terminated_length": 190.375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2957600827300931, + "grad_norm": 2.685078105274284, + "kl": 0.06884765625, + "learning_rate": 8.005542387525479e-07, + "loss": 0.0028, + "num_tokens": 23383472.0, + "reward": 1.3125, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.38483479619026184, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 161.375, + "completions/mean_terminated_length": 161.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.296794208893485, + "grad_norm": 3.908207128618161, + "kl": 0.056640625, + "learning_rate": 7.992544860518915e-07, + "loss": 0.0023, + "num_tokens": 23464673.0, + "reward": 0.8333333730697632, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 174.95834350585938, + "completions/mean_terminated_length": 174.95834350585938, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.29782833505687695, + "grad_norm": 3.2451345960277793, + "kl": 0.051513671875, + "learning_rate": 7.979515748061181e-07, + "loss": 0.0021, + "num_tokens": 23549896.0, + "reward": 1.0625, + "reward_std": 0.3194752335548401, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.47348156571388245, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 174.75, + "completions/mean_terminated_length": 174.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2988624612202689, + "grad_norm": 4.7409903359224295, + "kl": 0.060791015625, + "learning_rate": 7.966455187670819e-07, + "loss": 0.0024, + "num_tokens": 23627058.0, + "reward": 0.4791666865348816, + "reward_std": 0.4373263716697693, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.5985338091850281, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 152.9166717529297, + "completions/mean_terminated_length": 152.9166717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2998965873836608, + "grad_norm": 1.7959395053181861, + "kl": 0.038818359375, + "learning_rate": 7.953363317198287e-07, + "loss": 0.0016, + "num_tokens": 23705888.0, + "reward": 1.1597223281860352, + "reward_std": 0.12751007080078125, + "rewards/reasoning_reward/mean": 1.1597222089767456, + "rewards/reasoning_reward/std": 0.3126911520957947, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 231.45834350585938, + "completions/mean_terminated_length": 231.45834350585938, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.30093071354705275, + "grad_norm": 2.7320267048254983, + "kl": 0.046630859375, + "learning_rate": 7.940240274824519e-07, + "loss": 0.0019, + "num_tokens": 23786971.0, + "reward": 0.8125, + "reward_std": 0.2836732268333435, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.44589513540267944, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 148.83334350585938, + "completions/mean_terminated_length": 148.83334350585938, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3019648397104447, + "grad_norm": 3.1680818855775654, + "kl": 0.0556640625, + "learning_rate": 7.927086199059457e-07, + "loss": 0.0022, + "num_tokens": 23866479.0, + "reward": 1.1875, + "reward_std": 0.24185511469841003, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.4618605971336365, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 156.625, + "completions/mean_terminated_length": 156.625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3029989658738366, + "grad_norm": 1.8750379438190574, + "kl": 0.045166015625, + "learning_rate": 7.913901228740589e-07, + "loss": 0.0018, + "num_tokens": 23952070.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 138.58334350585938, + "completions/mean_terminated_length": 138.58334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.30403309203722856, + "grad_norm": 3.2244709897237893, + "kl": 0.0419921875, + "learning_rate": 7.90068550303149e-07, + "loss": 0.0017, + "num_tokens": 24034804.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 143.20834350585938, + "completions/mean_terminated_length": 143.20834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.3050672182006205, + "grad_norm": 3.2322747610220564, + "kl": 0.06494140625, + "learning_rate": 7.887439161420346e-07, + "loss": 0.0026, + "num_tokens": 24115961.0, + "reward": 0.9583333730697632, + "reward_std": 0.21026216447353363, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.6743220090866089, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 102.5, + "completions/mean_terminated_length": 102.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.30610134436401243, + "grad_norm": 4.065413966797424, + "kl": 0.050048828125, + "learning_rate": 7.874162343718489e-07, + "loss": 0.002, + "num_tokens": 24197397.0, + "reward": 0.8333333730697632, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 156.58334350585938, + "completions/mean_terminated_length": 156.58334350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.30713547052740436, + "grad_norm": 35.175482523866044, + "kl": 0.20703125, + "learning_rate": 7.860855190058913e-07, + "loss": 0.0083, + "num_tokens": 24281619.0, + "reward": 1.0625, + "reward_std": 0.4130779206752777, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.6806725859642029, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.3081695966907963, + "grad_norm": 3.422532515079038, + "kl": 0.0654296875, + "learning_rate": 7.847517840894803e-07, + "loss": 0.0026, + "num_tokens": 24361918.0, + "reward": 0.8541666865348816, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.40322521328926086, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 171.08334350585938, + "completions/mean_terminated_length": 171.08334350585938, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.30920372285418823, + "grad_norm": 4.1270544852144475, + "kl": 0.0498046875, + "learning_rate": 7.834150436998046e-07, + "loss": 0.002, + "num_tokens": 24443768.0, + "reward": 1.1041667461395264, + "reward_std": 0.4082317352294922, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.7937139868736267, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 138.45834350585938, + "completions/mean_terminated_length": 138.45834350585938, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.31023784901758017, + "grad_norm": 3.611002627039542, + "kl": 0.0537109375, + "learning_rate": 7.820753119457751e-07, + "loss": 0.0021, + "num_tokens": 24521315.0, + "reward": 0.6458333730697632, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 149.20834350585938, + "completions/mean_terminated_length": 149.20834350585938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3112719751809721, + "grad_norm": 3.152472722978425, + "kl": 0.068359375, + "learning_rate": 7.807326029678753e-07, + "loss": 0.0027, + "num_tokens": 24603760.0, + "reward": 0.8333333730697632, + "reward_std": 0.36585909128189087, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.8164966702461243, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 205.4166717529297, + "completions/mean_terminated_length": 205.4166717529297, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.31230610134436404, + "grad_norm": 2.786475190664922, + "kl": 0.051025390625, + "learning_rate": 7.793869309380128e-07, + "loss": 0.002, + "num_tokens": 24683322.0, + "reward": 1.0833333730697632, + "reward_std": 0.42052432894706726, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.6538625359535217, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 180.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3133402275077559, + "grad_norm": 4.57864897319015, + "kl": 0.09521484375, + "learning_rate": 7.780383100593692e-07, + "loss": 0.0038, + "num_tokens": 24766529.0, + "reward": 1.4583333730697632, + "reward_std": 0.35799640417099, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 163.95834350585938, + "completions/mean_terminated_length": 163.95834350585938, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.31437435367114785, + "grad_norm": 3.623601369487776, + "kl": 0.053955078125, + "learning_rate": 7.766867545662506e-07, + "loss": 0.0022, + "num_tokens": 24850544.0, + "reward": 1.0208333730697632, + "reward_std": 0.31726133823394775, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.5800893306732178, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 138.9166717529297, + "completions/mean_terminated_length": 138.9166717529297, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3154084798345398, + "grad_norm": 3.8825218639045667, + "kl": 0.0732421875, + "learning_rate": 7.753322787239365e-07, + "loss": 0.0029, + "num_tokens": 24931478.0, + "reward": 0.9583333730697632, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.32693126797676086, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 164.4166717529297, + "completions/mean_terminated_length": 164.4166717529297, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3164426059979317, + "grad_norm": 2.1908650833519956, + "kl": 0.048828125, + "learning_rate": 7.739748968285305e-07, + "loss": 0.0019, + "num_tokens": 25008608.0, + "reward": 0.9166666865348816, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.318511039018631, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 145.4166717529297, + "completions/mean_terminated_length": 145.4166717529297, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.31747673216132366, + "grad_norm": 4.314449957580246, + "kl": 0.07275390625, + "learning_rate": 7.726146232068083e-07, + "loss": 0.0029, + "num_tokens": 25090962.0, + "reward": 0.9166666865348816, + "reward_std": 0.5232069492340088, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.7172814607620239, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 162.0416717529297, + "completions/mean_terminated_length": 162.0416717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3185108583247156, + "grad_norm": 3.710576665295931, + "kl": 0.05859375, + "learning_rate": 7.712514722160673e-07, + "loss": 0.0023, + "num_tokens": 25179115.0, + "reward": 1.5416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 160.83334350585938, + "completions/mean_terminated_length": 160.83334350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.31954498448810753, + "grad_norm": 4.256835002888248, + "kl": 0.09521484375, + "learning_rate": 7.698854582439744e-07, + "loss": 0.0038, + "num_tokens": 25267727.0, + "reward": 1.0555555820465088, + "reward_std": 0.6559478044509888, + "rewards/reasoning_reward/mean": 1.0555554628372192, + "rewards/reasoning_reward/std": 0.7656052112579346, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 188.5416717529297, + "completions/mean_terminated_length": 188.5416717529297, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.32057911065149947, + "grad_norm": 1.8170985697472053, + "kl": 0.052001953125, + "learning_rate": 7.685165957084147e-07, + "loss": 0.0021, + "num_tokens": 25358292.0, + "reward": 0.625, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 155.1666717529297, + "completions/mean_terminated_length": 155.1666717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3216132368148914, + "grad_norm": 3.812637593183448, + "kl": 0.08935546875, + "learning_rate": 7.671448990573391e-07, + "loss": 0.0036, + "num_tokens": 25446560.0, + "reward": 1.2708333730697632, + "reward_std": 0.5512506365776062, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.6753286719322205, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 170.20834350585938, + "completions/mean_terminated_length": 170.20834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.32264736297828334, + "grad_norm": 2.218366125481505, + "kl": 0.06689453125, + "learning_rate": 7.657703827686115e-07, + "loss": 0.0027, + "num_tokens": 25523845.0, + "reward": 0.6875, + "reward_std": 0.25877460837364197, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.6562823057174683, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 179.6666717529297, + "completions/mean_terminated_length": 179.6666717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.32368148914167527, + "grad_norm": 3.4872202871188547, + "kl": 0.06298828125, + "learning_rate": 7.643930613498561e-07, + "loss": 0.0025, + "num_tokens": 25606917.0, + "reward": 1.1736111640930176, + "reward_std": 0.3726871907711029, + "rewards/reasoning_reward/mean": 1.173611044883728, + "rewards/reasoning_reward/std": 0.464901328086853, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 166.33334350585938, + "completions/mean_terminated_length": 166.33334350585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3247156153050672, + "grad_norm": 3.037599272229024, + "kl": 0.06982421875, + "learning_rate": 7.630129493383052e-07, + "loss": 0.0028, + "num_tokens": 25685437.0, + "reward": 1.1458333730697632, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.6833289265632629, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 146.95834350585938, + "completions/mean_terminated_length": 146.95834350585938, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.32574974146845914, + "grad_norm": 4.078205524302534, + "kl": 0.047119140625, + "learning_rate": 7.616300613006442e-07, + "loss": 0.0019, + "num_tokens": 25765932.0, + "reward": 0.5416666865348816, + "reward_std": 0.48112308979034424, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 133.7916717529297, + "completions/mean_terminated_length": 133.7916717529297, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3267838676318511, + "grad_norm": 0.2821083532299412, + "kl": 0.06787109375, + "learning_rate": 7.602444118328592e-07, + "loss": 0.0027, + "num_tokens": 25840439.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 134.875, + "completions/mean_terminated_length": 134.875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.327817993795243, + "grad_norm": 3.5599707379709162, + "kl": 0.091796875, + "learning_rate": 7.588560155600823e-07, + "loss": 0.0037, + "num_tokens": 25921308.0, + "reward": 1.1458333730697632, + "reward_std": 0.43601590394973755, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.7587054967880249, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 153.6666717529297, + "completions/mean_terminated_length": 153.6666717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.32885211995863495, + "grad_norm": 3.1856291943344077, + "kl": 0.07666015625, + "learning_rate": 7.574648871364368e-07, + "loss": 0.0031, + "num_tokens": 26000556.0, + "reward": 1.2013888359069824, + "reward_std": 0.1516503393650055, + "rewards/reasoning_reward/mean": 1.2013888359069824, + "rewards/reasoning_reward/std": 0.3068428933620453, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 152.1666717529297, + "completions/mean_terminated_length": 152.1666717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3298862461220269, + "grad_norm": 3.283679581462454, + "kl": 0.07080078125, + "learning_rate": 7.560710412448838e-07, + "loss": 0.0028, + "num_tokens": 26080200.0, + "reward": 0.7708333730697632, + "reward_std": 0.32520395517349243, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.416485458612442, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 164.875, + "completions/mean_terminated_length": 164.875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3309203722854188, + "grad_norm": 4.160648948343019, + "kl": 0.054443359375, + "learning_rate": 7.546744925970664e-07, + "loss": 0.0022, + "num_tokens": 26157397.0, + "reward": 0.9583333730697632, + "reward_std": 0.3205421268939972, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 209.0869598388672, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.33195449844881075, + "grad_norm": 3.5547184484445915, + "kl": 0.07373046875, + "learning_rate": 7.532752559331539e-07, + "loss": 0.003, + "num_tokens": 26235427.0, + "reward": 0.7916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3329886246122027, + "grad_norm": 4.066280261564092, + "kl": 0.0791015625, + "learning_rate": 7.518733460216875e-07, + "loss": 0.0032, + "num_tokens": 26319289.0, + "reward": 1.125, + "reward_std": 0.499225378036499, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6298723220825195, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 158.33334350585938, + "completions/mean_terminated_length": 158.33334350585938, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.3340227507755946, + "grad_norm": 2.980136617187653, + "kl": 0.051025390625, + "learning_rate": 7.504687776594234e-07, + "loss": 0.002, + "num_tokens": 26395665.0, + "reward": 0.9375, + "reward_std": 0.3255884051322937, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.39870715141296387, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 136.83334350585938, + "completions/mean_terminated_length": 136.83334350585938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.33505687693898656, + "grad_norm": 3.0212039876952943, + "kl": 0.0576171875, + "learning_rate": 7.490615656711771e-07, + "loss": 0.0023, + "num_tokens": 26472917.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 172.125, + "completions/mean_terminated_length": 172.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.3360910031023785, + "grad_norm": 3.015813746260023, + "kl": 0.07568359375, + "learning_rate": 7.47651724909667e-07, + "loss": 0.003, + "num_tokens": 26557080.0, + "reward": 0.6041666865348816, + "reward_std": 0.3116035461425781, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.7220015525817871, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 188.75, + "completions/mean_terminated_length": 188.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.33712512926577043, + "grad_norm": 4.08859100969837, + "kl": 0.06494140625, + "learning_rate": 7.46239270255357e-07, + "loss": 0.0026, + "num_tokens": 26641714.0, + "reward": 1.0833333730697632, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 181.6666717529297, + "completions/mean_terminated_length": 181.6666717529297, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.33815925542916236, + "grad_norm": 4.0866348173332545, + "kl": 0.07080078125, + "learning_rate": 7.448242166163003e-07, + "loss": 0.0028, + "num_tokens": 26724962.0, + "reward": 1.3333333730697632, + "reward_std": 0.32801350951194763, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.48900964856147766, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 152.625, + "completions/mean_terminated_length": 152.625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3391933815925543, + "grad_norm": 2.1510475663169353, + "kl": 0.057373046875, + "learning_rate": 7.434065789279815e-07, + "loss": 0.0023, + "num_tokens": 26803649.0, + "reward": 0.875, + "reward_std": 0.14773420989513397, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.7260674238204956, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 159.9166717529297, + "completions/mean_terminated_length": 159.9166717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.34022750775594623, + "grad_norm": 3.4038270866830955, + "kl": 0.07275390625, + "learning_rate": 7.41986372153159e-07, + "loss": 0.0029, + "num_tokens": 26895591.0, + "reward": 1.375, + "reward_std": 0.273722380399704, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 157.625, + "completions/mean_terminated_length": 157.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.34126163391933817, + "grad_norm": 3.2332472627018083, + "kl": 0.060791015625, + "learning_rate": 7.405636112817071e-07, + "loss": 0.0024, + "num_tokens": 26974222.0, + "reward": 0.9583333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 164.45834350585938, + "completions/mean_terminated_length": 164.45834350585938, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3422957600827301, + "grad_norm": 3.3744277472463686, + "kl": 0.07275390625, + "learning_rate": 7.391383113304583e-07, + "loss": 0.0029, + "num_tokens": 27058377.0, + "reward": 1.0208333730697632, + "reward_std": 0.3508317470550537, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.6833289265632629, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 134.7916717529297, + "completions/mean_terminated_length": 134.7916717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.34332988624612204, + "grad_norm": 3.766622375322823, + "kl": 0.08056640625, + "learning_rate": 7.377104873430438e-07, + "loss": 0.0032, + "num_tokens": 27138204.0, + "reward": 0.75, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 134.4166717529297, + "completions/mean_terminated_length": 134.4166717529297, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.344364012409514, + "grad_norm": 3.8370132926239346, + "kl": 0.0859375, + "learning_rate": 7.362801543897357e-07, + "loss": 0.0034, + "num_tokens": 27219846.0, + "reward": 1.1041667461395264, + "reward_std": 0.23144195973873138, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.7220014929771423, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 259.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 144.625, + "completions/mean_terminated_length": 139.6521759033203, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.3453981385729059, + "grad_norm": 2.1634706010744544, + "kl": 0.058349609375, + "learning_rate": 7.348473275672873e-07, + "loss": 0.0023, + "num_tokens": 27300293.0, + "reward": 1.0416667461395264, + "reward_std": 0.18722420930862427, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.31565436720848083, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.34643226473629785, + "grad_norm": 3.494854866170849, + "kl": 0.0751953125, + "learning_rate": 7.334120219987741e-07, + "loss": 0.003, + "num_tokens": 27379068.0, + "reward": 0.6666666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 162.75, + "completions/mean_terminated_length": 162.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3474663908996898, + "grad_norm": 0.35319764403800336, + "kl": 0.0810546875, + "learning_rate": 7.319742528334339e-07, + "loss": 0.0032, + "num_tokens": 27455982.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 126.70833587646484, + "completions/mean_terminated_length": 126.70833587646484, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3485005170630817, + "grad_norm": 4.842479635487001, + "kl": 0.053955078125, + "learning_rate": 7.305340352465071e-07, + "loss": 0.0022, + "num_tokens": 27535311.0, + "reward": 0.75, + "reward_std": 0.41387641429901123, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 144.6666717529297, + "completions/mean_terminated_length": 144.6666717529297, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.34953464322647365, + "grad_norm": 3.960259845440186, + "kl": 0.0625, + "learning_rate": 7.290913844390765e-07, + "loss": 0.0025, + "num_tokens": 27613047.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 127.20833587646484, + "completions/mean_terminated_length": 127.20833587646484, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3505687693898656, + "grad_norm": 2.53370412691966, + "kl": 0.052001953125, + "learning_rate": 7.276463156379069e-07, + "loss": 0.0021, + "num_tokens": 27691236.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 169.0, + "completions/mean_terminated_length": 167.56521606445312, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.3516028955532575, + "grad_norm": 13.18852870542549, + "kl": 0.390625, + "learning_rate": 7.261988440952844e-07, + "loss": 0.0157, + "num_tokens": 27775724.0, + "reward": 1.4305557012557983, + "reward_std": 0.40538716316223145, + "rewards/reasoning_reward/mean": 1.4305557012557983, + "rewards/reasoning_reward/std": 0.648142397403717, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 186.875, + "completions/mean_terminated_length": 186.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.35263702171664946, + "grad_norm": 3.7251808322917257, + "kl": 0.08154296875, + "learning_rate": 7.247489850888551e-07, + "loss": 0.0033, + "num_tokens": 27857497.0, + "reward": 1.0833333730697632, + "reward_std": 0.3808860182762146, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 146.125, + "completions/mean_terminated_length": 146.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3536711478800414, + "grad_norm": 4.446040086211323, + "kl": 0.06884765625, + "learning_rate": 7.232967539214643e-07, + "loss": 0.0028, + "num_tokens": 27938476.0, + "reward": 1.5416667461395264, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.35470527404343327, + "grad_norm": 4.159860302213038, + "kl": 0.04296875, + "learning_rate": 7.218421659209948e-07, + "loss": 0.0017, + "num_tokens": 28018204.0, + "reward": 0.6666666865348816, + "reward_std": 0.46854168176651, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 135.9166717529297, + "completions/mean_terminated_length": 137.04348754882812, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.3557394002068252, + "grad_norm": 3.9456854124043956, + "kl": 0.087890625, + "learning_rate": 7.203852364402048e-07, + "loss": 0.0035, + "num_tokens": 28103106.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 159.70834350585938, + "completions/mean_terminated_length": 159.70834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.35677352637021714, + "grad_norm": 3.8039878218613024, + "kl": 0.07275390625, + "learning_rate": 7.189259808565664e-07, + "loss": 0.0029, + "num_tokens": 28184387.0, + "reward": 0.6666666865348816, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 154.95834350585938, + "completions/mean_terminated_length": 154.95834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3578076525336091, + "grad_norm": 3.8927448717227846, + "kl": 0.07373046875, + "learning_rate": 7.174644145721031e-07, + "loss": 0.003, + "num_tokens": 28262346.0, + "reward": 0.6666666865348816, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 174.33334350585938, + "completions/mean_terminated_length": 174.33334350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.358841778697001, + "grad_norm": 3.9724913747791044, + "kl": 0.07373046875, + "learning_rate": 7.16000553013227e-07, + "loss": 0.003, + "num_tokens": 28338434.0, + "reward": 0.6041666865348816, + "reward_std": 0.4981047809123993, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 154.9166717529297, + "completions/mean_terminated_length": 154.9166717529297, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.35987590486039295, + "grad_norm": 2.0828938523114524, + "kl": 0.07763671875, + "learning_rate": 7.145344116305762e-07, + "loss": 0.0031, + "num_tokens": 28415024.0, + "reward": 1.125, + "reward_std": 0.10603483021259308, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.25180506706237793, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 159.2916717529297, + "completions/mean_terminated_length": 159.2916717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.3609100310237849, + "grad_norm": 4.458540222973645, + "kl": 0.11328125, + "learning_rate": 7.13066005898852e-07, + "loss": 0.0045, + "num_tokens": 28497783.0, + "reward": 1.2083333730697632, + "reward_std": 0.6395318508148193, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.6412736177444458, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3619441571871768, + "grad_norm": 3.7766746412265, + "kl": 0.125, + "learning_rate": 7.115953513166549e-07, + "loss": 0.005, + "num_tokens": 28574772.0, + "reward": 0.6666666865348816, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 114.16667175292969, + "completions/mean_terminated_length": 114.16667175292969, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.36297828335056875, + "grad_norm": 3.461708376714296, + "kl": 0.0732421875, + "learning_rate": 7.101224634063212e-07, + "loss": 0.0029, + "num_tokens": 28659984.0, + "reward": 0.993055522441864, + "reward_std": 0.18709687888622284, + "rewards/reasoning_reward/mean": 0.993055522441864, + "rewards/reasoning_reward/std": 0.7557815313339233, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 127.95833587646484, + "completions/mean_terminated_length": 127.95833587646484, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3640124095139607, + "grad_norm": 2.183902034581425, + "kl": 0.06396484375, + "learning_rate": 7.086473577137598e-07, + "loss": 0.0026, + "num_tokens": 28750087.0, + "reward": 1.5, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 137.7916717529297, + "completions/mean_terminated_length": 137.7916717529297, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3650465356773526, + "grad_norm": 3.440182240952645, + "kl": 0.07421875, + "learning_rate": 7.071700498082873e-07, + "loss": 0.003, + "num_tokens": 28835346.0, + "reward": 0.7916666865348816, + "reward_std": 0.4082186222076416, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 159.70834350585938, + "completions/mean_terminated_length": 159.70834350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.36608066184074456, + "grad_norm": 3.7647272928819335, + "kl": 0.083984375, + "learning_rate": 7.056905552824644e-07, + "loss": 0.0033, + "num_tokens": 28919371.0, + "reward": 1.0625, + "reward_std": 0.3944129943847656, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 150.4166717529297, + "completions/mean_terminated_length": 150.4166717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3671147880041365, + "grad_norm": 3.3768970946788697, + "kl": 0.059814453125, + "learning_rate": 7.042088897519307e-07, + "loss": 0.0024, + "num_tokens": 28997701.0, + "reward": 0.7916666865348816, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4871538281440735, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 163.95834350585938, + "completions/mean_terminated_length": 163.95834350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.36814891416752843, + "grad_norm": 4.085471865989243, + "kl": 0.10986328125, + "learning_rate": 7.027250688552399e-07, + "loss": 0.0044, + "num_tokens": 29086372.0, + "reward": 1.5416667461395264, + "reward_std": 0.49276697635650635, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 160.75, + "completions/mean_terminated_length": 160.75, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.36918304033092036, + "grad_norm": 3.1239460885673687, + "kl": 0.055908203125, + "learning_rate": 7.012391082536955e-07, + "loss": 0.0022, + "num_tokens": 29163254.0, + "reward": 1.1388888359069824, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.1388888359069824, + "rewards/reasoning_reward/std": 0.6052640676498413, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 181.9166717529297, + "completions/mean_terminated_length": 181.9166717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.3702171664943123, + "grad_norm": 3.5139169095587626, + "kl": 0.091796875, + "learning_rate": 6.997510236311846e-07, + "loss": 0.0037, + "num_tokens": 29245260.0, + "reward": 0.7916666865348816, + "reward_std": 0.49076026678085327, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.5694518089294434, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 127.41667175292969, + "completions/mean_terminated_length": 127.41667175292969, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.37125129265770423, + "grad_norm": 3.785316738310359, + "kl": 0.06396484375, + "learning_rate": 6.982608306940128e-07, + "loss": 0.0025, + "num_tokens": 29322550.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 177.58334350585938, + "completions/mean_terminated_length": 177.58334350585938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.37228541882109617, + "grad_norm": 2.241778037699298, + "kl": 0.09375, + "learning_rate": 6.967685451707383e-07, + "loss": 0.0037, + "num_tokens": 29399980.0, + "reward": 0.3333333432674408, + "reward_std": 0.2182178944349289, + "rewards/reasoning_reward/mean": 0.3333333432674408, + "rewards/reasoning_reward/std": 0.601929247379303, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 178.95834350585938, + "completions/mean_terminated_length": 178.95834350585938, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3733195449844881, + "grad_norm": 0.25717311965945855, + "kl": 0.07666015625, + "learning_rate": 6.952741828120062e-07, + "loss": 0.0031, + "num_tokens": 29484387.0, + "reward": 0.8333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.637022078037262, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.37435367114788004, + "grad_norm": 4.722848836354921, + "kl": 0.09765625, + "learning_rate": 6.937777593903817e-07, + "loss": 0.0039, + "num_tokens": 29568552.0, + "reward": 1.1875, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.28788962960243225, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 152.70834350585938, + "completions/mean_terminated_length": 152.70834350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.375387797311272, + "grad_norm": 3.6743007364748133, + "kl": 0.062255859375, + "learning_rate": 6.922792907001842e-07, + "loss": 0.0025, + "num_tokens": 29646721.0, + "reward": 0.4166666865348816, + "reward_std": 0.33247750997543335, + "rewards/reasoning_reward/mean": 0.4166666567325592, + "rewards/reasoning_reward/std": 0.5036101937294006, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 181.08334350585938, + "completions/mean_terminated_length": 181.08334350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3764219234746639, + "grad_norm": 3.384160520560316, + "kl": 0.095703125, + "learning_rate": 6.9077879255732e-07, + "loss": 0.0038, + "num_tokens": 29731443.0, + "reward": 1.2222222089767456, + "reward_std": 0.4460780620574951, + "rewards/reasoning_reward/mean": 1.2222222089767456, + "rewards/reasoning_reward/std": 0.5787431597709656, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 150.75, + "completions/mean_terminated_length": 150.75, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.37745604963805585, + "grad_norm": 3.8887377118282904, + "kl": 0.087890625, + "learning_rate": 6.892762807991159e-07, + "loss": 0.0035, + "num_tokens": 29814029.0, + "reward": 1.0069444179534912, + "reward_std": 0.44658660888671875, + "rewards/reasoning_reward/mean": 1.0069444179534912, + "rewards/reasoning_reward/std": 0.7573778629302979, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 120.66667175292969, + "completions/mean_terminated_length": 120.66667175292969, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3784901758014478, + "grad_norm": 2.581428574067341, + "kl": 0.08203125, + "learning_rate": 6.87771771284152e-07, + "loss": 0.0033, + "num_tokens": 29897901.0, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 165.1666717529297, + "completions/mean_terminated_length": 165.1666717529297, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.3795243019648397, + "grad_norm": 2.9198024039411745, + "kl": 0.0634765625, + "learning_rate": 6.862652798920938e-07, + "loss": 0.0025, + "num_tokens": 29976281.0, + "reward": 0.8541666865348816, + "reward_std": 0.27053868770599365, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.34512653946876526, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 193.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.38055842812823165, + "grad_norm": 2.9072434267769616, + "kl": 0.09326171875, + "learning_rate": 6.84756822523525e-07, + "loss": 0.0037, + "num_tokens": 30054058.0, + "reward": 0.8125, + "reward_std": 0.3433460593223572, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.4848240315914154, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 144.58334350585938, + "completions/mean_terminated_length": 144.58334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.3815925542916236, + "grad_norm": 3.5358384891912107, + "kl": 0.08984375, + "learning_rate": 6.832464150997798e-07, + "loss": 0.0036, + "num_tokens": 30137656.0, + "reward": 1.2986111640930176, + "reward_std": 0.20151451230049133, + "rewards/reasoning_reward/mean": 1.298611044883728, + "rewards/reasoning_reward/std": 0.41985490918159485, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 149.75, + "completions/mean_terminated_length": 149.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3826266804550155, + "grad_norm": 0.19189388153538847, + "kl": 0.06298828125, + "learning_rate": 6.817340735627745e-07, + "loss": 0.0025, + "num_tokens": 30223114.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.3333333432674408, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 138.375, + "completions/mean_terminated_length": 138.375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.38366080661840746, + "grad_norm": 2.5621136837530125, + "kl": 0.056640625, + "learning_rate": 6.802198138748397e-07, + "loss": 0.0023, + "num_tokens": 30305003.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 168.875, + "completions/mean_terminated_length": 168.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.3846949327817994, + "grad_norm": 3.730162920596462, + "kl": 0.095703125, + "learning_rate": 6.78703652018551e-07, + "loss": 0.0038, + "num_tokens": 30387952.0, + "reward": 1.2777777910232544, + "reward_std": 0.4832340478897095, + "rewards/reasoning_reward/mean": 1.2777777910232544, + "rewards/reasoning_reward/std": 0.6287527680397034, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 143.70834350585938, + "completions/mean_terminated_length": 143.70834350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3857290589451913, + "grad_norm": 2.227976386200522, + "kl": 0.055908203125, + "learning_rate": 6.771856039965615e-07, + "loss": 0.0022, + "num_tokens": 30474265.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.8064504265785217, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 154.625, + "completions/mean_terminated_length": 154.625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.38676318510858326, + "grad_norm": 3.974774217751018, + "kl": 0.12451171875, + "learning_rate": 6.756656858314318e-07, + "loss": 0.005, + "num_tokens": 30559688.0, + "reward": 1.4513888359069824, + "reward_std": 0.39070504903793335, + "rewards/reasoning_reward/mean": 1.4513888359069824, + "rewards/reasoning_reward/std": 0.5613973736763, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 175.58334350585938, + "completions/mean_terminated_length": 175.58334350585938, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3877973112719752, + "grad_norm": 2.8084365803963354, + "kl": 0.08203125, + "learning_rate": 6.741439135654612e-07, + "loss": 0.0033, + "num_tokens": 30639974.0, + "reward": 1.1666667461395264, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.7019641399383545, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 143.45834350585938, + "completions/mean_terminated_length": 143.45834350585938, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.38883143743536713, + "grad_norm": 3.1747783499460858, + "kl": 0.07568359375, + "learning_rate": 6.726203032605189e-07, + "loss": 0.003, + "num_tokens": 30721217.0, + "reward": 0.75, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 154.20834350585938, + "completions/mean_terminated_length": 154.20834350585938, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.38986556359875907, + "grad_norm": 2.466045662278419, + "kl": 0.1279296875, + "learning_rate": 6.710948709978741e-07, + "loss": 0.0051, + "num_tokens": 30804974.0, + "reward": 0.8958333730697632, + "reward_std": 0.15268757939338684, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.29411497712135315, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 149.58334350585938, + "completions/mean_terminated_length": 149.58334350585938, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.390899689762151, + "grad_norm": 4.338062118799447, + "kl": 0.06640625, + "learning_rate": 6.695676328780256e-07, + "loss": 0.0027, + "num_tokens": 30886508.0, + "reward": 0.9861111044883728, + "reward_std": 0.46136391162872314, + "rewards/reasoning_reward/mean": 0.9861111044883728, + "rewards/reasoning_reward/std": 0.6406455039978027, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 158.45834350585938, + "completions/mean_terminated_length": 158.45834350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.39193381592554294, + "grad_norm": 3.275227479605371, + "kl": 0.10693359375, + "learning_rate": 6.680386050205332e-07, + "loss": 0.0043, + "num_tokens": 30969583.0, + "reward": 1.3958333730697632, + "reward_std": 0.24185511469841003, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.4657664895057678, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 160.6666717529297, + "completions/mean_terminated_length": 160.6666717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3929679420889349, + "grad_norm": 2.8767415849776694, + "kl": 0.11279296875, + "learning_rate": 6.665078035638465e-07, + "loss": 0.0045, + "num_tokens": 31046431.0, + "reward": 0.9861111044883728, + "reward_std": 0.2921907305717468, + "rewards/reasoning_reward/mean": 0.9861111044883728, + "rewards/reasoning_reward/std": 0.3867262005805969, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 143.08334350585938, + "completions/mean_terminated_length": 143.08334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3940020682523268, + "grad_norm": 2.3645686686795626, + "kl": 0.09375, + "learning_rate": 6.649752446651352e-07, + "loss": 0.0037, + "num_tokens": 31125817.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 157.33334350585938, + "completions/mean_terminated_length": 157.33334350585938, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.39503619441571874, + "grad_norm": 3.7353394391876265, + "kl": 0.056884765625, + "learning_rate": 6.634409445001181e-07, + "loss": 0.0023, + "num_tokens": 31206177.0, + "reward": 0.75, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 173.08334350585938, + "completions/mean_terminated_length": 173.08334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3960703205791106, + "grad_norm": 3.022792894461881, + "kl": 0.08740234375, + "learning_rate": 6.619049192628924e-07, + "loss": 0.0035, + "num_tokens": 31287907.0, + "reward": 0.375, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 133.70834350585938, + "completions/mean_terminated_length": 133.70834350585938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.39710444674250256, + "grad_norm": 2.7266764014944096, + "kl": 0.0771484375, + "learning_rate": 6.603671851657634e-07, + "loss": 0.0031, + "num_tokens": 31365276.0, + "reward": 1.1041667461395264, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.25448867678642273, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 115.70833587646484, + "completions/mean_terminated_length": 115.70833587646484, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.3981385729058945, + "grad_norm": 3.4941613374593965, + "kl": 0.09228515625, + "learning_rate": 6.588277584390725e-07, + "loss": 0.0037, + "num_tokens": 31442029.0, + "reward": 0.75, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 135.83334350585938, + "completions/mean_terminated_length": 135.83334350585938, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.39917269906928643, + "grad_norm": 3.558030153703449, + "kl": 0.09619140625, + "learning_rate": 6.572866553310265e-07, + "loss": 0.0039, + "num_tokens": 31525481.0, + "reward": 1.0416667461395264, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 132.0, + "completions/mean_terminated_length": 132.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.40020682523267836, + "grad_norm": 3.432909868088115, + "kl": 0.052734375, + "learning_rate": 6.557438921075258e-07, + "loss": 0.0021, + "num_tokens": 31603905.0, + "reward": 0.5, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 152.625, + "completions/mean_terminated_length": 152.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.4012409513960703, + "grad_norm": 3.199320861926171, + "kl": 0.08251953125, + "learning_rate": 6.541994850519933e-07, + "loss": 0.0033, + "num_tokens": 31684208.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 184.45834350585938, + "completions/mean_terminated_length": 184.45834350585938, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.40227507755946224, + "grad_norm": 2.917820029257383, + "kl": 0.08935546875, + "learning_rate": 6.526534504652013e-07, + "loss": 0.0036, + "num_tokens": 31768707.0, + "reward": 0.9166666865348816, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.6370220184326172, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 147.6666717529297, + "completions/mean_terminated_length": 147.6666717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.40330920372285417, + "grad_norm": 2.3537166439553805, + "kl": 0.0615234375, + "learning_rate": 6.511058046651011e-07, + "loss": 0.0025, + "num_tokens": 31847507.0, + "reward": 0.7916666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 183.95834350585938, + "completions/mean_terminated_length": 183.95834350585938, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.4043433298862461, + "grad_norm": 2.9414248666748626, + "kl": 0.11572265625, + "learning_rate": 6.49556563986649e-07, + "loss": 0.0046, + "num_tokens": 31932354.0, + "reward": 0.9583333730697632, + "reward_std": 0.2832478880882263, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 158.33334350585938, + "completions/mean_terminated_length": 158.33334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.40537745604963804, + "grad_norm": 4.303254887652743, + "kl": 0.07666015625, + "learning_rate": 6.480057447816355e-07, + "loss": 0.0031, + "num_tokens": 32017602.0, + "reward": 0.7291666865348816, + "reward_std": 0.48977774381637573, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 190.20834350585938, + "completions/mean_terminated_length": 190.20834350585938, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.40641158221303, + "grad_norm": 3.84726477694061, + "kl": 0.099609375, + "learning_rate": 6.464533634185117e-07, + "loss": 0.004, + "num_tokens": 32101527.0, + "reward": 0.7847222685813904, + "reward_std": 0.3665553033351898, + "rewards/reasoning_reward/mean": 0.7847222685813904, + "rewards/reasoning_reward/std": 0.5394557118415833, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4074457083764219, + "grad_norm": 3.5955314731898214, + "kl": 0.087890625, + "learning_rate": 6.448994362822167e-07, + "loss": 0.0035, + "num_tokens": 32181610.0, + "reward": 0.7916666865348816, + "reward_std": 0.3917974829673767, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.40847983453981385, + "grad_norm": 2.047840627858506, + "kl": 0.087890625, + "learning_rate": 6.433439797740049e-07, + "loss": 0.0035, + "num_tokens": 32258896.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 155.4166717529297, + "completions/mean_terminated_length": 155.4166717529297, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.4095139607032058, + "grad_norm": 3.1714239189466853, + "kl": 0.11279296875, + "learning_rate": 6.417870103112731e-07, + "loss": 0.0045, + "num_tokens": 32341714.0, + "reward": 1.5069444179534912, + "reward_std": 0.2113002985715866, + "rewards/reasoning_reward/mean": 1.5069442987442017, + "rewards/reasoning_reward/std": 0.44363224506378174, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 162.5416717529297, + "completions/mean_terminated_length": 165.60870361328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.4105480868665977, + "grad_norm": 3.3579511466794707, + "kl": 0.08154296875, + "learning_rate": 6.402285443273865e-07, + "loss": 0.0033, + "num_tokens": 32426735.0, + "reward": 1.25, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.5316095352172852, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 148.25, + "completions/mean_terminated_length": 148.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.41158221302998965, + "grad_norm": 3.718010993661941, + "kl": 0.0771484375, + "learning_rate": 6.386685982715056e-07, + "loss": 0.0031, + "num_tokens": 32511453.0, + "reward": 1.25, + "reward_std": 0.3120119273662567, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.5897678136825562, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 129.125, + "completions/mean_terminated_length": 129.52174377441406, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4126163391933816, + "grad_norm": 2.040886107836913, + "kl": 0.0830078125, + "learning_rate": 6.371071886084132e-07, + "loss": 0.0033, + "num_tokens": 32597216.0, + "reward": 1.0416667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.20412413775920868, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.4136504653567735, + "grad_norm": 3.8617028180789013, + "kl": 0.09326171875, + "learning_rate": 6.355443318183394e-07, + "loss": 0.0037, + "num_tokens": 32682504.0, + "reward": 0.7708333730697632, + "reward_std": 0.21322892606258392, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.32900264859199524, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 155.20834350585938, + "completions/mean_terminated_length": 155.20834350585938, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.41468459152016546, + "grad_norm": 3.8146003140109523, + "kl": 0.10205078125, + "learning_rate": 6.339800443967884e-07, + "loss": 0.0041, + "num_tokens": 32763605.0, + "reward": 0.9375, + "reward_std": 0.43075722455978394, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.5379611253738403, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 123.0, + "completions/mean_terminated_length": 123.0, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4157187176835574, + "grad_norm": 2.025558543566053, + "kl": 0.07666015625, + "learning_rate": 6.324143428543647e-07, + "loss": 0.0031, + "num_tokens": 32840709.0, + "reward": 1.0625, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.2242136001586914, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 156.4166717529297, + "completions/mean_terminated_length": 156.4166717529297, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.41675284384694933, + "grad_norm": 3.726281910392908, + "kl": 0.12158203125, + "learning_rate": 6.308472437165982e-07, + "loss": 0.0049, + "num_tokens": 32934543.0, + "reward": 1.6805555820465088, + "reward_std": 0.28358834981918335, + "rewards/reasoning_reward/mean": 1.6805554628372192, + "rewards/reasoning_reward/std": 0.3143764138221741, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 146.6666717529297, + "completions/mean_terminated_length": 146.6666717529297, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.41778697001034126, + "grad_norm": 4.354273396992502, + "kl": 0.078125, + "learning_rate": 6.292787635237699e-07, + "loss": 0.0031, + "num_tokens": 33015679.0, + "reward": 0.6666666865348816, + "reward_std": 0.4993361234664917, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.6370220184326172, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 173.58334350585938, + "completions/mean_terminated_length": 173.58334350585938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.4188210961737332, + "grad_norm": 3.983196444253434, + "kl": 0.11767578125, + "learning_rate": 6.277089188307378e-07, + "loss": 0.0047, + "num_tokens": 33110301.0, + "reward": 1.375, + "reward_std": 0.483431339263916, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.6796738505363464, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 138.7916717529297, + "completions/mean_terminated_length": 138.7916717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.41985522233712513, + "grad_norm": 3.0815905327353788, + "kl": 0.07763671875, + "learning_rate": 6.261377262067615e-07, + "loss": 0.0031, + "num_tokens": 33186976.0, + "reward": 0.625, + "reward_std": 0.2553258538246155, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.5757792592048645, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 142.625, + "completions/mean_terminated_length": 142.625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.42088934850051707, + "grad_norm": 3.2668731146325487, + "kl": 0.111328125, + "learning_rate": 6.245652022353276e-07, + "loss": 0.0045, + "num_tokens": 33269199.0, + "reward": 0.7291666865348816, + "reward_std": 0.38959476351737976, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 140.875, + "completions/mean_terminated_length": 136.95652770996094, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.421923474663909, + "grad_norm": 2.7293326482277154, + "kl": 0.138671875, + "learning_rate": 6.229913635139748e-07, + "loss": 0.0056, + "num_tokens": 33357652.0, + "reward": 1.3333333730697632, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.9168313145637512, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 131.5, + "completions/mean_terminated_length": 131.5, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.42295760082730094, + "grad_norm": 3.6612477267420185, + "kl": 0.08984375, + "learning_rate": 6.214162266541187e-07, + "loss": 0.0036, + "num_tokens": 33437576.0, + "reward": 0.7083333730697632, + "reward_std": 0.3907342851161957, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 120.5, + "completions/mean_terminated_length": 120.5, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4239917269906929, + "grad_norm": 0.3608860547487124, + "kl": 0.06689453125, + "learning_rate": 6.198398082808763e-07, + "loss": 0.0027, + "num_tokens": 33518076.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 159.625, + "completions/mean_terminated_length": 159.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.4250258531540848, + "grad_norm": 1.8961111171035863, + "kl": 0.06787109375, + "learning_rate": 6.182621250328905e-07, + "loss": 0.0027, + "num_tokens": 33603971.0, + "reward": 1.25, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 121.16667175292969, + "completions/mean_terminated_length": 121.16667175292969, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.42605997931747674, + "grad_norm": 3.9688045476328773, + "kl": 0.08642578125, + "learning_rate": 6.166831935621546e-07, + "loss": 0.0035, + "num_tokens": 33684095.0, + "reward": 1.0, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 166.70834350585938, + "completions/mean_terminated_length": 166.70834350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.4270941054808687, + "grad_norm": 3.51307920136012, + "kl": 0.072265625, + "learning_rate": 6.151030305338367e-07, + "loss": 0.0029, + "num_tokens": 33763608.0, + "reward": 0.9166666865348816, + "reward_std": 0.355445921421051, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.5450701117515564, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 142.7916717529297, + "completions/mean_terminated_length": 142.7916717529297, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.4281282316442606, + "grad_norm": 2.4788553923664116, + "kl": 0.08642578125, + "learning_rate": 6.135216526261036e-07, + "loss": 0.0035, + "num_tokens": 33848787.0, + "reward": 0.9791666865348816, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.8139966726303101, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 150.125, + "completions/mean_terminated_length": 150.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.42916235780765255, + "grad_norm": 2.8834248799500073, + "kl": 0.10009765625, + "learning_rate": 6.119390765299447e-07, + "loss": 0.004, + "num_tokens": 33931398.0, + "reward": 0.5902777910232544, + "reward_std": 0.19089612364768982, + "rewards/reasoning_reward/mean": 0.5902777910232544, + "rewards/reasoning_reward/std": 0.4661984443664551, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 157.1666717529297, + "completions/mean_terminated_length": 157.1666717529297, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4301964839710445, + "grad_norm": 3.9268847081933598, + "kl": 0.0859375, + "learning_rate": 6.103553189489959e-07, + "loss": 0.0034, + "num_tokens": 34013978.0, + "reward": 1.7430557012557983, + "reward_std": 0.36321234703063965, + "rewards/reasoning_reward/mean": 1.7430557012557983, + "rewards/reasoning_reward/std": 0.3961748480796814, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 179.83334350585938, + "completions/mean_terminated_length": 179.83334350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4312306101344364, + "grad_norm": 2.896040987170424, + "kl": 0.09619140625, + "learning_rate": 6.087703965993636e-07, + "loss": 0.0038, + "num_tokens": 34098558.0, + "reward": 1.3333333730697632, + "reward_std": 0.26726123690605164, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.458415687084198, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.43226473629782836, + "grad_norm": 3.0726708113736834, + "kl": 0.08642578125, + "learning_rate": 6.071843262094476e-07, + "loss": 0.0035, + "num_tokens": 34176129.0, + "reward": 0.7708333730697632, + "reward_std": 0.4130779504776001, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 158.2916717529297, + "completions/mean_terminated_length": 158.2916717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.4332988624612203, + "grad_norm": 3.651158491177238, + "kl": 0.115234375, + "learning_rate": 6.055971245197652e-07, + "loss": 0.0046, + "num_tokens": 34258992.0, + "reward": 1.1319445371627808, + "reward_std": 0.40136003494262695, + "rewards/reasoning_reward/mean": 1.1319445371627808, + "rewards/reasoning_reward/std": 0.6407633423805237, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 165.75, + "completions/mean_terminated_length": 165.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4343329886246122, + "grad_norm": 3.011267910041691, + "kl": 0.1337890625, + "learning_rate": 6.040088082827744e-07, + "loss": 0.0053, + "num_tokens": 34342034.0, + "reward": 0.9791666865348816, + "reward_std": 0.43203312158584595, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.9609683156013489, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 142.125, + "completions/mean_terminated_length": 142.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.43536711478800416, + "grad_norm": 3.934648597506697, + "kl": 0.048828125, + "learning_rate": 6.024193942626961e-07, + "loss": 0.002, + "num_tokens": 34423189.0, + "reward": 0.4583333432674408, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 0.4583333432674408, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 146.0, + "completions/mean_terminated_length": 146.0, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4364012409513961, + "grad_norm": 2.895700821484859, + "kl": 0.103515625, + "learning_rate": 6.008288992353396e-07, + "loss": 0.0041, + "num_tokens": 34500773.0, + "reward": 0.625, + "reward_std": 0.19416078925132751, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.5565811395645142, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 154.5416717529297, + "completions/mean_terminated_length": 154.5416717529297, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.43743536711478803, + "grad_norm": 3.7713820783469303, + "kl": 0.07177734375, + "learning_rate": 5.99237339987922e-07, + "loss": 0.0029, + "num_tokens": 34577722.0, + "reward": 0.5625, + "reward_std": 0.5429885983467102, + "rewards/reasoning_reward/mean": 0.5625, + "rewards/reasoning_reward/std": 0.5379611253738403, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 148.70834350585938, + "completions/mean_terminated_length": 148.70834350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4384694932781799, + "grad_norm": 3.373544653686179, + "kl": 0.050537109375, + "learning_rate": 5.976447333188944e-07, + "loss": 0.002, + "num_tokens": 34652923.0, + "reward": 0.5, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 147.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.43950361944157185, + "grad_norm": 4.7102217326975415, + "kl": 0.09033203125, + "learning_rate": 5.960510960377626e-07, + "loss": 0.0036, + "num_tokens": 34736582.0, + "reward": 0.9791666865348816, + "reward_std": 0.354950875043869, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.3753018081188202, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 153.95834350585938, + "completions/mean_terminated_length": 153.95834350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.4405377456049638, + "grad_norm": 4.16978047474189, + "kl": 0.10693359375, + "learning_rate": 5.944564449649099e-07, + "loss": 0.0043, + "num_tokens": 34813797.0, + "reward": 0.9583333730697632, + "reward_std": 0.5391935706138611, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.6743220090866089, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 128.875, + "completions/mean_terminated_length": 128.875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4415718717683557, + "grad_norm": 4.140299837667284, + "kl": 0.10791015625, + "learning_rate": 5.928607969314201e-07, + "loss": 0.0043, + "num_tokens": 34894010.0, + "reward": 0.75, + "reward_std": 0.47364258766174316, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 124.45833587646484, + "completions/mean_terminated_length": 124.45833587646484, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.44260599793174765, + "grad_norm": 3.8213113040181987, + "kl": 0.064453125, + "learning_rate": 5.912641687789002e-07, + "loss": 0.0026, + "num_tokens": 34968525.0, + "reward": 0.625, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 136.4166717529297, + "completions/mean_terminated_length": 136.4166717529297, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4436401240951396, + "grad_norm": 2.733283075630004, + "kl": 0.058837890625, + "learning_rate": 5.896665773593012e-07, + "loss": 0.0023, + "num_tokens": 35046975.0, + "reward": 1.0625, + "reward_std": 0.08625819534063339, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.1689159870147705, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 151.58334350585938, + "completions/mean_terminated_length": 151.58334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.4446742502585315, + "grad_norm": 2.848539827633773, + "kl": 0.07177734375, + "learning_rate": 5.880680395347418e-07, + "loss": 0.0029, + "num_tokens": 35132493.0, + "reward": 1.2638888359069824, + "reward_std": 0.26332971453666687, + "rewards/reasoning_reward/mean": 1.2638888359069824, + "rewards/reasoning_reward/std": 0.8899827003479004, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 133.0, + "completions/mean_terminated_length": 133.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.44570837642192346, + "grad_norm": 2.221455031748688, + "kl": 0.057861328125, + "learning_rate": 5.864685721773293e-07, + "loss": 0.0023, + "num_tokens": 35212989.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.4467425025853154, + "grad_norm": 3.710718263121241, + "kl": 0.07958984375, + "learning_rate": 5.848681921689819e-07, + "loss": 0.0032, + "num_tokens": 35298725.0, + "reward": 1.1319445371627808, + "reward_std": 0.3816637396812439, + "rewards/reasoning_reward/mean": 1.1319444179534912, + "rewards/reasoning_reward/std": 0.6255030035972595, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 169.08334350585938, + "completions/mean_terminated_length": 169.08334350585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.44777662874870733, + "grad_norm": 3.6503587546386016, + "kl": 0.0810546875, + "learning_rate": 5.832669164012513e-07, + "loss": 0.0032, + "num_tokens": 35381551.0, + "reward": 0.6666666865348816, + "reward_std": 0.4745539426803589, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 173.0416717529297, + "completions/mean_terminated_length": 173.0416717529297, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.44881075491209926, + "grad_norm": 3.4278772902345302, + "kl": 0.07421875, + "learning_rate": 5.816647617751424e-07, + "loss": 0.003, + "num_tokens": 35463120.0, + "reward": 1.1875, + "reward_std": 0.3177132308483124, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.5067479610443115, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.4498448810754912, + "grad_norm": 2.8560665845210957, + "kl": 0.07666015625, + "learning_rate": 5.800617452009375e-07, + "loss": 0.0031, + "num_tokens": 35548048.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 161.1666717529297, + "completions/mean_terminated_length": 161.1666717529297, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.45087900723888313, + "grad_norm": 4.509744320701168, + "kl": 0.119140625, + "learning_rate": 5.784578835980157e-07, + "loss": 0.0048, + "num_tokens": 35642260.0, + "reward": 1.4791667461395264, + "reward_std": 0.41317981481552124, + "rewards/reasoning_reward/mean": 1.4791666269302368, + "rewards/reasoning_reward/std": 0.4293363690376282, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 134.7916717529297, + "completions/mean_terminated_length": 134.7916717529297, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.45191313340227507, + "grad_norm": 2.5466178790890446, + "kl": 0.07080078125, + "learning_rate": 5.768531938946756e-07, + "loss": 0.0028, + "num_tokens": 35719927.0, + "reward": 0.6458333730697632, + "reward_std": 0.13908717036247253, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.5208514928817749, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 148.4166717529297, + "completions/mean_terminated_length": 148.4166717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.452947259565667, + "grad_norm": 4.2489173473223145, + "kl": 0.107421875, + "learning_rate": 5.752476930279557e-07, + "loss": 0.0043, + "num_tokens": 35804233.0, + "reward": 1.2083333730697632, + "reward_std": 0.4671573042869568, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5694518089294434, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 147.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.45398138572905894, + "grad_norm": 4.008875539956656, + "kl": 0.08837890625, + "learning_rate": 5.736413979434566e-07, + "loss": 0.0035, + "num_tokens": 35882052.0, + "reward": 1.0277777910232544, + "reward_std": 0.329608291387558, + "rewards/reasoning_reward/mean": 1.0277777910232544, + "rewards/reasoning_reward/std": 0.3859447240829468, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 155.25, + "completions/mean_terminated_length": 155.25, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.4550155118924509, + "grad_norm": 3.550772237674307, + "kl": 0.091796875, + "learning_rate": 5.720343255951611e-07, + "loss": 0.0037, + "num_tokens": 35961122.0, + "reward": 0.5416666865348816, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 178.1666717529297, + "completions/mean_terminated_length": 178.1666717529297, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.4560496380558428, + "grad_norm": 4.361928899053349, + "kl": 0.11083984375, + "learning_rate": 5.704264929452562e-07, + "loss": 0.0044, + "num_tokens": 36050486.0, + "reward": 1.1597222089767456, + "reward_std": 0.41859424114227295, + "rewards/reasoning_reward/mean": 1.1597222089767456, + "rewards/reasoning_reward/std": 0.7573778033256531, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 158.70834350585938, + "completions/mean_terminated_length": 158.70834350585938, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.45708376421923474, + "grad_norm": 4.446970005435705, + "kl": 0.109375, + "learning_rate": 5.688179169639537e-07, + "loss": 0.0044, + "num_tokens": 36130319.0, + "reward": 0.7083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 165.5, + "completions/mean_terminated_length": 165.5, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.4581178903826267, + "grad_norm": 4.394083775689496, + "kl": 0.138671875, + "learning_rate": 5.672086146293108e-07, + "loss": 0.0056, + "num_tokens": 36219107.0, + "reward": 1.3819444179534912, + "reward_std": 0.48597466945648193, + "rewards/reasoning_reward/mean": 1.3819442987442017, + "rewards/reasoning_reward/std": 0.7428876161575317, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 154.7916717529297, + "completions/mean_terminated_length": 154.7916717529297, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.4591520165460186, + "grad_norm": 3.2132018317716478, + "kl": 0.1513671875, + "learning_rate": 5.65598602927051e-07, + "loss": 0.006, + "num_tokens": 36301518.0, + "reward": 0.9375, + "reward_std": 0.26507532596588135, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.6309499144554138, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 171.95834350585938, + "completions/mean_terminated_length": 171.95834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.46018614270941055, + "grad_norm": 4.320892935715968, + "kl": 0.10791015625, + "learning_rate": 5.639878988503858e-07, + "loss": 0.0043, + "num_tokens": 36377789.0, + "reward": 1.2916667461395264, + "reward_std": 0.436039537191391, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 146.0416717529297, + "completions/mean_terminated_length": 146.0416717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4612202688728025, + "grad_norm": 3.1035680233792466, + "kl": 0.09765625, + "learning_rate": 5.623765193998333e-07, + "loss": 0.0039, + "num_tokens": 36461686.0, + "reward": 1.125, + "reward_std": 0.2553258538246155, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6634888052940369, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 133.45834350585938, + "completions/mean_terminated_length": 133.45834350585938, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.4622543950361944, + "grad_norm": 3.4348649831091187, + "kl": 0.09130859375, + "learning_rate": 5.607644815830412e-07, + "loss": 0.0037, + "num_tokens": 36539977.0, + "reward": 0.930555522441864, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.930555522441864, + "rewards/reasoning_reward/std": 0.405030757188797, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 160.75, + "completions/mean_terminated_length": 160.75, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.46328852119958636, + "grad_norm": 3.0932846774731515, + "kl": 0.050048828125, + "learning_rate": 5.591518024146049e-07, + "loss": 0.002, + "num_tokens": 36621971.0, + "reward": 0.7083333730697632, + "reward_std": 0.243839293718338, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.6064269542694092, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 161.5, + "completions/mean_terminated_length": 161.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.4643226473629783, + "grad_norm": 1.5405981712007193, + "kl": 0.058349609375, + "learning_rate": 5.5753849891589e-07, + "loss": 0.0023, + "num_tokens": 36698823.0, + "reward": 1.1666667461395264, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 139.0, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.4653567735263702, + "grad_norm": 3.2989222982872035, + "kl": 0.080078125, + "learning_rate": 5.559245881148513e-07, + "loss": 0.0032, + "num_tokens": 36779479.0, + "reward": 1.125, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 170.2916717529297, + "completions/mean_terminated_length": 170.2916717529297, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.46639089968976216, + "grad_norm": 3.9459517839159957, + "kl": 0.06591796875, + "learning_rate": 5.543100870458537e-07, + "loss": 0.0026, + "num_tokens": 36855542.0, + "reward": 0.7708333730697632, + "reward_std": 0.3492930829524994, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.4657664895057678, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 151.0416717529297, + "completions/mean_terminated_length": 151.0416717529297, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4674250258531541, + "grad_norm": 3.080169913436108, + "kl": 0.09619140625, + "learning_rate": 5.526950127494918e-07, + "loss": 0.0039, + "num_tokens": 36939599.0, + "reward": 1.3263888359069824, + "reward_std": 0.259171724319458, + "rewards/reasoning_reward/mean": 1.3263888359069824, + "rewards/reasoning_reward/std": 0.5549060106277466, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 144.95834350585938, + "completions/mean_terminated_length": 144.95834350585938, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.46845915201654603, + "grad_norm": 3.478946037062411, + "kl": 0.09619140625, + "learning_rate": 5.510793822724111e-07, + "loss": 0.0039, + "num_tokens": 37024790.0, + "reward": 1.2083333730697632, + "reward_std": 0.2616034746170044, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 159.4166717529297, + "completions/mean_terminated_length": 159.4166717529297, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.46949327817993797, + "grad_norm": 4.69807004009168, + "kl": 0.07421875, + "learning_rate": 5.494632126671268e-07, + "loss": 0.003, + "num_tokens": 37105936.0, + "reward": 0.75, + "reward_std": 0.46631526947021484, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.5897678136825562, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 145.125, + "completions/mean_terminated_length": 145.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.4705274043433299, + "grad_norm": 3.5314961615269183, + "kl": 0.08642578125, + "learning_rate": 5.478465209918449e-07, + "loss": 0.0035, + "num_tokens": 37190787.0, + "reward": 1.2291667461395264, + "reward_std": 0.37612998485565186, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.7658352255821228, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 169.2916717529297, + "completions/mean_terminated_length": 169.2916717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.47156153050672184, + "grad_norm": 3.157082111223005, + "kl": 0.06298828125, + "learning_rate": 5.462293243102815e-07, + "loss": 0.0025, + "num_tokens": 37270178.0, + "reward": 1.0625, + "reward_std": 0.2644323706626892, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 144.70834350585938, + "completions/mean_terminated_length": 144.70834350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.4725956566701138, + "grad_norm": 2.2530317338339554, + "kl": 0.09326171875, + "learning_rate": 5.44611639691483e-07, + "loss": 0.0037, + "num_tokens": 37356739.0, + "reward": 1.3333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 129.58334350585938, + "completions/mean_terminated_length": 129.58334350585938, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.4736297828335057, + "grad_norm": 3.530213579949582, + "kl": 0.06884765625, + "learning_rate": 5.429934842096453e-07, + "loss": 0.0028, + "num_tokens": 37444233.0, + "reward": 1.3333333730697632, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 453.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 159.45834350585938, + "completions/mean_terminated_length": 146.69564819335938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.47466390899689764, + "grad_norm": 10.937724197833212, + "kl": 0.07373046875, + "learning_rate": 5.41374874943935e-07, + "loss": 0.0029, + "num_tokens": 37530724.0, + "reward": 0.8958333730697632, + "reward_std": 0.15268757939338684, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.7515081763267517, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 126.41667175292969, + "completions/mean_terminated_length": 126.41667175292969, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.4756980351602896, + "grad_norm": 0.23424191596610372, + "kl": 0.07568359375, + "learning_rate": 5.397558289783079e-07, + "loss": 0.003, + "num_tokens": 37614622.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 144.2916717529297, + "completions/mean_terminated_length": 144.2916717529297, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.4767321613236815, + "grad_norm": 3.5303048844343343, + "kl": 0.068359375, + "learning_rate": 5.381363634013285e-07, + "loss": 0.0027, + "num_tokens": 37697165.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 127.625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.47776628748707345, + "grad_norm": 2.4796497555216117, + "kl": 0.042236328125, + "learning_rate": 5.365164953059911e-07, + "loss": 0.0017, + "num_tokens": 37776012.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 145.0, + "completions/mean_terminated_length": 145.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4788004136504654, + "grad_norm": 3.9203231363884554, + "kl": 0.07177734375, + "learning_rate": 5.348962417895378e-07, + "loss": 0.0029, + "num_tokens": 37852260.0, + "reward": 1.1041667461395264, + "reward_std": 0.3027648627758026, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.7798991799354553, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 128.33334350585938, + "completions/mean_terminated_length": 128.33334350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.47983453981385726, + "grad_norm": 2.3870572364816045, + "kl": 0.0498046875, + "learning_rate": 5.332756199532791e-07, + "loss": 0.002, + "num_tokens": 37932140.0, + "reward": 1.1666667461395264, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 159.45834350585938, + "completions/mean_terminated_length": 159.45834350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.4808686659772492, + "grad_norm": 3.4551439032369498, + "kl": 0.0703125, + "learning_rate": 5.316546469024127e-07, + "loss": 0.0028, + "num_tokens": 38008975.0, + "reward": 0.8333333730697632, + "reward_std": 0.3493061661720276, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.5835920572280884, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 151.58334350585938, + "completions/mean_terminated_length": 151.58334350585938, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.48190279214064113, + "grad_norm": 2.2638186168786993, + "kl": 0.050048828125, + "learning_rate": 5.300333397458436e-07, + "loss": 0.002, + "num_tokens": 38086597.0, + "reward": 0.7291666865348816, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.4418136179447174, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 135.6666717529297, + "completions/mean_terminated_length": 135.6666717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.48293691830403307, + "grad_norm": 3.4589069312702914, + "kl": 0.0751953125, + "learning_rate": 5.284117155960025e-07, + "loss": 0.003, + "num_tokens": 38165749.0, + "reward": 0.6666666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 145.83334350585938, + "completions/mean_terminated_length": 145.83334350585938, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.483971044467425, + "grad_norm": 3.777619829419609, + "kl": 0.056640625, + "learning_rate": 5.267897915686668e-07, + "loss": 0.0023, + "num_tokens": 38247225.0, + "reward": 1.0833333730697632, + "reward_std": 0.28029152750968933, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.458415687084198, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 158.6666717529297, + "completions/mean_terminated_length": 158.6666717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.48500517063081694, + "grad_norm": 3.5113858847079924, + "kl": 0.08740234375, + "learning_rate": 5.251675847827784e-07, + "loss": 0.0035, + "num_tokens": 38331265.0, + "reward": 0.9375, + "reward_std": 0.2725489139556885, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.8884831070899963, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4860392967942089, + "grad_norm": 3.0917385670170185, + "kl": 0.07177734375, + "learning_rate": 5.235451123602641e-07, + "loss": 0.0029, + "num_tokens": 38408302.0, + "reward": 0.9375, + "reward_std": 0.29339051246643066, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.7705517411231995, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 127.5, + "completions/mean_terminated_length": 127.5, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4870734229576008, + "grad_norm": 0.3175955227430425, + "kl": 0.06494140625, + "learning_rate": 5.219223914258538e-07, + "loss": 0.0026, + "num_tokens": 38491450.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 150.125, + "completions/mean_terminated_length": 150.125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.48810754912099275, + "grad_norm": 2.4136738947782126, + "kl": 0.06884765625, + "learning_rate": 5.202994391069008e-07, + "loss": 0.0028, + "num_tokens": 38570469.0, + "reward": 0.6111111640930176, + "reward_std": 0.23193079233169556, + "rewards/reasoning_reward/mean": 0.6111111044883728, + "rewards/reasoning_reward/std": 0.589084804058075, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 166.5416717529297, + "completions/mean_terminated_length": 166.5416717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4891416752843847, + "grad_norm": 3.1274290142900023, + "kl": 0.08984375, + "learning_rate": 5.186762725332008e-07, + "loss": 0.0036, + "num_tokens": 38655282.0, + "reward": 0.9791666865348816, + "reward_std": 0.2717381715774536, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.8272421956062317, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4901758014477766, + "grad_norm": 4.7856676348279015, + "kl": 0.07958984375, + "learning_rate": 5.170529088368103e-07, + "loss": 0.0032, + "num_tokens": 38745685.0, + "reward": 0.7083333730697632, + "reward_std": 0.4939148426055908, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 165.9166717529297, + "completions/mean_terminated_length": 165.9166717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.49120992761116855, + "grad_norm": 3.991471510599265, + "kl": 0.07568359375, + "learning_rate": 5.154293651518666e-07, + "loss": 0.003, + "num_tokens": 38822715.0, + "reward": 0.7777777910232544, + "reward_std": 0.44400954246520996, + "rewards/reasoning_reward/mean": 0.7777777314186096, + "rewards/reasoning_reward/std": 0.6344891786575317, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4922440537745605, + "grad_norm": 3.6198040700819236, + "kl": 0.091796875, + "learning_rate": 5.138056586144071e-07, + "loss": 0.0037, + "num_tokens": 38905196.0, + "reward": 1.0625, + "reward_std": 0.33768826723098755, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 159.58334350585938, + "completions/mean_terminated_length": 159.58334350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.4932781799379524, + "grad_norm": 3.4082780232755794, + "kl": 0.0556640625, + "learning_rate": 5.121818063621877e-07, + "loss": 0.0022, + "num_tokens": 38995162.0, + "reward": 1.2291667461395264, + "reward_std": 0.36204060912132263, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.49431230610134436, + "grad_norm": 3.0224789729725345, + "kl": 0.07373046875, + "learning_rate": 5.105578255345021e-07, + "loss": 0.003, + "num_tokens": 39077227.0, + "reward": 0.8125, + "reward_std": 0.22466278076171875, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.7042186260223389, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 160.0416717529297, + "completions/mean_terminated_length": 160.0416717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.4953464322647363, + "grad_norm": 3.053679665790895, + "kl": 0.07861328125, + "learning_rate": 5.089337332720016e-07, + "loss": 0.0031, + "num_tokens": 39165884.0, + "reward": 1.3125, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.8945790529251099, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 162.08334350585938, + "completions/mean_terminated_length": 162.08334350585938, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4963805584281282, + "grad_norm": 3.1381029630629733, + "kl": 0.06640625, + "learning_rate": 5.073095467165134e-07, + "loss": 0.0027, + "num_tokens": 39248414.0, + "reward": 1.0555555820465088, + "reward_std": 0.22960862517356873, + "rewards/reasoning_reward/mean": 1.0555554628372192, + "rewards/reasoning_reward/std": 0.5766525864601135, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 148.75, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.49741468459152016, + "grad_norm": 2.6932656248524913, + "kl": 0.09765625, + "learning_rate": 5.056852830108598e-07, + "loss": 0.0039, + "num_tokens": 39330824.0, + "reward": 1.5, + "reward_std": 0.28029152750968933, + "rewards/reasoning_reward/mean": 1.5, + "rewards/reasoning_reward/std": 0.48900964856147766, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.4984488107549121, + "grad_norm": 4.963868420618398, + "kl": 0.08935546875, + "learning_rate": 5.040609592986775e-07, + "loss": 0.0036, + "num_tokens": 39413486.0, + "reward": 0.5208333730697632, + "reward_std": 0.36753225326538086, + "rewards/reasoning_reward/mean": 0.5208333134651184, + "rewards/reasoning_reward/std": 0.6507381796836853, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 153.25, + "completions/mean_terminated_length": 153.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.49948293691830403, + "grad_norm": 0.19760209135973245, + "kl": 0.049560546875, + "learning_rate": 5.024365927242367e-07, + "loss": 0.002, + "num_tokens": 39493220.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 145.2916717529297, + "completions/mean_terminated_length": 145.2916717529297, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.500517063081696, + "grad_norm": 4.265668637720173, + "kl": 0.09423828125, + "learning_rate": 5.008122004322597e-07, + "loss": 0.0038, + "num_tokens": 39575411.0, + "reward": 1.0208333730697632, + "reward_std": 0.45082372426986694, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.47729235887527466, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 155.875, + "completions/mean_terminated_length": 155.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.5015511892450879, + "grad_norm": 3.577115117621814, + "kl": 0.0791015625, + "learning_rate": 4.991877995677404e-07, + "loss": 0.0032, + "num_tokens": 39659448.0, + "reward": 1.2083333730697632, + "reward_std": 0.367926687002182, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.721060037612915, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 131.4166717529297, + "completions/mean_terminated_length": 131.4166717529297, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.5025853154084798, + "grad_norm": 3.025540625801558, + "kl": 0.07470703125, + "learning_rate": 4.975634072757634e-07, + "loss": 0.003, + "num_tokens": 39742018.0, + "reward": 1.4166667461395264, + "reward_std": 0.19500279426574707, + "rewards/reasoning_reward/mean": 1.4166666269302368, + "rewards/reasoning_reward/std": 0.524749755859375, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 167.08334350585938, + "completions/mean_terminated_length": 167.08334350585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.5036194415718718, + "grad_norm": 3.4418671262934426, + "kl": 0.09375, + "learning_rate": 4.959390407013226e-07, + "loss": 0.0038, + "num_tokens": 39819180.0, + "reward": 1.0, + "reward_std": 0.42724665999412537, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.5046535677352637, + "grad_norm": 3.319472739975236, + "kl": 0.06884765625, + "learning_rate": 4.943147169891402e-07, + "loss": 0.0028, + "num_tokens": 39896373.0, + "reward": 1.0416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 162.9166717529297, + "completions/mean_terminated_length": 162.9166717529297, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5056876938986556, + "grad_norm": 3.3304049158014823, + "kl": 0.054931640625, + "learning_rate": 4.926904532834866e-07, + "loss": 0.0022, + "num_tokens": 39984259.0, + "reward": 0.375, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 151.5416717529297, + "completions/mean_terminated_length": 151.5416717529297, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.5067218200620476, + "grad_norm": 0.18358736944603146, + "kl": 0.0576171875, + "learning_rate": 4.910662667279983e-07, + "loss": 0.0023, + "num_tokens": 40060984.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.5077559462254395, + "grad_norm": 3.9194876650899495, + "kl": 0.061767578125, + "learning_rate": 4.894421744654979e-07, + "loss": 0.0025, + "num_tokens": 40147480.0, + "reward": 0.8125, + "reward_std": 0.4737785756587982, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.6726408004760742, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 144.75, + "completions/mean_terminated_length": 144.75, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.5087900723888314, + "grad_norm": 3.096539621007116, + "kl": 0.07666015625, + "learning_rate": 4.878181936378124e-07, + "loss": 0.0031, + "num_tokens": 40224658.0, + "reward": 1.2291667461395264, + "reward_std": 0.30366337299346924, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.45792141556739807, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 170.95834350585938, + "completions/mean_terminated_length": 170.95834350585938, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5098241985522234, + "grad_norm": 3.575630215884605, + "kl": 0.06396484375, + "learning_rate": 4.861943413855928e-07, + "loss": 0.0026, + "num_tokens": 40301689.0, + "reward": 0.6666666865348816, + "reward_std": 0.3493061661720276, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.5450701117515564, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 143.70834350585938, + "completions/mean_terminated_length": 143.70834350585938, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.5108583247156153, + "grad_norm": 3.7344523341009213, + "kl": 0.044677734375, + "learning_rate": 4.845706348481333e-07, + "loss": 0.0018, + "num_tokens": 40381394.0, + "reward": 0.7083333730697632, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 160.2916717529297, + "completions/mean_terminated_length": 160.2916717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5118924508790073, + "grad_norm": 1.866445752533792, + "kl": 0.11328125, + "learning_rate": 4.829470911631898e-07, + "loss": 0.0045, + "num_tokens": 40458361.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 135.33334350585938, + "completions/mean_terminated_length": 135.33334350585938, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.5129265770423992, + "grad_norm": 4.4003611615962095, + "kl": 0.080078125, + "learning_rate": 4.813237274667993e-07, + "loss": 0.0032, + "num_tokens": 40548409.0, + "reward": 1.2083333730697632, + "reward_std": 0.48112308979034424, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.8329709768295288, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 152.7916717529297, + "completions/mean_terminated_length": 152.7916717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5139607032057911, + "grad_norm": 3.3720583603504815, + "kl": 0.046142578125, + "learning_rate": 4.797005608930991e-07, + "loss": 0.0018, + "num_tokens": 40628820.0, + "reward": 0.7083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 142.58334350585938, + "completions/mean_terminated_length": 142.58334350585938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5149948293691831, + "grad_norm": 0.15329245374819234, + "kl": 0.045166015625, + "learning_rate": 4.780776085741462e-07, + "loss": 0.0018, + "num_tokens": 40704098.0, + "reward": 1.1666667461395264, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.24077169597148895, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 165.33334350585938, + "completions/mean_terminated_length": 165.33334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.516028955532575, + "grad_norm": 3.4000622681936936, + "kl": 0.08349609375, + "learning_rate": 4.76454887639736e-07, + "loss": 0.0033, + "num_tokens": 40787242.0, + "reward": 1.0833333730697632, + "reward_std": 0.41387641429901123, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5835920572280884, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 152.20834350585938, + "completions/mean_terminated_length": 152.20834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.5170630816959669, + "grad_norm": 0.1571488063532437, + "kl": 0.048583984375, + "learning_rate": 4.7483241521722154e-07, + "loss": 0.0019, + "num_tokens": 40865215.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 138.70834350585938, + "completions/mean_terminated_length": 138.70834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.5180972078593589, + "grad_norm": 2.487583254626514, + "kl": 0.06689453125, + "learning_rate": 4.7321020843133326e-07, + "loss": 0.0027, + "num_tokens": 40950344.0, + "reward": 1.3402776718139648, + "reward_std": 0.09820928424596786, + "rewards/reasoning_reward/mean": 1.3402776718139648, + "rewards/reasoning_reward/std": 0.4622962176799774, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 145.0416717529297, + "completions/mean_terminated_length": 145.0416717529297, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5191313340227508, + "grad_norm": 4.3586596392892565, + "kl": 0.0693359375, + "learning_rate": 4.7158828440399747e-07, + "loss": 0.0028, + "num_tokens": 41034001.0, + "reward": 1.2361111640930176, + "reward_std": 0.382678747177124, + "rewards/reasoning_reward/mean": 1.236111044883728, + "rewards/reasoning_reward/std": 0.7691463828086853, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 150.58334350585938, + "completions/mean_terminated_length": 150.58334350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5201654601861427, + "grad_norm": 3.884759725833334, + "kl": 0.059326171875, + "learning_rate": 4.699666602541565e-07, + "loss": 0.0024, + "num_tokens": 41110591.0, + "reward": 0.75, + "reward_std": 0.44819486141204834, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.7372097969055176, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 176.5, + "completions/mean_terminated_length": 176.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.5211995863495347, + "grad_norm": 3.6687504446873556, + "kl": 0.06884765625, + "learning_rate": 4.683453530975872e-07, + "loss": 0.0028, + "num_tokens": 41188139.0, + "reward": 0.75, + "reward_std": 0.45045679807662964, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.5897678136825562, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.5222337125129266, + "grad_norm": 4.270390001851114, + "kl": 0.056396484375, + "learning_rate": 4.6672438004672074e-07, + "loss": 0.0023, + "num_tokens": 41265292.0, + "reward": 0.8958333730697632, + "reward_std": 0.24185511469841003, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.29411497712135315, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 160.83334350585938, + "completions/mean_terminated_length": 160.83334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5232678386763185, + "grad_norm": 3.860866061268796, + "kl": 0.07177734375, + "learning_rate": 4.6510375821046204e-07, + "loss": 0.0029, + "num_tokens": 41349392.0, + "reward": 1.1666667461395264, + "reward_std": 0.47301241755485535, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.6915640830993652, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 132.4166717529297, + "completions/mean_terminated_length": 132.4166717529297, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.5243019648397105, + "grad_norm": 3.5645240907272, + "kl": 0.050048828125, + "learning_rate": 4.6348350469400885e-07, + "loss": 0.002, + "num_tokens": 41428746.0, + "reward": 0.7916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 162.58334350585938, + "completions/mean_terminated_length": 162.58334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5253360910031024, + "grad_norm": 4.091927437075089, + "kl": 0.06396484375, + "learning_rate": 4.618636365986714e-07, + "loss": 0.0026, + "num_tokens": 41509568.0, + "reward": 0.7291666865348816, + "reward_std": 0.4690367579460144, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.5706435441970825, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 151.70834350585938, + "completions/mean_terminated_length": 151.70834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.5263702171664943, + "grad_norm": 3.4807276011577883, + "kl": 0.07177734375, + "learning_rate": 4.602441710216922e-07, + "loss": 0.0029, + "num_tokens": 41588865.0, + "reward": 1.25, + "reward_std": 0.3314744830131531, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.675663948059082, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 123.95833587646484, + "completions/mean_terminated_length": 123.95833587646484, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5274043433298863, + "grad_norm": 3.5624810608659483, + "kl": 0.0751953125, + "learning_rate": 4.586251250560648e-07, + "loss": 0.003, + "num_tokens": 41672816.0, + "reward": 0.9166666865348816, + "reward_std": 0.3493061661720276, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.434057354927063, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 129.125, + "completions/mean_terminated_length": 129.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.5284384694932782, + "grad_norm": 2.1052711972616454, + "kl": 0.07275390625, + "learning_rate": 4.5700651579035453e-07, + "loss": 0.0029, + "num_tokens": 41754563.0, + "reward": 1.1666667461395264, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.8164966106414795, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 147.58334350585938, + "completions/mean_terminated_length": 147.58334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5294725956566702, + "grad_norm": 3.6467154627194254, + "kl": 0.06982421875, + "learning_rate": 4.55388360308517e-07, + "loss": 0.0028, + "num_tokens": 41836945.0, + "reward": 1.0833333730697632, + "reward_std": 0.4629100561141968, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.7755315899848938, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 142.20834350585938, + "completions/mean_terminated_length": 142.20834350585938, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5305067218200621, + "grad_norm": 2.9628853082893154, + "kl": 0.07861328125, + "learning_rate": 4.5377067568971837e-07, + "loss": 0.0031, + "num_tokens": 41925614.0, + "reward": 1.1875, + "reward_std": 0.183963343501091, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.28788962960243225, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 166.5416717529297, + "completions/mean_terminated_length": 166.5416717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.531540847983454, + "grad_norm": 1.7751423102606667, + "kl": 0.040771484375, + "learning_rate": 4.521534790081549e-07, + "loss": 0.0016, + "num_tokens": 42009907.0, + "reward": 1.0208333730697632, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.10206207633018494, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 157.70834350585938, + "completions/mean_terminated_length": 157.70834350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.532574974146846, + "grad_norm": 2.6567778397813617, + "kl": 0.0771484375, + "learning_rate": 4.505367873328731e-07, + "loss": 0.0031, + "num_tokens": 42100620.0, + "reward": 1.5208333730697632, + "reward_std": 0.21309106051921844, + "rewards/reasoning_reward/mean": 1.5208333730697632, + "rewards/reasoning_reward/std": 0.453948050737381, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 138.9166717529297, + "completions/mean_terminated_length": 138.9166717529297, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5336091003102379, + "grad_norm": 3.1909559071085325, + "kl": 0.0458984375, + "learning_rate": 4.489206177275889e-07, + "loss": 0.0018, + "num_tokens": 42185474.0, + "reward": 1.125, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 151.33334350585938, + "completions/mean_terminated_length": 151.33334350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.5346432264736298, + "grad_norm": 2.715317039850287, + "kl": 0.06982421875, + "learning_rate": 4.473049872505081e-07, + "loss": 0.0028, + "num_tokens": 42269130.0, + "reward": 1.0416667461395264, + "reward_std": 0.20693820714950562, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.721060037612915, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 133.7916717529297, + "completions/mean_terminated_length": 133.7916717529297, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.5356773526370218, + "grad_norm": 4.433835698458295, + "kl": 0.07763671875, + "learning_rate": 4.4568991295414637e-07, + "loss": 0.0031, + "num_tokens": 42358405.0, + "reward": 1.1666667461395264, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 126.04167175292969, + "completions/mean_terminated_length": 126.04167175292969, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.5367114788004137, + "grad_norm": 2.490946137525958, + "kl": 0.07861328125, + "learning_rate": 4.440754118851486e-07, + "loss": 0.0032, + "num_tokens": 42437086.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 147.125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5377456049638056, + "grad_norm": 4.116295710152532, + "kl": 0.0703125, + "learning_rate": 4.424615010841099e-07, + "loss": 0.0028, + "num_tokens": 42521801.0, + "reward": 1.0138888359069824, + "reward_std": 0.4999142587184906, + "rewards/reasoning_reward/mean": 1.0138888359069824, + "rewards/reasoning_reward/std": 0.547097384929657, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.5387797311271976, + "grad_norm": 3.031621149484474, + "kl": 0.10595703125, + "learning_rate": 4.4084819758539506e-07, + "loss": 0.0043, + "num_tokens": 42604818.0, + "reward": 1.2986111640930176, + "reward_std": 0.1608150601387024, + "rewards/reasoning_reward/mean": 1.298611044883728, + "rewards/reasoning_reward/std": 0.39311453700065613, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.5398138572905895, + "grad_norm": 1.8896709692670526, + "kl": 0.0458984375, + "learning_rate": 4.3923551841695885e-07, + "loss": 0.0018, + "num_tokens": 42684495.0, + "reward": 1.0416667461395264, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5408479834539814, + "grad_norm": 2.5348710252262485, + "kl": 0.048583984375, + "learning_rate": 4.376234806001665e-07, + "loss": 0.0019, + "num_tokens": 42762557.0, + "reward": 1.4375, + "reward_std": 0.08625819534063339, + "rewards/reasoning_reward/mean": 1.4375, + "rewards/reasoning_reward/std": 0.37044334411621094, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 152.0, + "completions/mean_terminated_length": 152.0, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5418821096173733, + "grad_norm": 4.4258462156938965, + "kl": 0.06298828125, + "learning_rate": 4.360121011496142e-07, + "loss": 0.0025, + "num_tokens": 42841621.0, + "reward": 0.6458333730697632, + "reward_std": 0.41873571276664734, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.580089271068573, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 165.2916717529297, + "completions/mean_terminated_length": 165.2916717529297, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.5429162357807652, + "grad_norm": 2.973908534715797, + "kl": 0.056640625, + "learning_rate": 4.344013970729489e-07, + "loss": 0.0023, + "num_tokens": 42926292.0, + "reward": 1.0208333730697632, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.7868369221687317, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 155.625, + "completions/mean_terminated_length": 155.625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.5439503619441571, + "grad_norm": 4.478416886750731, + "kl": 0.0732421875, + "learning_rate": 4.327913853706893e-07, + "loss": 0.0029, + "num_tokens": 43011243.0, + "reward": 1.375, + "reward_std": 0.39814266562461853, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 133.4166717529297, + "completions/mean_terminated_length": 133.4166717529297, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.5449844881075491, + "grad_norm": 0.24843813220604533, + "kl": 0.07666015625, + "learning_rate": 4.3118208303604635e-07, + "loss": 0.0031, + "num_tokens": 43096325.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.8340576887130737, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 148.875, + "completions/mean_terminated_length": 148.875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.546018614270941, + "grad_norm": 3.677049891807094, + "kl": 0.09423828125, + "learning_rate": 4.295735070547438e-07, + "loss": 0.0038, + "num_tokens": 43184346.0, + "reward": 0.9791666865348816, + "reward_std": 0.3438849151134491, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.8905196785926819, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 159.4166717529297, + "completions/mean_terminated_length": 159.4166717529297, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.5470527404343329, + "grad_norm": 3.3666791301046612, + "kl": 0.059326171875, + "learning_rate": 4.2796567440483904e-07, + "loss": 0.0024, + "num_tokens": 43270244.0, + "reward": 0.8125, + "reward_std": 0.3438849151134491, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.5277618765830994, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 162.08334350585938, + "completions/mean_terminated_length": 162.08334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5480868665977249, + "grad_norm": 4.015099024696647, + "kl": 0.07568359375, + "learning_rate": 4.263586020565436e-07, + "loss": 0.003, + "num_tokens": 43349494.0, + "reward": 0.6458333730697632, + "reward_std": 0.3310800790786743, + "rewards/reasoning_reward/mean": 0.6458333134651184, + "rewards/reasoning_reward/std": 0.5610387921333313, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 142.125, + "completions/mean_terminated_length": 142.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.5491209927611168, + "grad_norm": 1.6909730449484282, + "kl": 0.07861328125, + "learning_rate": 4.2475230697204446e-07, + "loss": 0.0032, + "num_tokens": 43439009.0, + "reward": 1.3125, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.4618605971336365, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 156.375, + "completions/mean_terminated_length": 156.375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.5501551189245087, + "grad_norm": 3.64263703695767, + "kl": 0.08056640625, + "learning_rate": 4.2314680610532445e-07, + "loss": 0.0032, + "num_tokens": 43528666.0, + "reward": 1.1666667461395264, + "reward_std": 0.23894576728343964, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 156.9166717529297, + "completions/mean_terminated_length": 156.9166717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.5511892450879007, + "grad_norm": 3.424907168401309, + "kl": 0.05859375, + "learning_rate": 4.2154211640198426e-07, + "loss": 0.0023, + "num_tokens": 43611552.0, + "reward": 0.7083333730697632, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 154.2916717529297, + "completions/mean_terminated_length": 154.2916717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5522233712512926, + "grad_norm": 3.695112863190559, + "kl": 0.1298828125, + "learning_rate": 4.199382547990625e-07, + "loss": 0.0052, + "num_tokens": 43690327.0, + "reward": 0.8333333730697632, + "reward_std": 0.38613972067832947, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 133.75, + "completions/mean_terminated_length": 133.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5532574974146846, + "grad_norm": 2.0790329349480148, + "kl": 0.047119140625, + "learning_rate": 4.1833523822485766e-07, + "loss": 0.0019, + "num_tokens": 43774529.0, + "reward": 1.1666667461395264, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 170.58334350585938, + "completions/mean_terminated_length": 170.58334350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5542916235780765, + "grad_norm": 2.323046338652214, + "kl": 0.0830078125, + "learning_rate": 4.167330835987489e-07, + "loss": 0.0033, + "num_tokens": 43852103.0, + "reward": 0.7291666865348816, + "reward_std": 0.19795583188533783, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.6251811385154724, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.5553257497414684, + "grad_norm": 4.195661413844518, + "kl": 0.08544921875, + "learning_rate": 4.1513180783101807e-07, + "loss": 0.0034, + "num_tokens": 43936347.0, + "reward": 1.1458333730697632, + "reward_std": 0.4177326560020447, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.580089271068573, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 149.9166717529297, + "completions/mean_terminated_length": 149.9166717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.5563598759048604, + "grad_norm": 4.132578751644871, + "kl": 0.07080078125, + "learning_rate": 4.135314278226708e-07, + "loss": 0.0028, + "num_tokens": 44017537.0, + "reward": 1.0833333730697632, + "reward_std": 0.41387641429901123, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.8030737638473511, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 132.5416717529297, + "completions/mean_terminated_length": 132.5416717529297, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5573940020682523, + "grad_norm": 6.445791655514141, + "kl": 0.0498046875, + "learning_rate": 4.119319604652583e-07, + "loss": 0.002, + "num_tokens": 44097166.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 145.83334350585938, + "completions/mean_terminated_length": 145.83334350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.5584281282316442, + "grad_norm": 3.455599464978524, + "kl": 0.09423828125, + "learning_rate": 4.1033342264069887e-07, + "loss": 0.0038, + "num_tokens": 44176954.0, + "reward": 1.2222222089767456, + "reward_std": 0.24338775873184204, + "rewards/reasoning_reward/mean": 1.2222222089767456, + "rewards/reasoning_reward/std": 0.47565528750419617, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 151.375, + "completions/mean_terminated_length": 151.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.5594622543950362, + "grad_norm": 31.684587209903366, + "kl": 0.333984375, + "learning_rate": 4.0873583122109986e-07, + "loss": 0.0134, + "num_tokens": 44265163.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5604963805584281, + "grad_norm": 2.58155802773976, + "kl": 0.061279296875, + "learning_rate": 4.071392030685799e-07, + "loss": 0.0025, + "num_tokens": 44344491.0, + "reward": 0.8958333730697632, + "reward_std": 0.19795583188533783, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.3605300188064575, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 177.08334350585938, + "completions/mean_terminated_length": 177.60870361328125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.56153050672182, + "grad_norm": 3.9169957578271273, + "kl": 0.057373046875, + "learning_rate": 4.055435550350903e-07, + "loss": 0.0023, + "num_tokens": 44430293.0, + "reward": 1.2986111640930176, + "reward_std": 0.34180140495300293, + "rewards/reasoning_reward/mean": 1.298611044883728, + "rewards/reasoning_reward/std": 0.46878182888031006, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 125.625, + "completions/mean_terminated_length": 125.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.562564632885212, + "grad_norm": 3.5379641241095476, + "kl": 0.049560546875, + "learning_rate": 4.039489039622376e-07, + "loss": 0.002, + "num_tokens": 44511572.0, + "reward": 0.7916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 136.7916717529297, + "completions/mean_terminated_length": 136.7916717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.5635987590486039, + "grad_norm": 2.7760862641762913, + "kl": 0.0576171875, + "learning_rate": 4.023552666811056e-07, + "loss": 0.0023, + "num_tokens": 44591183.0, + "reward": 0.6875, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.5067479610443115, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 156.70834350585938, + "completions/mean_terminated_length": 156.70834350585938, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.5646328852119958, + "grad_norm": 3.308306062539149, + "kl": 0.0439453125, + "learning_rate": 4.0076266001207796e-07, + "loss": 0.0018, + "num_tokens": 44672504.0, + "reward": 0.9791666865348816, + "reward_std": 0.3255884051322937, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.40322521328926086, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 109.91667175292969, + "completions/mean_terminated_length": 109.91667175292969, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.5656670113753878, + "grad_norm": 2.3476228247469417, + "kl": 0.0478515625, + "learning_rate": 3.9917110076466054e-07, + "loss": 0.0019, + "num_tokens": 44748886.0, + "reward": 0.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 139.375, + "completions/mean_terminated_length": 139.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.5667011375387797, + "grad_norm": 1.9607809752536416, + "kl": 0.04296875, + "learning_rate": 3.9758060573730376e-07, + "loss": 0.0017, + "num_tokens": 44826023.0, + "reward": 0.7916666865348816, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4871538281440735, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 136.20834350585938, + "completions/mean_terminated_length": 136.20834350585938, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5677352637021716, + "grad_norm": 4.07950906711803, + "kl": 0.057861328125, + "learning_rate": 3.9599119171722575e-07, + "loss": 0.0023, + "num_tokens": 44906724.0, + "reward": 0.8333333730697632, + "reward_std": 0.416355699300766, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.5450701713562012, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 157.875, + "completions/mean_terminated_length": 157.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.5687693898655636, + "grad_norm": 2.389180002667207, + "kl": 0.0634765625, + "learning_rate": 3.9440287548023484e-07, + "loss": 0.0025, + "num_tokens": 44984433.0, + "reward": 0.7083333730697632, + "reward_std": 0.07715167850255966, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 143.75, + "completions/mean_terminated_length": 143.75, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.5698035160289555, + "grad_norm": 3.4273893764803365, + "kl": 0.083984375, + "learning_rate": 3.928156737905525e-07, + "loss": 0.0034, + "num_tokens": 45067011.0, + "reward": 1.3333333730697632, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 146.6666717529297, + "completions/mean_terminated_length": 146.6666717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.5708376421923474, + "grad_norm": 3.5104041268905806, + "kl": 0.0830078125, + "learning_rate": 3.912296034006365e-07, + "loss": 0.0033, + "num_tokens": 45143987.0, + "reward": 1.1458333730697632, + "reward_std": 0.24999213218688965, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.3120467960834503, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 149.95834350585938, + "completions/mean_terminated_length": 149.95834350585938, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.5718717683557394, + "grad_norm": 0.29409247983573755, + "kl": 0.059814453125, + "learning_rate": 3.896446810510041e-07, + "loss": 0.0024, + "num_tokens": 45229122.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.5729058945191313, + "grad_norm": 3.9211078693164234, + "kl": 0.07275390625, + "learning_rate": 3.880609234700554e-07, + "loss": 0.0029, + "num_tokens": 45306684.0, + "reward": 0.25, + "reward_std": 0.43459486961364746, + "rewards/reasoning_reward/mean": 0.25, + "rewards/reasoning_reward/std": 0.41702884435653687, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 144.0, + "completions/mean_terminated_length": 144.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.5739400206825233, + "grad_norm": 12.819817865471965, + "kl": 0.080078125, + "learning_rate": 3.8647834737389637e-07, + "loss": 0.0032, + "num_tokens": 45389860.0, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 111.45833587646484, + "completions/mean_terminated_length": 111.45833587646484, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.5749741468459152, + "grad_norm": 2.5262171083580074, + "kl": 0.048583984375, + "learning_rate": 3.8489696946616334e-07, + "loss": 0.0019, + "num_tokens": 45469223.0, + "reward": 0.7083333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 158.625, + "completions/mean_terminated_length": 158.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.5760082730093071, + "grad_norm": 2.651554531736017, + "kl": 0.0595703125, + "learning_rate": 3.833168064378455e-07, + "loss": 0.0024, + "num_tokens": 45546182.0, + "reward": 0.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 119.25, + "completions/mean_terminated_length": 119.25, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.5770423991726991, + "grad_norm": 3.386660184100593, + "kl": 0.0625, + "learning_rate": 3.817378749671095e-07, + "loss": 0.0025, + "num_tokens": 45630044.0, + "reward": 1.0833333730697632, + "reward_std": 0.34503278136253357, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5835920572280884, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 171.08334350585938, + "completions/mean_terminated_length": 171.08334350585938, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.578076525336091, + "grad_norm": 3.757186136491232, + "kl": 0.0712890625, + "learning_rate": 3.801601917191237e-07, + "loss": 0.0029, + "num_tokens": 45714430.0, + "reward": 1.1666667461395264, + "reward_std": 0.34930619597435, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.6197241544723511, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.5791106514994829, + "grad_norm": 4.509762581052894, + "kl": 0.060302734375, + "learning_rate": 3.7858377334588127e-07, + "loss": 0.0024, + "num_tokens": 45800863.0, + "reward": 1.1041667461395264, + "reward_std": 0.30217814445495605, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.6590369343757629, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 169.9166717529297, + "completions/mean_terminated_length": 169.9166717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.5801447776628749, + "grad_norm": 4.1190032329728385, + "kl": 0.07421875, + "learning_rate": 3.7700863648602516e-07, + "loss": 0.003, + "num_tokens": 45882437.0, + "reward": 1.25, + "reward_std": 0.48957228660583496, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.7518094182014465, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 145.33334350585938, + "completions/mean_terminated_length": 145.33334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5811789038262668, + "grad_norm": 3.870893194153702, + "kl": 0.09912109375, + "learning_rate": 3.7543479776467244e-07, + "loss": 0.004, + "num_tokens": 45962437.0, + "reward": 0.6666666865348816, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.868114709854126, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.5822130299896587, + "grad_norm": 3.3202411092809654, + "kl": 0.09814453125, + "learning_rate": 3.7386227379323855e-07, + "loss": 0.0039, + "num_tokens": 46040004.0, + "reward": 0.1875, + "reward_std": 0.33108004927635193, + "rewards/reasoning_reward/mean": 0.1875, + "rewards/reasoning_reward/std": 0.4121128022670746, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 139.0, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.5832471561530507, + "grad_norm": 4.0347158309096764, + "kl": 0.0654296875, + "learning_rate": 3.7229108116926223e-07, + "loss": 0.0026, + "num_tokens": 46116284.0, + "reward": 0.9375, + "reward_std": 0.35970625281333923, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.7845311760902405, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.5842812823164426, + "grad_norm": 4.136783811313354, + "kl": 0.091796875, + "learning_rate": 3.707212364762301e-07, + "loss": 0.0037, + "num_tokens": 46194998.0, + "reward": 0.6875, + "reward_std": 0.5491421222686768, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.7634660601615906, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 148.2916717529297, + "completions/mean_terminated_length": 148.2916717529297, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.5853154084798345, + "grad_norm": 3.7478667802868526, + "kl": 0.07421875, + "learning_rate": 3.691527562834018e-07, + "loss": 0.003, + "num_tokens": 46278789.0, + "reward": 1.0416667461395264, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.6902530789375305, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 167.75, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.5863495346432265, + "grad_norm": 2.8655968955809286, + "kl": 0.08056640625, + "learning_rate": 3.6758565714563534e-07, + "loss": 0.0032, + "num_tokens": 46355567.0, + "reward": 1.0416667461395264, + "reward_std": 0.2985045611858368, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.8198179602622986, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 160.58334350585938, + "completions/mean_terminated_length": 160.58334350585938, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.5873836608066184, + "grad_norm": 4.048409611227816, + "kl": 0.06640625, + "learning_rate": 3.6601995560321164e-07, + "loss": 0.0027, + "num_tokens": 46435733.0, + "reward": 0.7291666865348816, + "reward_std": 0.47280198335647583, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.48854634165763855, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 159.9166717529297, + "completions/mean_terminated_length": 159.9166717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5884177869700103, + "grad_norm": 4.048728445965589, + "kl": 0.078125, + "learning_rate": 3.6445566818166075e-07, + "loss": 0.0031, + "num_tokens": 46518451.0, + "reward": 1.2916667461395264, + "reward_std": 0.3610706031322479, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 131.6666717529297, + "completions/mean_terminated_length": 131.6666717529297, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.5894519131334023, + "grad_norm": 3.9194927568414175, + "kl": 0.06298828125, + "learning_rate": 3.6289281139158685e-07, + "loss": 0.0025, + "num_tokens": 46601875.0, + "reward": 0.9583333730697632, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.8064504861831665, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 170.6666717529297, + "completions/mean_terminated_length": 170.6666717529297, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.5904860392967942, + "grad_norm": 3.375166233888406, + "kl": 0.08935546875, + "learning_rate": 3.613314017284943e-07, + "loss": 0.0036, + "num_tokens": 46677899.0, + "reward": 0.8541666865348816, + "reward_std": 0.23144195973873138, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.7442411184310913, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 139.95834350585938, + "completions/mean_terminated_length": 139.95834350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5915201654601862, + "grad_norm": 2.187391650107713, + "kl": 0.0478515625, + "learning_rate": 3.5977145567261355e-07, + "loss": 0.0019, + "num_tokens": 46755594.0, + "reward": 0.5, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.5, + "rewards/reasoning_reward/std": 0.5107539296150208, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 144.9166717529297, + "completions/mean_terminated_length": 144.9166717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.5925542916235781, + "grad_norm": 3.5297891429966186, + "kl": 0.0615234375, + "learning_rate": 3.5821298968872696e-07, + "loss": 0.0025, + "num_tokens": 46835112.0, + "reward": 0.9583333730697632, + "reward_std": 0.243839293718338, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.2917960286140442, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 145.0416717529297, + "completions/mean_terminated_length": 145.0416717529297, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.59358841778697, + "grad_norm": 3.362879841575184, + "kl": 0.0732421875, + "learning_rate": 3.566560202259951e-07, + "loss": 0.0029, + "num_tokens": 46911793.0, + "reward": 0.9375, + "reward_std": 0.3688412308692932, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.49590715765953064, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 116.29167175292969, + "completions/mean_terminated_length": 116.29167175292969, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.594622543950362, + "grad_norm": 2.429845277216557, + "kl": 0.0625, + "learning_rate": 3.5510056371778337e-07, + "loss": 0.0025, + "num_tokens": 46990464.0, + "reward": 1.125, + "reward_std": 0.14773420989513397, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.30395936965942383, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 162.83334350585938, + "completions/mean_terminated_length": 162.83334350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.5956566701137539, + "grad_norm": 2.922280213132429, + "kl": 0.0654296875, + "learning_rate": 3.5354663658148834e-07, + "loss": 0.0026, + "num_tokens": 47079588.0, + "reward": 1.1875, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.8945790529251099, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 143.20834350585938, + "completions/mean_terminated_length": 143.21739196777344, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.5966907962771458, + "grad_norm": 4.143240683081162, + "kl": 0.041259765625, + "learning_rate": 3.5199425521836445e-07, + "loss": 0.0017, + "num_tokens": 47157417.0, + "reward": 0.5833333730697632, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 140.4166717529297, + "completions/mean_terminated_length": 140.4166717529297, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5977249224405378, + "grad_norm": 1.9659319807373157, + "kl": 0.0634765625, + "learning_rate": 3.50443436013351e-07, + "loss": 0.0025, + "num_tokens": 47235395.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 123.33333587646484, + "completions/mean_terminated_length": 123.33333587646484, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.5987590486039297, + "grad_norm": 2.2199998500371265, + "kl": 0.06396484375, + "learning_rate": 3.4889419533489895e-07, + "loss": 0.0026, + "num_tokens": 47321179.0, + "reward": 1.2152777910232544, + "reward_std": 0.07534459978342056, + "rewards/reasoning_reward/mean": 1.2152777910232544, + "rewards/reasoning_reward/std": 0.33506491780281067, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 179.20834350585938, + "completions/mean_terminated_length": 179.20834350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.5997931747673216, + "grad_norm": 4.027929778275415, + "kl": 0.056640625, + "learning_rate": 3.4734654953479863e-07, + "loss": 0.0023, + "num_tokens": 47401472.0, + "reward": 0.6666666865348816, + "reward_std": 0.48678088188171387, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 145.2916717529297, + "completions/mean_terminated_length": 145.2916717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6008273009307136, + "grad_norm": 4.203672290672796, + "kl": 0.08349609375, + "learning_rate": 3.458005149480068e-07, + "loss": 0.0033, + "num_tokens": 47483511.0, + "reward": 0.9861111044883728, + "reward_std": 0.3927455544471741, + "rewards/reasoning_reward/mean": 0.9861111044883728, + "rewards/reasoning_reward/std": 0.6683251857757568, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.6018614270941055, + "grad_norm": 0.1514238170358154, + "kl": 0.042236328125, + "learning_rate": 3.4425610789247415e-07, + "loss": 0.0017, + "num_tokens": 47564375.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.6028955532574974, + "grad_norm": 3.123715432165474, + "kl": 0.06689453125, + "learning_rate": 3.4271334466897353e-07, + "loss": 0.0027, + "num_tokens": 47646583.0, + "reward": 1.1458333730697632, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.40322521328926086, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 156.70834350585938, + "completions/mean_terminated_length": 156.70834350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.6039296794208894, + "grad_norm": 3.5821953064727694, + "kl": 0.0439453125, + "learning_rate": 3.411722415609275e-07, + "loss": 0.0018, + "num_tokens": 47731776.0, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.47204458713531494, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 150.08334350585938, + "completions/mean_terminated_length": 150.08334350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.6049638055842813, + "grad_norm": 2.9553822886142007, + "kl": 0.046630859375, + "learning_rate": 3.396328148342366e-07, + "loss": 0.0019, + "num_tokens": 47811218.0, + "reward": 0.4791666865348816, + "reward_std": 0.31580695509910583, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.5413181781768799, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 136.4166717529297, + "completions/mean_terminated_length": 136.4166717529297, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.6059979317476732, + "grad_norm": 3.452553119930219, + "kl": 0.059326171875, + "learning_rate": 3.3809508073710754e-07, + "loss": 0.0024, + "num_tokens": 47899740.0, + "reward": 1.0833333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 180.4166717529297, + "completions/mean_terminated_length": 180.4166717529297, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.6070320579110652, + "grad_norm": 2.0970296040353067, + "kl": 0.052001953125, + "learning_rate": 3.365590554998819e-07, + "loss": 0.0021, + "num_tokens": 47978398.0, + "reward": 1.25, + "reward_std": 0.1259881556034088, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.41702884435653687, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 151.625, + "completions/mean_terminated_length": 147.7391357421875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.6080661840744571, + "grad_norm": 4.189419981041684, + "kl": 0.0546875, + "learning_rate": 3.350247553348647e-07, + "loss": 0.0022, + "num_tokens": 48060621.0, + "reward": 0.875, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 158.25, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.609100310237849, + "grad_norm": 3.5703666580841418, + "kl": 0.054443359375, + "learning_rate": 3.3349219643615344e-07, + "loss": 0.0022, + "num_tokens": 48139971.0, + "reward": 1.0833333730697632, + "reward_std": 0.27392348647117615, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.35098204016685486, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.610134436401241, + "grad_norm": 3.272644263091865, + "kl": 0.060791015625, + "learning_rate": 3.319613949794668e-07, + "loss": 0.0024, + "num_tokens": 48223079.0, + "reward": 0.6111111640930176, + "reward_std": 0.31463193893432617, + "rewards/reasoning_reward/mean": 0.6111111044883728, + "rewards/reasoning_reward/std": 0.48070666193962097, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 178.2916717529297, + "completions/mean_terminated_length": 178.2916717529297, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.6111685625646329, + "grad_norm": 3.8491229946452425, + "kl": 0.06884765625, + "learning_rate": 3.304323671219744e-07, + "loss": 0.0028, + "num_tokens": 48302822.0, + "reward": 0.9166666865348816, + "reward_std": 0.24602244794368744, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.7708455324172974, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 108.08333587646484, + "completions/mean_terminated_length": 108.08333587646484, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6122026887280249, + "grad_norm": 3.2825922300107773, + "kl": 0.052978515625, + "learning_rate": 3.2890512900212585e-07, + "loss": 0.0021, + "num_tokens": 48380512.0, + "reward": 0.5833333730697632, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 152.375, + "completions/mean_terminated_length": 152.375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.6132368148914168, + "grad_norm": 2.5916120299999803, + "kl": 0.053955078125, + "learning_rate": 3.273796967394809e-07, + "loss": 0.0022, + "num_tokens": 48465905.0, + "reward": 1.2083333730697632, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.6580052971839905, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.6142709410548087, + "grad_norm": 4.020983723037926, + "kl": 0.050048828125, + "learning_rate": 3.2585608643453867e-07, + "loss": 0.002, + "num_tokens": 48543945.0, + "reward": 0.9791666865348816, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.8139966726303101, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 133.6666717529297, + "completions/mean_terminated_length": 133.6666717529297, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.6153050672182007, + "grad_norm": 2.05299852274566, + "kl": 0.04052734375, + "learning_rate": 3.2433431416856816e-07, + "loss": 0.0016, + "num_tokens": 48622017.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 171.20834350585938, + "completions/mean_terminated_length": 171.20834350585938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.6163391933815926, + "grad_norm": 3.1066601718856584, + "kl": 0.087890625, + "learning_rate": 3.2281439600343835e-07, + "loss": 0.0035, + "num_tokens": 48704886.0, + "reward": 0.5833333730697632, + "reward_std": 0.21161314845085144, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.8427009582519531, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 165.2916717529297, + "completions/mean_terminated_length": 165.2916717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6173733195449845, + "grad_norm": 4.305444073516952, + "kl": 0.07763671875, + "learning_rate": 3.2129634798144885e-07, + "loss": 0.0031, + "num_tokens": 48787885.0, + "reward": 1.5625, + "reward_std": 0.37917613983154297, + "rewards/reasoning_reward/mean": 1.5625, + "rewards/reasoning_reward/std": 0.517361581325531, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 157.1666717529297, + "completions/mean_terminated_length": 157.1666717529297, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.6184074457083765, + "grad_norm": 3.1348740582949395, + "kl": 0.07470703125, + "learning_rate": 3.1978018612516024e-07, + "loss": 0.003, + "num_tokens": 48870625.0, + "reward": 0.7291666865348816, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.9438492059707642, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 137.45834350585938, + "completions/mean_terminated_length": 137.45834350585938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.6194415718717684, + "grad_norm": 3.57892132145044, + "kl": 0.0751953125, + "learning_rate": 3.182659264372254e-07, + "loss": 0.003, + "num_tokens": 48953788.0, + "reward": 1.1666667461395264, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.637022078037262, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 142.2916717529297, + "completions/mean_terminated_length": 142.2916717529297, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.6204756980351603, + "grad_norm": 4.301884451387064, + "kl": 0.0673828125, + "learning_rate": 3.1675358490022006e-07, + "loss": 0.0027, + "num_tokens": 49032971.0, + "reward": 0.375, + "reward_std": 0.48112308979034424, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 155.4166717529297, + "completions/mean_terminated_length": 155.4166717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.6215098241985523, + "grad_norm": 2.256674358684405, + "kl": 0.056884765625, + "learning_rate": 3.1524317747647487e-07, + "loss": 0.0023, + "num_tokens": 49112829.0, + "reward": 1.1805555820465088, + "reward_std": 0.12858611345291138, + "rewards/reasoning_reward/mean": 1.1805554628372192, + "rewards/reasoning_reward/std": 0.3366382122039795, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 154.95834350585938, + "completions/mean_terminated_length": 154.95834350585938, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.6225439503619442, + "grad_norm": 2.155514941763339, + "kl": 0.0771484375, + "learning_rate": 3.1373472010790613e-07, + "loss": 0.0031, + "num_tokens": 49189396.0, + "reward": 0.8333333730697632, + "reward_std": 0.267261266708374, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.5036101937294006, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.6235780765253361, + "grad_norm": 3.1569287636174064, + "kl": 0.0810546875, + "learning_rate": 3.122282287158479e-07, + "loss": 0.0032, + "num_tokens": 49267197.0, + "reward": 0.5625, + "reward_std": 0.2848889231681824, + "rewards/reasoning_reward/mean": 0.5625, + "rewards/reasoning_reward/std": 0.517361581325531, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 149.125, + "completions/mean_terminated_length": 149.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.6246122026887281, + "grad_norm": 3.5123089746645637, + "kl": 0.058349609375, + "learning_rate": 3.1072371920088393e-07, + "loss": 0.0023, + "num_tokens": 49352248.0, + "reward": 1.2083333730697632, + "reward_std": 0.243839293718338, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 164.95834350585938, + "completions/mean_terminated_length": 164.95834350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6256463288521199, + "grad_norm": 2.617225700730397, + "kl": 0.080078125, + "learning_rate": 3.092212074426799e-07, + "loss": 0.0032, + "num_tokens": 49430807.0, + "reward": 1.0625, + "reward_std": 0.08625819534063339, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.1689159870147705, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 159.70834350585938, + "completions/mean_terminated_length": 159.70834350585938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.6266804550155118, + "grad_norm": 3.775214071699908, + "kl": 0.07568359375, + "learning_rate": 3.0772070929981587e-07, + "loss": 0.003, + "num_tokens": 49520960.0, + "reward": 1.3125, + "reward_std": 0.36753225326538086, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.6222766637802124, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 166.20834350585938, + "completions/mean_terminated_length": 166.20834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.6277145811789038, + "grad_norm": 3.7546362665727453, + "kl": 0.0615234375, + "learning_rate": 3.062222406096183e-07, + "loss": 0.0025, + "num_tokens": 49602421.0, + "reward": 0.6041666865348816, + "reward_std": 0.4294546842575073, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.5893837213516235, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 110.54167175292969, + "completions/mean_terminated_length": 110.54167175292969, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.6287487073422957, + "grad_norm": 0.1828600047194931, + "kl": 0.039794921875, + "learning_rate": 3.047258171879939e-07, + "loss": 0.0016, + "num_tokens": 49682514.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 170.1666717529297, + "completions/mean_terminated_length": 170.1666717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.6297828335056876, + "grad_norm": 1.781481839446534, + "kl": 0.033447265625, + "learning_rate": 3.032314548292618e-07, + "loss": 0.0013, + "num_tokens": 49762454.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 145.1666717529297, + "completions/mean_terminated_length": 145.1666717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.6308169596690796, + "grad_norm": 3.893664449459787, + "kl": 0.057861328125, + "learning_rate": 3.0173916930598743e-07, + "loss": 0.0023, + "num_tokens": 49848010.0, + "reward": 1.0763888359069824, + "reward_std": 0.3868226408958435, + "rewards/reasoning_reward/mean": 1.0763888359069824, + "rewards/reasoning_reward/std": 0.6078773736953735, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 154.375, + "completions/mean_terminated_length": 154.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.6318510858324715, + "grad_norm": 2.91304378338177, + "kl": 0.07275390625, + "learning_rate": 3.0024897636881556e-07, + "loss": 0.0029, + "num_tokens": 49926275.0, + "reward": 0.8333333730697632, + "reward_std": 0.24966806173324585, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.458415687084198, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 160.33334350585938, + "completions/mean_terminated_length": 160.33334350585938, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.6328852119958635, + "grad_norm": 4.645327728337031, + "kl": 0.08935546875, + "learning_rate": 2.9876089174630465e-07, + "loss": 0.0036, + "num_tokens": 50008955.0, + "reward": 1.0625, + "reward_std": 0.3766257166862488, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 114.58333587646484, + "completions/mean_terminated_length": 114.58333587646484, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.6339193381592554, + "grad_norm": 2.3500928948101976, + "kl": 0.043701171875, + "learning_rate": 2.972749311447602e-07, + "loss": 0.0017, + "num_tokens": 50086817.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 144.33334350585938, + "completions/mean_terminated_length": 144.33334350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6349534643226473, + "grad_norm": 4.815665886134638, + "kl": 0.06689453125, + "learning_rate": 2.957911102480694e-07, + "loss": 0.0027, + "num_tokens": 50165777.0, + "reward": 0.7708333730697632, + "reward_std": 0.5201427936553955, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5512666702270508, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 150.7916717529297, + "completions/mean_terminated_length": 150.7916717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6359875904860393, + "grad_norm": 0.5275728635268186, + "kl": 0.04052734375, + "learning_rate": 2.943094447175356e-07, + "loss": 0.0016, + "num_tokens": 50248660.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 169.6666717529297, + "completions/mean_terminated_length": 169.6666717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.6370217166494312, + "grad_norm": 2.0323032948060398, + "kl": 0.058837890625, + "learning_rate": 2.9282995019171276e-07, + "loss": 0.0024, + "num_tokens": 50334548.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.8064504265785217, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 154.20834350585938, + "completions/mean_terminated_length": 154.20834350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.6380558428128231, + "grad_norm": 3.175018020932587, + "kl": 0.068359375, + "learning_rate": 2.9135264228624036e-07, + "loss": 0.0027, + "num_tokens": 50413561.0, + "reward": 1.0416667461395264, + "reward_std": 0.31925714015960693, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.417752206325531, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 153.6666717529297, + "completions/mean_terminated_length": 153.6666717529297, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.6390899689762151, + "grad_norm": 4.35022573120695, + "kl": 0.07275390625, + "learning_rate": 2.8987753659367884e-07, + "loss": 0.0029, + "num_tokens": 50488753.0, + "reward": 0.9375, + "reward_std": 0.24185511469841003, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.6645119190216064, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.640124095139607, + "grad_norm": 3.9637606696357435, + "kl": 0.064453125, + "learning_rate": 2.884046486833453e-07, + "loss": 0.0026, + "num_tokens": 50569367.0, + "reward": 1.2083333730697632, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.705824613571167, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 181.25, + "completions/mean_terminated_length": 181.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6411582213029989, + "grad_norm": 2.8397850153757553, + "kl": 0.060546875, + "learning_rate": 2.8693399410114793e-07, + "loss": 0.0024, + "num_tokens": 50649053.0, + "reward": 1.0694444179534912, + "reward_std": 0.2472916543483734, + "rewards/reasoning_reward/mean": 1.0694444179534912, + "rewards/reasoning_reward/std": 0.3506951928138733, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 176.33334350585938, + "completions/mean_terminated_length": 176.33334350585938, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.6421923474663909, + "grad_norm": 3.382273478222385, + "kl": 0.07080078125, + "learning_rate": 2.854655883694238e-07, + "loss": 0.0028, + "num_tokens": 50733821.0, + "reward": 0.7708333730697632, + "reward_std": 0.28302299976348877, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.416485458612442, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 172.2916717529297, + "completions/mean_terminated_length": 172.2916717529297, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.6432264736297828, + "grad_norm": 2.8244945361941944, + "kl": 0.06982421875, + "learning_rate": 2.83999446986773e-07, + "loss": 0.0028, + "num_tokens": 50815580.0, + "reward": 0.8958333730697632, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.7937139868736267, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 159.1666717529297, + "completions/mean_terminated_length": 159.1666717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6442605997931747, + "grad_norm": 3.773616346150942, + "kl": 0.08056640625, + "learning_rate": 2.82535585427897e-07, + "loss": 0.0032, + "num_tokens": 50897280.0, + "reward": 0.8958333730697632, + "reward_std": 0.33768826723098755, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.4418136179447174, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6452947259565667, + "grad_norm": 3.3186602952698903, + "kl": 0.07470703125, + "learning_rate": 2.8107401914343363e-07, + "loss": 0.003, + "num_tokens": 50980988.0, + "reward": 1.2708333730697632, + "reward_std": 0.33086174726486206, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.4418136179447174, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 148.4166717529297, + "completions/mean_terminated_length": 148.4166717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.6463288521199586, + "grad_norm": 4.081124219760009, + "kl": 0.06689453125, + "learning_rate": 2.796147635597954e-07, + "loss": 0.0027, + "num_tokens": 51065942.0, + "reward": 1.4583333730697632, + "reward_std": 0.36124157905578613, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 155.5416717529297, + "completions/mean_terminated_length": 155.5416717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.6473629782833505, + "grad_norm": 4.171951829209572, + "kl": 0.07373046875, + "learning_rate": 2.781578340790053e-07, + "loss": 0.003, + "num_tokens": 51148627.0, + "reward": 1.0833333730697632, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.6863049864768982, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 175.4166717529297, + "completions/mean_terminated_length": 175.4166717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.6483971044467425, + "grad_norm": 3.816589069723601, + "kl": 0.07763671875, + "learning_rate": 2.767032460785356e-07, + "loss": 0.0031, + "num_tokens": 51238597.0, + "reward": 1.0208333730697632, + "reward_std": 0.42099714279174805, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.6164266467094421, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 153.625, + "completions/mean_terminated_length": 153.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6494312306101344, + "grad_norm": 1.955866878147104, + "kl": 0.05078125, + "learning_rate": 2.752510149111449e-07, + "loss": 0.002, + "num_tokens": 51316820.0, + "reward": 1.1875, + "reward_std": 0.13908717036247253, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.355469673871994, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 150.58334350585938, + "completions/mean_terminated_length": 150.58334350585938, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.6504653567735263, + "grad_norm": 3.0788457748288915, + "kl": 0.06396484375, + "learning_rate": 2.738011559047155e-07, + "loss": 0.0026, + "num_tokens": 51402106.0, + "reward": 0.8541666865348816, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.5985338687896729, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 162.20834350585938, + "completions/mean_terminated_length": 162.20834350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6514994829369183, + "grad_norm": 3.631670896990646, + "kl": 0.064453125, + "learning_rate": 2.723536843620931e-07, + "loss": 0.0026, + "num_tokens": 51486543.0, + "reward": 1.0208333730697632, + "reward_std": 0.4769924581050873, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.6833289265632629, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 168.5, + "completions/mean_terminated_length": 168.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.6525336091003102, + "grad_norm": 3.121087345839574, + "kl": 0.07275390625, + "learning_rate": 2.7090861556092347e-07, + "loss": 0.0029, + "num_tokens": 51570619.0, + "reward": 0.9236111044883728, + "reward_std": 0.18046043813228607, + "rewards/reasoning_reward/mean": 0.9236111044883728, + "rewards/reasoning_reward/std": 0.7239504456520081, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 173.33334350585938, + "completions/mean_terminated_length": 173.33334350585938, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.6535677352637022, + "grad_norm": 4.0132486156921, + "kl": 0.07275390625, + "learning_rate": 2.6946596475349305e-07, + "loss": 0.0029, + "num_tokens": 51655891.0, + "reward": 1.125, + "reward_std": 0.40037286281585693, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.5299029350280762, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 193.45834350585938, + "completions/mean_terminated_length": 193.45834350585938, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6546018614270941, + "grad_norm": 3.686435293590089, + "kl": 0.0712890625, + "learning_rate": 2.680257471665661e-07, + "loss": 0.0029, + "num_tokens": 51745486.0, + "reward": 1.25, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.4662523865699768, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 154.08334350585938, + "completions/mean_terminated_length": 154.08334350585938, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.655635987590486, + "grad_norm": 3.142110321096429, + "kl": 0.058837890625, + "learning_rate": 2.66587978001226e-07, + "loss": 0.0024, + "num_tokens": 51829440.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.656670113753878, + "grad_norm": 4.105530331463831, + "kl": 0.09521484375, + "learning_rate": 2.651526724327127e-07, + "loss": 0.0038, + "num_tokens": 51906116.0, + "reward": 1.0, + "reward_std": 0.4981178641319275, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.5316095352172852, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 130.1666717529297, + "completions/mean_terminated_length": 130.1666717529297, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.6577042399172699, + "grad_norm": 3.203427593680448, + "kl": 0.0537109375, + "learning_rate": 2.6371984561026416e-07, + "loss": 0.0021, + "num_tokens": 51985168.0, + "reward": 0.9861111044883728, + "reward_std": 0.29100531339645386, + "rewards/reasoning_reward/mean": 0.9861111044883728, + "rewards/reasoning_reward/std": 0.35751646757125854, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.6587383660806618, + "grad_norm": 4.913600177708958, + "kl": 0.06396484375, + "learning_rate": 2.622895126569562e-07, + "loss": 0.0026, + "num_tokens": 52063404.0, + "reward": 0.7916666865348816, + "reward_std": 0.4082186818122864, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 174.1666717529297, + "completions/mean_terminated_length": 174.1666717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6597724922440538, + "grad_norm": 2.4339757197483873, + "kl": 0.058837890625, + "learning_rate": 2.6086168866954175e-07, + "loss": 0.0024, + "num_tokens": 52148776.0, + "reward": 1.1666667461395264, + "reward_std": 0.08908708393573761, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 128.25, + "completions/mean_terminated_length": 128.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.6608066184074457, + "grad_norm": 2.3656571936353594, + "kl": 0.072265625, + "learning_rate": 2.5943638871829296e-07, + "loss": 0.0029, + "num_tokens": 52232894.0, + "reward": 1.0416667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.8064504265785217, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 154.0416717529297, + "completions/mean_terminated_length": 154.0416717529297, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6618407445708376, + "grad_norm": 2.370538964682846, + "kl": 0.060302734375, + "learning_rate": 2.5801362784684104e-07, + "loss": 0.0024, + "num_tokens": 52311279.0, + "reward": 1.0, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.39009472727775574, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 136.70834350585938, + "completions/mean_terminated_length": 136.70834350585938, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.6628748707342296, + "grad_norm": 4.307337721384269, + "kl": 0.0693359375, + "learning_rate": 2.5659342107201857e-07, + "loss": 0.0028, + "num_tokens": 52395696.0, + "reward": 1.1041667461395264, + "reward_std": 0.4240165948867798, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.642332136631012, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 126.04167175292969, + "completions/mean_terminated_length": 126.04167175292969, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.6639089968976215, + "grad_norm": 3.4267578238060703, + "kl": 0.047119140625, + "learning_rate": 2.551757833836996e-07, + "loss": 0.0019, + "num_tokens": 52477881.0, + "reward": 0.5416666865348816, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 150.125, + "completions/mean_terminated_length": 150.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.6649431230610134, + "grad_norm": 3.0652745827085965, + "kl": 0.06640625, + "learning_rate": 2.537607297446428e-07, + "loss": 0.0027, + "num_tokens": 52567396.0, + "reward": 1.4583333730697632, + "reward_std": 0.1451837718486786, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.3877657949924469, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.6659772492244054, + "grad_norm": 4.114200525159867, + "kl": 0.0546875, + "learning_rate": 2.5234827509033294e-07, + "loss": 0.0022, + "num_tokens": 52643658.0, + "reward": 1.1666667461395264, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 162.95834350585938, + "completions/mean_terminated_length": 162.95834350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.6670113753877973, + "grad_norm": 4.283824327880422, + "kl": 0.061279296875, + "learning_rate": 2.509384343288227e-07, + "loss": 0.0024, + "num_tokens": 52728177.0, + "reward": 1.2291667461395264, + "reward_std": 0.35970625281333923, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.8720186948776245, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 112.29167175292969, + "completions/mean_terminated_length": 112.29167175292969, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.6680455015511892, + "grad_norm": 2.5631822617842257, + "kl": 0.0517578125, + "learning_rate": 2.495312223405766e-07, + "loss": 0.0021, + "num_tokens": 52805880.0, + "reward": 0.5416666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.6690796277145812, + "grad_norm": 3.1846394660646826, + "kl": 0.0693359375, + "learning_rate": 2.4812665397831243e-07, + "loss": 0.0028, + "num_tokens": 52888628.0, + "reward": 1.1597222089767456, + "reward_std": 0.2325192391872406, + "rewards/reasoning_reward/mean": 1.1597222089767456, + "rewards/reasoning_reward/std": 0.5118857026100159, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 157.95834350585938, + "completions/mean_terminated_length": 157.95834350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.6701137538779731, + "grad_norm": 2.9464142757577507, + "kl": 0.0771484375, + "learning_rate": 2.467247440668462e-07, + "loss": 0.0031, + "num_tokens": 52966915.0, + "reward": 1.3333333730697632, + "reward_std": 0.28029152750968933, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4584156572818756, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.671147880041365, + "grad_norm": 3.492845045813914, + "kl": 0.06689453125, + "learning_rate": 2.453255074029336e-07, + "loss": 0.0027, + "num_tokens": 53052506.0, + "reward": 1.0625, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.3061862289905548, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 165.58334350585938, + "completions/mean_terminated_length": 165.58334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.672182006204757, + "grad_norm": 3.471526846228643, + "kl": 0.06494140625, + "learning_rate": 2.4392895875511613e-07, + "loss": 0.0026, + "num_tokens": 53128112.0, + "reward": 0.25, + "reward_std": 0.30416232347488403, + "rewards/reasoning_reward/mean": 0.25, + "rewards/reasoning_reward/std": 0.41702884435653687, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 172.58334350585938, + "completions/mean_terminated_length": 172.58334350585938, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.6732161323681489, + "grad_norm": 3.4426169831993034, + "kl": 0.06787109375, + "learning_rate": 2.425351128635632e-07, + "loss": 0.0027, + "num_tokens": 53211558.0, + "reward": 1.5, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.5, + "rewards/reasoning_reward/std": 0.6593804359436035, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 151.5, + "completions/mean_terminated_length": 151.5, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.6742502585315409, + "grad_norm": 2.6977128922942453, + "kl": 0.06787109375, + "learning_rate": 2.411439844399177e-07, + "loss": 0.0027, + "num_tokens": 53290618.0, + "reward": 0.8541666865348816, + "reward_std": 0.18766528367996216, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.3753018081188202, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 135.70834350585938, + "completions/mean_terminated_length": 135.70834350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6752843846949328, + "grad_norm": 4.273684646889255, + "kl": 0.06396484375, + "learning_rate": 2.3975558816714073e-07, + "loss": 0.0026, + "num_tokens": 53366675.0, + "reward": 1.0, + "reward_std": 0.28029152750968933, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.39009472727775574, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 175.58334350585938, + "completions/mean_terminated_length": 178.56521606445312, + "completions/min_length": 107.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.6763185108583247, + "grad_norm": 3.922489743480712, + "kl": 0.08544921875, + "learning_rate": 2.383699386993557e-07, + "loss": 0.0034, + "num_tokens": 53451353.0, + "reward": 1.1111111640930176, + "reward_std": 0.4623008966445923, + "rewards/reasoning_reward/mean": 1.111111044883728, + "rewards/reasoning_reward/std": 0.647676408290863, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 177.2916717529297, + "completions/mean_terminated_length": 177.2916717529297, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.6773526370217167, + "grad_norm": 3.720452216088832, + "kl": 0.08984375, + "learning_rate": 2.3698705066169483e-07, + "loss": 0.0036, + "num_tokens": 53539944.0, + "reward": 0.6666666865348816, + "reward_std": 0.6266384720802307, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.6538625359535217, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 160.70834350585938, + "completions/mean_terminated_length": 160.70834350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.6783867631851086, + "grad_norm": 13.755429616291899, + "kl": 0.310546875, + "learning_rate": 2.356069386501438e-07, + "loss": 0.0125, + "num_tokens": 53618241.0, + "reward": 1.2291667461395264, + "reward_std": 0.21322892606258392, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.5893837213516235, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 147.58334350585938, + "completions/mean_terminated_length": 147.58334350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.6794208893485005, + "grad_norm": 3.308998342120999, + "kl": 0.0732421875, + "learning_rate": 2.342296172313886e-07, + "loss": 0.0029, + "num_tokens": 53706335.0, + "reward": 1.4722222089767456, + "reward_std": 0.2105759084224701, + "rewards/reasoning_reward/mean": 1.4722222089767456, + "rewards/reasoning_reward/std": 0.4441424012184143, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 160.6666717529297, + "completions/mean_terminated_length": 160.6666717529297, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6804550155118925, + "grad_norm": 2.0128009083781153, + "kl": 0.07666015625, + "learning_rate": 2.3285510094266087e-07, + "loss": 0.0031, + "num_tokens": 53784703.0, + "reward": 1.1875, + "reward_std": 0.10681166499853134, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.3234494626522064, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 144.875, + "completions/mean_terminated_length": 144.875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.6814891416752844, + "grad_norm": 2.9811727952229545, + "kl": 0.06591796875, + "learning_rate": 2.3148340429158526e-07, + "loss": 0.0027, + "num_tokens": 53870556.0, + "reward": 1.0416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 154.2916717529297, + "completions/mean_terminated_length": 154.2916717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6825232678386763, + "grad_norm": 3.6867423359764477, + "kl": 0.040283203125, + "learning_rate": 2.3011454175602558e-07, + "loss": 0.0016, + "num_tokens": 53949691.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 221.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 156.875, + "completions/mean_terminated_length": 154.0869598388672, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.6835573940020683, + "grad_norm": 3.4323592527418394, + "kl": 0.111328125, + "learning_rate": 2.2874852778393266e-07, + "loss": 0.0045, + "num_tokens": 54037808.0, + "reward": 0.9583333730697632, + "reward_std": 0.2985045611858368, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 143.4166717529297, + "completions/mean_terminated_length": 139.95652770996094, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.6845915201654602, + "grad_norm": 3.3556977958017895, + "kl": 0.07958984375, + "learning_rate": 2.273853767931918e-07, + "loss": 0.0032, + "num_tokens": 54116794.0, + "reward": 1.0416667461395264, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.5500329732894897, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.6856256463288521, + "grad_norm": 2.337364555066446, + "kl": 0.049072265625, + "learning_rate": 2.2602510317146956e-07, + "loss": 0.002, + "num_tokens": 54201880.0, + "reward": 1.125, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.7408866882324219, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 154.83334350585938, + "completions/mean_terminated_length": 154.83334350585938, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.6866597724922441, + "grad_norm": 3.4959986502490086, + "kl": 0.09521484375, + "learning_rate": 2.246677212760636e-07, + "loss": 0.0038, + "num_tokens": 54290276.0, + "reward": 1.4930555820465088, + "reward_std": 0.35425281524658203, + "rewards/reasoning_reward/mean": 1.4930554628372192, + "rewards/reasoning_reward/std": 0.7010673880577087, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.687693898655636, + "grad_norm": 3.6836605606951633, + "kl": 0.07763671875, + "learning_rate": 2.233132454337494e-07, + "loss": 0.0031, + "num_tokens": 54368369.0, + "reward": 0.9166666865348816, + "reward_std": 0.41387641429901123, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.524749755859375, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 141.70834350585938, + "completions/mean_terminated_length": 141.70834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.688728024819028, + "grad_norm": 3.8448609470179624, + "kl": 0.07275390625, + "learning_rate": 2.2196168994063075e-07, + "loss": 0.0029, + "num_tokens": 54446450.0, + "reward": 0.8541666865348816, + "reward_std": 0.23709973692893982, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.40322521328926086, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 181.58334350585938, + "completions/mean_terminated_length": 181.58334350585938, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.6897621509824199, + "grad_norm": 3.2290937591662097, + "kl": 0.08544921875, + "learning_rate": 2.2061306906198707e-07, + "loss": 0.0034, + "num_tokens": 54530368.0, + "reward": 1.3541667461395264, + "reward_std": 0.27830731868743896, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.453948050737381, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 184.33334350585938, + "completions/mean_terminated_length": 184.33334350585938, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.6907962771458118, + "grad_norm": 3.9502785850753477, + "kl": 0.08154296875, + "learning_rate": 2.1926739703212472e-07, + "loss": 0.0033, + "num_tokens": 54613976.0, + "reward": 0.9375, + "reward_std": 0.34602540731430054, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.6806725859642029, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 171.9166717529297, + "completions/mean_terminated_length": 171.9166717529297, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.6918304033092038, + "grad_norm": 2.466267818183153, + "kl": 0.056884765625, + "learning_rate": 2.1792468805422487e-07, + "loss": 0.0023, + "num_tokens": 54695686.0, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.3061862289905548, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 149.1666717529297, + "completions/mean_terminated_length": 149.1666717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.6928645294725957, + "grad_norm": 3.9637637220123643, + "kl": 0.08447265625, + "learning_rate": 2.1658495630019518e-07, + "loss": 0.0034, + "num_tokens": 54782322.0, + "reward": 1.3680555820465088, + "reward_std": 0.36238986253738403, + "rewards/reasoning_reward/mean": 1.3680554628372192, + "rewards/reasoning_reward/std": 0.5059528946876526, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 176.25, + "completions/mean_terminated_length": 176.25, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.6938986556359876, + "grad_norm": 3.8385551187680393, + "kl": 0.0771484375, + "learning_rate": 2.152482159105196e-07, + "loss": 0.0031, + "num_tokens": 54865144.0, + "reward": 0.930555522441864, + "reward_std": 0.4847185015678406, + "rewards/reasoning_reward/mean": 0.930555522441864, + "rewards/reasoning_reward/std": 0.8411469459533691, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 173.33334350585938, + "completions/mean_terminated_length": 173.33334350585938, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.6949327817993796, + "grad_norm": 3.164139243922661, + "kl": 0.0791015625, + "learning_rate": 2.1391448099410853e-07, + "loss": 0.0032, + "num_tokens": 54948368.0, + "reward": 1.1597222089767456, + "reward_std": 0.33324792981147766, + "rewards/reasoning_reward/mean": 1.1597222089767456, + "rewards/reasoning_reward/std": 0.5118856430053711, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 147.875, + "completions/mean_terminated_length": 147.875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6959669079627715, + "grad_norm": 3.4067201589070613, + "kl": 0.054443359375, + "learning_rate": 2.1258376562815112e-07, + "loss": 0.0022, + "num_tokens": 55029237.0, + "reward": 1.0833333730697632, + "reward_std": 0.32025060057640076, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 162.58334350585938, + "completions/mean_terminated_length": 162.58334350585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.6970010341261634, + "grad_norm": 2.5831577803082353, + "kl": 0.07275390625, + "learning_rate": 2.112560838579653e-07, + "loss": 0.0029, + "num_tokens": 55120027.0, + "reward": 1.5763888359069824, + "reward_std": 0.1376926600933075, + "rewards/reasoning_reward/mean": 1.5763888359069824, + "rewards/reasoning_reward/std": 0.45038676261901855, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 172.4166717529297, + "completions/mean_terminated_length": 172.4166717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.6980351602895554, + "grad_norm": 3.9629234668552953, + "kl": 0.06640625, + "learning_rate": 2.0993144969685106e-07, + "loss": 0.0027, + "num_tokens": 55195797.0, + "reward": 0.8333333730697632, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.6863049864768982, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 166.375, + "completions/mean_terminated_length": 166.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.6990692864529473, + "grad_norm": 2.8177613643424504, + "kl": 0.09228515625, + "learning_rate": 2.0860987712594103e-07, + "loss": 0.0037, + "num_tokens": 55284382.0, + "reward": 1.2708333730697632, + "reward_std": 0.20249390602111816, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.8509904742240906, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 151.70834350585938, + "completions/mean_terminated_length": 151.70834350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7001034126163392, + "grad_norm": 0.17490723954585877, + "kl": 0.0546875, + "learning_rate": 2.0729138009405417e-07, + "loss": 0.0022, + "num_tokens": 55363807.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 151.70834350585938, + "completions/mean_terminated_length": 151.70834350585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.7011375387797312, + "grad_norm": 2.1768886150741418, + "kl": 0.07861328125, + "learning_rate": 2.05975972517548e-07, + "loss": 0.0031, + "num_tokens": 55452096.0, + "reward": 1.6458333730697632, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.6458333730697632, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 125.66667175292969, + "completions/mean_terminated_length": 125.66667175292969, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.7021716649431231, + "grad_norm": 3.6068735599469863, + "kl": 0.044189453125, + "learning_rate": 2.0466366828017113e-07, + "loss": 0.0018, + "num_tokens": 55538384.0, + "reward": 1.2083333730697632, + "reward_std": 0.20693820714950562, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 156.375, + "completions/mean_terminated_length": 156.375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.703205791106515, + "grad_norm": 3.5043504652823727, + "kl": 0.08251953125, + "learning_rate": 2.033544812329181e-07, + "loss": 0.0033, + "num_tokens": 55626745.0, + "reward": 1.3541667461395264, + "reward_std": 0.32196044921875, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.5800893306732178, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 143.875, + "completions/mean_terminated_length": 143.875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.704239917269907, + "grad_norm": 2.9250034907846185, + "kl": 0.076171875, + "learning_rate": 2.020484251938817e-07, + "loss": 0.0031, + "num_tokens": 55712742.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.8998792171478271, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 139.5416717529297, + "completions/mean_terminated_length": 139.5416717529297, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.7052740434332989, + "grad_norm": 2.899108027116337, + "kl": 0.0595703125, + "learning_rate": 2.007455139481085e-07, + "loss": 0.0024, + "num_tokens": 55792819.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 157.70834350585938, + "completions/mean_terminated_length": 157.70834350585938, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.7063081695966908, + "grad_norm": 0.19921601907261913, + "kl": 0.055908203125, + "learning_rate": 1.9944576124745205e-07, + "loss": 0.0022, + "num_tokens": 55869540.0, + "reward": 1.1666667461395264, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.24077169597148895, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 183.33334350585938, + "completions/mean_terminated_length": 183.33334350585938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.7073422957600828, + "grad_norm": 3.8943822086746436, + "kl": 0.0791015625, + "learning_rate": 1.9814918081042887e-07, + "loss": 0.0032, + "num_tokens": 55948236.0, + "reward": 1.0625, + "reward_std": 0.3584126830101013, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.42509588599205017, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 161.4166717529297, + "completions/mean_terminated_length": 161.4166717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7083764219234746, + "grad_norm": 2.3459301471771914, + "kl": 0.06591796875, + "learning_rate": 1.9685578632207268e-07, + "loss": 0.0026, + "num_tokens": 56033366.0, + "reward": 0.7916666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.9315329194068909, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 142.20834350585938, + "completions/mean_terminated_length": 142.20834350585938, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7094105480868665, + "grad_norm": 3.786217018380023, + "kl": 0.060302734375, + "learning_rate": 1.9556559143379097e-07, + "loss": 0.0024, + "num_tokens": 56110987.0, + "reward": 0.875, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 174.875, + "completions/mean_terminated_length": 174.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7104446742502585, + "grad_norm": 3.277700175814856, + "kl": 0.0634765625, + "learning_rate": 1.9427860976321996e-07, + "loss": 0.0025, + "num_tokens": 56191280.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 139.70834350585938, + "completions/mean_terminated_length": 139.70834350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.7114788004136504, + "grad_norm": 2.2482590984095934, + "kl": 0.0830078125, + "learning_rate": 1.9299485489408125e-07, + "loss": 0.0033, + "num_tokens": 56275553.0, + "reward": 0.9722222089767456, + "reward_std": 0.0514344647526741, + "rewards/reasoning_reward/mean": 0.9722221493721008, + "rewards/reasoning_reward/std": 0.8040757775306702, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 168.4166717529297, + "completions/mean_terminated_length": 168.4166717529297, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.7125129265770423, + "grad_norm": 1.952148524225847, + "kl": 0.06982421875, + "learning_rate": 1.9171434037603883e-07, + "loss": 0.0028, + "num_tokens": 56354091.0, + "reward": 1.2013888359069824, + "reward_std": 0.058925554156303406, + "rewards/reasoning_reward/mean": 1.2013888359069824, + "rewards/reasoning_reward/std": 0.3068428933620453, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 151.1666717529297, + "completions/mean_terminated_length": 151.1666717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.7135470527404343, + "grad_norm": 3.0579537299955377, + "kl": 0.06591796875, + "learning_rate": 1.9043707972455537e-07, + "loss": 0.0026, + "num_tokens": 56437463.0, + "reward": 1.0, + "reward_std": 0.31100785732269287, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.6370220184326172, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.7145811789038262, + "grad_norm": 2.1711645432342626, + "kl": 0.12158203125, + "learning_rate": 1.8916308642075007e-07, + "loss": 0.0049, + "num_tokens": 56525892.0, + "reward": 1.8125, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.8125, + "rewards/reasoning_reward/std": 0.28788962960243225, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 162.75, + "completions/mean_terminated_length": 162.75, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.7156153050672182, + "grad_norm": 2.18124506294311, + "kl": 0.10888671875, + "learning_rate": 1.8789237391125644e-07, + "loss": 0.0044, + "num_tokens": 56620214.0, + "reward": 1.375, + "reward_std": 0.21362332999706268, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.5757792592048645, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 141.5416717529297, + "completions/mean_terminated_length": 141.5416717529297, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.7166494312306101, + "grad_norm": 2.1892743478043566, + "kl": 0.06689453125, + "learning_rate": 1.8662495560807957e-07, + "loss": 0.0027, + "num_tokens": 56702419.0, + "reward": 1.1875, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.28788962960243225, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 165.2916717529297, + "completions/mean_terminated_length": 165.2916717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.717683557394002, + "grad_norm": 4.212537614597707, + "kl": 0.09326171875, + "learning_rate": 1.8536084488845583e-07, + "loss": 0.0037, + "num_tokens": 56779346.0, + "reward": 0.5625, + "reward_std": 0.37478944659233093, + "rewards/reasoning_reward/mean": 0.5625, + "rewards/reasoning_reward/std": 0.7270025014877319, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 177.7916717529297, + "completions/mean_terminated_length": 177.7916717529297, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.718717683557394, + "grad_norm": 2.0102770916091472, + "kl": 0.056396484375, + "learning_rate": 1.8410005509471028e-07, + "loss": 0.0023, + "num_tokens": 56860157.0, + "reward": 1.0625, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.2242136001586914, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 130.375, + "completions/mean_terminated_length": 130.375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.7197518097207859, + "grad_norm": 2.329478391630926, + "kl": 0.0439453125, + "learning_rate": 1.828425995341173e-07, + "loss": 0.0018, + "num_tokens": 56939902.0, + "reward": 0.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.7207859358841778, + "grad_norm": 3.1877025746249577, + "kl": 0.055908203125, + "learning_rate": 1.815884914787587e-07, + "loss": 0.0022, + "num_tokens": 57024539.0, + "reward": 1.0208333730697632, + "reward_std": 0.24056154489517212, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.3753018081188202, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 159.7916717529297, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.7218200620475698, + "grad_norm": 3.3274373464580114, + "kl": 0.055908203125, + "learning_rate": 1.80337744165385e-07, + "loss": 0.0022, + "num_tokens": 57108438.0, + "reward": 0.6041666865348816, + "reward_std": 0.3803371787071228, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.642332136631012, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 144.2916717529297, + "completions/mean_terminated_length": 144.2916717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.7228541882109617, + "grad_norm": 0.20006012207095064, + "kl": 0.07568359375, + "learning_rate": 1.7909037079527433e-07, + "loss": 0.003, + "num_tokens": 57192853.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.9630867838859558, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 146.83334350585938, + "completions/mean_terminated_length": 146.83334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7238883143743536, + "grad_norm": 3.3580145045189655, + "kl": 0.07666015625, + "learning_rate": 1.7784638453409451e-07, + "loss": 0.0031, + "num_tokens": 57271785.0, + "reward": 1.0555555820465088, + "reward_std": 0.3299504518508911, + "rewards/reasoning_reward/mean": 1.0555555820465088, + "rewards/reasoning_reward/std": 0.4189550578594208, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 140.95834350585938, + "completions/mean_terminated_length": 140.95834350585938, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.7249224405377456, + "grad_norm": 0.22405126131157563, + "kl": 0.059814453125, + "learning_rate": 1.7660579851176317e-07, + "loss": 0.0024, + "num_tokens": 57347536.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 315.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 174.6521759033203, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.7259565667011375, + "grad_norm": 2.9389018302570014, + "kl": 0.055419921875, + "learning_rate": 1.7536862582230893e-07, + "loss": 0.0022, + "num_tokens": 57430948.0, + "reward": 1.5833333730697632, + "reward_std": 0.27392348647117615, + "rewards/reasoning_reward/mean": 1.5833333730697632, + "rewards/reasoning_reward/std": 0.4584156572818756, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 178.0416717529297, + "completions/mean_terminated_length": 174.60870361328125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7269906928645294, + "grad_norm": 2.282134417830866, + "kl": 0.0576171875, + "learning_rate": 1.7413487952373455e-07, + "loss": 0.0023, + "num_tokens": 57508453.0, + "reward": 1.0833333730697632, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 184.58334350585938, + "completions/mean_terminated_length": 181.69564819335938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.7280248190279214, + "grad_norm": 4.1836131721858765, + "kl": 0.08935546875, + "learning_rate": 1.7290457263787728e-07, + "loss": 0.0036, + "num_tokens": 57593995.0, + "reward": 1.3125, + "reward_std": 0.3492930829524994, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.7775728702545166, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 166.70834350585938, + "completions/mean_terminated_length": 166.70834350585938, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.7290589451913133, + "grad_norm": 3.165264879865031, + "kl": 0.058837890625, + "learning_rate": 1.7167771815027317e-07, + "loss": 0.0024, + "num_tokens": 57672700.0, + "reward": 0.6041666865348816, + "reward_std": 0.30217814445495605, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.6590369343757629, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 156.625, + "completions/mean_terminated_length": 156.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.7300930713547052, + "grad_norm": 2.672806554030225, + "kl": 0.054931640625, + "learning_rate": 1.7045432901001844e-07, + "loss": 0.0022, + "num_tokens": 57751843.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 136.70834350585938, + "completions/mean_terminated_length": 136.70834350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7311271975180972, + "grad_norm": 3.035864140075777, + "kl": 0.07080078125, + "learning_rate": 1.6923441812963434e-07, + "loss": 0.0028, + "num_tokens": 57839420.0, + "reward": 1.3333333730697632, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 162.95834350585938, + "completions/mean_terminated_length": 162.95834350585938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.7321613236814891, + "grad_norm": 2.6352705999114425, + "kl": 0.07861328125, + "learning_rate": 1.6801799838492942e-07, + "loss": 0.0032, + "num_tokens": 57917803.0, + "reward": 0.7083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 177.25, + "completions/mean_terminated_length": 177.25, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.733195449844881, + "grad_norm": 2.839232223213205, + "kl": 0.07666015625, + "learning_rate": 1.6680508261486465e-07, + "loss": 0.0031, + "num_tokens": 58001793.0, + "reward": 0.7083333730697632, + "reward_std": 0.33034375309944153, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.7649476528167725, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 166.20834350585938, + "completions/mean_terminated_length": 166.20834350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.734229576008273, + "grad_norm": 3.7820274298450687, + "kl": 0.07177734375, + "learning_rate": 1.6559568362141769e-07, + "loss": 0.0029, + "num_tokens": 58086726.0, + "reward": 0.8125, + "reward_std": 0.4476938843727112, + "rewards/reasoning_reward/mean": 0.8125, + "rewards/reasoning_reward/std": 0.6562823057174683, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 154.5416717529297, + "completions/mean_terminated_length": 154.5416717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.7352637021716649, + "grad_norm": 3.1124009121892198, + "kl": 0.09228515625, + "learning_rate": 1.6438981416944708e-07, + "loss": 0.0037, + "num_tokens": 58173611.0, + "reward": 0.8333333730697632, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.8164966106414795, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 196.0416717529297, + "completions/mean_terminated_length": 196.0416717529297, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.7362978283350569, + "grad_norm": 3.1443322611380684, + "kl": 0.08544921875, + "learning_rate": 1.631874869865587e-07, + "loss": 0.0034, + "num_tokens": 58251924.0, + "reward": 1.3958333730697632, + "reward_std": 0.33768826723098755, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.48854637145996094, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 124.125, + "completions/mean_terminated_length": 124.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7373319544984488, + "grad_norm": 2.3752422112829867, + "kl": 0.054443359375, + "learning_rate": 1.6198871476297033e-07, + "loss": 0.0022, + "num_tokens": 58332575.0, + "reward": 0.8958333730697632, + "reward_std": 0.19795583188533783, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.3605300188064575, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 150.4166717529297, + "completions/mean_terminated_length": 150.4166717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.7383660806618407, + "grad_norm": 2.0212470615420246, + "kl": 0.051025390625, + "learning_rate": 1.607935101513785e-07, + "loss": 0.002, + "num_tokens": 58418201.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 171.58334350585938, + "completions/mean_terminated_length": 171.58334350585938, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.7394002068252327, + "grad_norm": 3.251685083157416, + "kl": 0.061767578125, + "learning_rate": 1.596018857668242e-07, + "loss": 0.0025, + "num_tokens": 58495583.0, + "reward": 0.6875, + "reward_std": 0.3709374666213989, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.7194880843162537, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 212.08334350585938, + "completions/mean_terminated_length": 212.08334350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.7404343329886246, + "grad_norm": 3.242133442856897, + "kl": 0.07177734375, + "learning_rate": 1.5841385418656068e-07, + "loss": 0.0029, + "num_tokens": 58581425.0, + "reward": 1.4583333730697632, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 168.33334350585938, + "completions/mean_terminated_length": 168.33334350585938, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.7414684591520165, + "grad_norm": 3.0656410513536554, + "kl": 0.07373046875, + "learning_rate": 1.5722942794991995e-07, + "loss": 0.003, + "num_tokens": 58665161.0, + "reward": 1.2291667461395264, + "reward_std": 0.39486488699913025, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.6251811385154724, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 144.2916717529297, + "completions/mean_terminated_length": 144.2916717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7425025853154085, + "grad_norm": 3.1287480808791184, + "kl": 0.07177734375, + "learning_rate": 1.5604861955818038e-07, + "loss": 0.0029, + "num_tokens": 58752048.0, + "reward": 0.9166666865348816, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.7172814607620239, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 161.9166717529297, + "completions/mean_terminated_length": 161.9166717529297, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7435367114788004, + "grad_norm": 4.0449765747453545, + "kl": 0.08740234375, + "learning_rate": 1.548714414744356e-07, + "loss": 0.0035, + "num_tokens": 58834862.0, + "reward": 0.9583333730697632, + "reward_std": 0.39814266562461853, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.7359800934791565, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 172.08334350585938, + "completions/mean_terminated_length": 172.08334350585938, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.7445708376421923, + "grad_norm": 3.011337708200572, + "kl": 0.06298828125, + "learning_rate": 1.5369790612346168e-07, + "loss": 0.0025, + "num_tokens": 58914176.0, + "reward": 1.0694444179534912, + "reward_std": 0.2472916543483734, + "rewards/reasoning_reward/mean": 1.0694444179534912, + "rewards/reasoning_reward/std": 0.3506951928138733, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7456049638055843, + "grad_norm": 4.6480620380414965, + "kl": 0.0625, + "learning_rate": 1.5252802589158737e-07, + "loss": 0.0025, + "num_tokens": 58996844.0, + "reward": 1.0694445371627808, + "reward_std": 0.30431777238845825, + "rewards/reasoning_reward/mean": 1.0694445371627808, + "rewards/reasoning_reward/std": 0.7659994959831238, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 148.83334350585938, + "completions/mean_terminated_length": 148.83334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7466390899689762, + "grad_norm": 3.1996652428167045, + "kl": 0.056884765625, + "learning_rate": 1.513618131265621e-07, + "loss": 0.0023, + "num_tokens": 59076408.0, + "reward": 0.9791666865348816, + "reward_std": 0.24056154489517212, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.6672325730323792, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 172.625, + "completions/mean_terminated_length": 172.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.7476732161323681, + "grad_norm": 3.280704854907078, + "kl": 0.06982421875, + "learning_rate": 1.5019928013742682e-07, + "loss": 0.0028, + "num_tokens": 59163431.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.9315329790115356, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 134.70834350585938, + "completions/mean_terminated_length": 134.70834350585938, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.7487073422957601, + "grad_norm": 2.185541011879563, + "kl": 0.06640625, + "learning_rate": 1.490404391943829e-07, + "loss": 0.0026, + "num_tokens": 59246096.0, + "reward": 1.25, + "reward_std": 0.07273930311203003, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 165.0416717529297, + "completions/mean_terminated_length": 165.0416717529297, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.749741468459152, + "grad_norm": 2.51252824919629, + "kl": 0.09521484375, + "learning_rate": 1.4788530252866372e-07, + "loss": 0.0038, + "num_tokens": 59325185.0, + "reward": 0.625, + "reward_std": 0.21362332999706268, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.5757792592048645, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 162.6666717529297, + "completions/mean_terminated_length": 162.6666717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.750775594622544, + "grad_norm": 2.8053048835881818, + "kl": 0.060546875, + "learning_rate": 1.4673388233240502e-07, + "loss": 0.0024, + "num_tokens": 59416513.0, + "reward": 1.4375, + "reward_std": 0.12400396913290024, + "rewards/reasoning_reward/mean": 1.4375, + "rewards/reasoning_reward/std": 0.47348156571388245, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 181.45834350585938, + "completions/mean_terminated_length": 178.3913116455078, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7518097207859359, + "grad_norm": 4.163295102377276, + "kl": 0.060546875, + "learning_rate": 1.455861907585158e-07, + "loss": 0.0024, + "num_tokens": 59494236.0, + "reward": 0.9375, + "reward_std": 0.5304333567619324, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.613480806350708, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 169.83334350585938, + "completions/mean_terminated_length": 169.83334350585938, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.7528438469493278, + "grad_norm": 3.420329539869673, + "kl": 0.08251953125, + "learning_rate": 1.4444223992055116e-07, + "loss": 0.0033, + "num_tokens": 59578752.0, + "reward": 1.0, + "reward_std": 0.36585909128189087, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.4662523865699768, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 147.9166717529297, + "completions/mean_terminated_length": 147.9166717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7538779731127198, + "grad_norm": 4.210005348002778, + "kl": 0.06298828125, + "learning_rate": 1.4330204189258327e-07, + "loss": 0.0025, + "num_tokens": 59655494.0, + "reward": 0.6180555820465088, + "reward_std": 0.47668325901031494, + "rewards/reasoning_reward/mean": 0.618055522441864, + "rewards/reasoning_reward/std": 0.5948230028152466, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 151.7916717529297, + "completions/mean_terminated_length": 151.7916717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.7549120992761117, + "grad_norm": 3.359678282929136, + "kl": 0.0771484375, + "learning_rate": 1.4216560870907496e-07, + "loss": 0.0031, + "num_tokens": 59733049.0, + "reward": 1.1111111640930176, + "reward_std": 0.3074157238006592, + "rewards/reasoning_reward/mean": 1.1111111640930176, + "rewards/reasoning_reward/std": 0.3796345293521881, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 169.6666717529297, + "completions/mean_terminated_length": 169.43478393554688, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.7559462254395036, + "grad_norm": 4.109241986433982, + "kl": 0.072265625, + "learning_rate": 1.4103295236475166e-07, + "loss": 0.0029, + "num_tokens": 59811105.0, + "reward": 1.125, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6123724579811096, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 164.0416717529297, + "completions/mean_terminated_length": 164.0416717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.7569803516028956, + "grad_norm": 3.609914878076435, + "kl": 0.07958984375, + "learning_rate": 1.3990408481447596e-07, + "loss": 0.0032, + "num_tokens": 59895338.0, + "reward": 0.7916666865348816, + "reward_std": 0.4563485085964203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.832970917224884, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 170.9166717529297, + "completions/mean_terminated_length": 170.9166717529297, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.7580144777662875, + "grad_norm": 3.4849706899707638, + "kl": 0.051513671875, + "learning_rate": 1.387790179731202e-07, + "loss": 0.0021, + "num_tokens": 59974200.0, + "reward": 1.0625, + "reward_std": 0.29339051246643066, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.4499396085739136, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 151.625, + "completions/mean_terminated_length": 151.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.7590486039296794, + "grad_norm": 4.775586282531071, + "kl": 0.06298828125, + "learning_rate": 1.3765776371544173e-07, + "loss": 0.0025, + "num_tokens": 60050775.0, + "reward": 0.8472222685813904, + "reward_std": 0.3458244502544403, + "rewards/reasoning_reward/mean": 0.8472222685813904, + "rewards/reasoning_reward/std": 0.38359054923057556, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 176.20834350585938, + "completions/mean_terminated_length": 176.20834350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.7600827300930714, + "grad_norm": 3.511611967896255, + "kl": 0.083984375, + "learning_rate": 1.3654033387595732e-07, + "loss": 0.0034, + "num_tokens": 60134140.0, + "reward": 1.2916667461395264, + "reward_std": 0.30078065395355225, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 125.5, + "completions/mean_terminated_length": 125.5, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.7611168562564633, + "grad_norm": 4.1099857019944395, + "kl": 0.10205078125, + "learning_rate": 1.3542674024881746e-07, + "loss": 0.0041, + "num_tokens": 60211512.0, + "reward": 1.0416667461395264, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.3877657949924469, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 123.95833587646484, + "completions/mean_terminated_length": 123.95833587646484, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.7621509824198552, + "grad_norm": 2.5865983458888597, + "kl": 0.056396484375, + "learning_rate": 1.3431699458768332e-07, + "loss": 0.0023, + "num_tokens": 60289567.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 148.33334350585938, + "completions/mean_terminated_length": 148.33334350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.7631851085832472, + "grad_norm": 4.027481526615197, + "kl": 0.07470703125, + "learning_rate": 1.332111086056011e-07, + "loss": 0.003, + "num_tokens": 60370247.0, + "reward": 1.3125, + "reward_std": 0.39615845680236816, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.6045569777488708, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 134.70834350585938, + "completions/mean_terminated_length": 134.70834350585938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.7642192347466391, + "grad_norm": 3.3303426892145067, + "kl": 0.0419921875, + "learning_rate": 1.3210909397487995e-07, + "loss": 0.0017, + "num_tokens": 60450768.0, + "reward": 0.8333333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 137.2916717529297, + "completions/mean_terminated_length": 137.2916717529297, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.765253360910031, + "grad_norm": 4.295523478132368, + "kl": 0.087890625, + "learning_rate": 1.3101096232696735e-07, + "loss": 0.0035, + "num_tokens": 60540887.0, + "reward": 0.993055522441864, + "reward_std": 0.25392836332321167, + "rewards/reasoning_reward/mean": 0.993055522441864, + "rewards/reasoning_reward/std": 0.6245368719100952, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 159.7916717529297, + "completions/mean_terminated_length": 159.7916717529297, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.766287487073423, + "grad_norm": 3.7021465221363217, + "kl": 0.08740234375, + "learning_rate": 1.2991672525232756e-07, + "loss": 0.0035, + "num_tokens": 60629106.0, + "reward": 1.5416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.6580053567886353, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 172.6666717529297, + "completions/mean_terminated_length": 172.6666717529297, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.7673216132368149, + "grad_norm": 3.9994888009560037, + "kl": 0.07763671875, + "learning_rate": 1.2882639430031833e-07, + "loss": 0.0031, + "num_tokens": 60718066.0, + "reward": 1.5625, + "reward_std": 0.25392839312553406, + "rewards/reasoning_reward/mean": 1.5625, + "rewards/reasoning_reward/std": 0.5578004121780396, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.7683557394002068, + "grad_norm": 0.3657715562784098, + "kl": 0.0693359375, + "learning_rate": 1.2773998097906962e-07, + "loss": 0.0028, + "num_tokens": 60800979.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.8340576887130737, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 159.125, + "completions/mean_terminated_length": 159.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7693898655635988, + "grad_norm": 2.1996165323485397, + "kl": 0.052978515625, + "learning_rate": 1.2665749675536209e-07, + "loss": 0.0021, + "num_tokens": 60879894.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 159.125, + "completions/mean_terminated_length": 159.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.7704239917269907, + "grad_norm": 3.291864531573179, + "kl": 0.056640625, + "learning_rate": 1.2557895305450533e-07, + "loss": 0.0023, + "num_tokens": 60962449.0, + "reward": 1.125, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6123724579811096, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7714581178903827, + "grad_norm": 4.09716328217157, + "kl": 0.06396484375, + "learning_rate": 1.2450436126021863e-07, + "loss": 0.0026, + "num_tokens": 61045190.0, + "reward": 1.2569444179534912, + "reward_std": 0.40088510513305664, + "rewards/reasoning_reward/mean": 1.2569444179534912, + "rewards/reasoning_reward/std": 0.6738367080688477, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 156.33334350585938, + "completions/mean_terminated_length": 156.33334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7724922440537746, + "grad_norm": 3.237616260351, + "kl": 0.05859375, + "learning_rate": 1.234337327145092e-07, + "loss": 0.0023, + "num_tokens": 61130822.0, + "reward": 1.2777777910232544, + "reward_std": 0.15713486075401306, + "rewards/reasoning_reward/mean": 1.2777777910232544, + "rewards/reasoning_reward/std": 0.5353825092315674, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 128.625, + "completions/mean_terminated_length": 128.625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.7735263702171665, + "grad_norm": 2.731762557253584, + "kl": 0.05859375, + "learning_rate": 1.2236707871755403e-07, + "loss": 0.0023, + "num_tokens": 61209893.0, + "reward": 1.0208333730697632, + "reward_std": 0.16517187654972076, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.2750164568424225, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.7745604963805585, + "grad_norm": 3.5907800559210017, + "kl": 0.07275390625, + "learning_rate": 1.2130441052757939e-07, + "loss": 0.0029, + "num_tokens": 61289695.0, + "reward": 0.75, + "reward_std": 0.3247893452644348, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.48900964856147766, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 149.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.7755946225439504, + "grad_norm": 2.491098383777402, + "kl": 0.095703125, + "learning_rate": 1.2024573936074274e-07, + "loss": 0.0038, + "num_tokens": 61373152.0, + "reward": 0.8541666865348816, + "reward_std": 0.16517187654972076, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.7144344449043274, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 188.75, + "completions/mean_terminated_length": 188.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.7766287487073423, + "grad_norm": 2.7802792084915904, + "kl": 0.062255859375, + "learning_rate": 1.1919107639101423e-07, + "loss": 0.0025, + "num_tokens": 61452002.0, + "reward": 0.75, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.5897678136825562, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 150.2916717529297, + "completions/mean_terminated_length": 150.2916717529297, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.7776628748707343, + "grad_norm": 0.23470415565994152, + "kl": 0.07177734375, + "learning_rate": 1.181404327500582e-07, + "loss": 0.0029, + "num_tokens": 61528641.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 122.625, + "completions/mean_terminated_length": 122.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.7786970010341262, + "grad_norm": 3.327920369451495, + "kl": 0.07470703125, + "learning_rate": 1.1709381952711667e-07, + "loss": 0.003, + "num_tokens": 61609248.0, + "reward": 1.0, + "reward_std": 0.19500279426574707, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.2553769648075104, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 144.08334350585938, + "completions/mean_terminated_length": 144.08334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7797311271975181, + "grad_norm": 3.335800938480737, + "kl": 0.057373046875, + "learning_rate": 1.1605124776889125e-07, + "loss": 0.0023, + "num_tokens": 61685530.0, + "reward": 1.1666667461395264, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.637022078037262, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 175.08334350585938, + "completions/mean_terminated_length": 175.08334350585938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.7807652533609101, + "grad_norm": 3.294221512666946, + "kl": 0.06884765625, + "learning_rate": 1.150127284794275e-07, + "loss": 0.0027, + "num_tokens": 61768532.0, + "reward": 1.3888888359069824, + "reward_std": 0.11878277361392975, + "rewards/reasoning_reward/mean": 1.3888888359069824, + "rewards/reasoning_reward/std": 0.3134145140647888, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 173.125, + "completions/mean_terminated_length": 173.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.781799379524302, + "grad_norm": 3.396721463922415, + "kl": 0.06298828125, + "learning_rate": 1.1397827261999793e-07, + "loss": 0.0025, + "num_tokens": 61853439.0, + "reward": 1.1875, + "reward_std": 0.4671441912651062, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.6395055651664734, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.7828335056876939, + "grad_norm": 4.325100386537247, + "kl": 0.08447265625, + "learning_rate": 1.1294789110898711e-07, + "loss": 0.0034, + "num_tokens": 61932395.0, + "reward": 0.75, + "reward_std": 0.5782498121261597, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.6255432367324829, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 171.7916717529297, + "completions/mean_terminated_length": 171.7916717529297, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7838676318510859, + "grad_norm": 4.370174208909052, + "kl": 0.07275390625, + "learning_rate": 1.119215948217756e-07, + "loss": 0.0029, + "num_tokens": 62009526.0, + "reward": 0.9166666865348816, + "reward_std": 0.3493061661720276, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 173.08334350585938, + "completions/mean_terminated_length": 173.08334350585938, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7849017580144778, + "grad_norm": 2.9640064799173294, + "kl": 0.07568359375, + "learning_rate": 1.1089939459062602e-07, + "loss": 0.003, + "num_tokens": 62092712.0, + "reward": 1.0416667461395264, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.6902530789375305, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 181.25, + "completions/mean_terminated_length": 181.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.7859358841778697, + "grad_norm": 3.2949381510622544, + "kl": 0.08251953125, + "learning_rate": 1.0988130120456813e-07, + "loss": 0.0033, + "num_tokens": 62169958.0, + "reward": 0.7083333730697632, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.7790277004241943, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 154.75, + "completions/mean_terminated_length": 154.75, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.7869700103412617, + "grad_norm": 2.3992415178790885, + "kl": 0.07373046875, + "learning_rate": 1.088673254092849e-07, + "loss": 0.003, + "num_tokens": 62252120.0, + "reward": 1.25, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7880041365046536, + "grad_norm": 3.3520352974736984, + "kl": 0.042236328125, + "learning_rate": 1.0785747790699978e-07, + "loss": 0.0017, + "num_tokens": 62333205.0, + "reward": 0.7916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 161.83334350585938, + "completions/mean_terminated_length": 161.83334350585938, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.7890382626680456, + "grad_norm": 3.9881962321706217, + "kl": 0.08740234375, + "learning_rate": 1.0685176935636265e-07, + "loss": 0.0035, + "num_tokens": 62416633.0, + "reward": 1.375, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.5160468220710754, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 125.29167175292969, + "completions/mean_terminated_length": 125.29167175292969, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.7900723888314375, + "grad_norm": 4.570037175008805, + "kl": 0.07177734375, + "learning_rate": 1.0585021037233871e-07, + "loss": 0.0029, + "num_tokens": 62492632.0, + "reward": 1.0833333730697632, + "reward_std": 0.43459486961364746, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.637022078037262, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 170.83334350585938, + "completions/mean_terminated_length": 170.83334350585938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.7911065149948294, + "grad_norm": 4.0290312319505395, + "kl": 0.07763671875, + "learning_rate": 1.0485281152609482e-07, + "loss": 0.0031, + "num_tokens": 62575660.0, + "reward": 1.1666667461395264, + "reward_std": 0.36444199085235596, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.40824830532073975, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 157.4166717529297, + "completions/mean_terminated_length": 157.4166717529297, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.7921406411582212, + "grad_norm": 1.9636414153768984, + "kl": 0.07568359375, + "learning_rate": 1.0385958334488965e-07, + "loss": 0.003, + "num_tokens": 62654838.0, + "reward": 0.375, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 156.4166717529297, + "completions/mean_terminated_length": 156.4166717529297, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7931747673216132, + "grad_norm": 3.236809976550798, + "kl": 0.09765625, + "learning_rate": 1.0287053631196108e-07, + "loss": 0.0039, + "num_tokens": 62731304.0, + "reward": 0.4583333432674408, + "reward_std": 0.2842140197753906, + "rewards/reasoning_reward/mean": 0.4583333432674408, + "rewards/reasoning_reward/std": 0.6412736177444458, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 151.33334350585938, + "completions/mean_terminated_length": 151.33334350585938, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7942088934850051, + "grad_norm": 4.220315176240241, + "kl": 0.08447265625, + "learning_rate": 1.0188568086641614e-07, + "loss": 0.0034, + "num_tokens": 62813816.0, + "reward": 1.1458333730697632, + "reward_std": 0.4039583206176758, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.5610387921333313, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 175.9166717529297, + "completions/mean_terminated_length": 175.9166717529297, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.795243019648397, + "grad_norm": 3.8313849909839535, + "kl": 0.05859375, + "learning_rate": 1.0090502740312152e-07, + "loss": 0.0023, + "num_tokens": 62893494.0, + "reward": 0.6875, + "reward_std": 0.47950729727745056, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.4618605971336365, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 144.4166717529297, + "completions/mean_terminated_length": 144.4166717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.796277145811789, + "grad_norm": 4.064061665509299, + "kl": 0.09033203125, + "learning_rate": 9.992858627259237e-08, + "loss": 0.0036, + "num_tokens": 62975872.0, + "reward": 1.125, + "reward_std": 0.5049939155578613, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6796738505363464, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 169.45834350585938, + "completions/mean_terminated_length": 169.45834350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.7973112719751809, + "grad_norm": 3.87391780953322, + "kl": 0.07666015625, + "learning_rate": 9.895636778088457e-08, + "loss": 0.0031, + "num_tokens": 63058859.0, + "reward": 0.6875, + "reward_std": 0.4851650893688202, + "rewards/reasoning_reward/mean": 0.6875, + "rewards/reasoning_reward/std": 0.7490936517715454, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 142.5416717529297, + "completions/mean_terminated_length": 142.5416717529297, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.7983453981385729, + "grad_norm": 4.501546300705031, + "kl": 0.0859375, + "learning_rate": 9.798838218948468e-08, + "loss": 0.0034, + "num_tokens": 63135176.0, + "reward": 1.0833333730697632, + "reward_std": 0.42156457901000977, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.7322785258293152, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 156.08334350585938, + "completions/mean_terminated_length": 156.08334350585938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7993795243019648, + "grad_norm": 2.414414647480725, + "kl": 0.055908203125, + "learning_rate": 9.702463971520264e-08, + "loss": 0.0022, + "num_tokens": 63217306.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 148.2916717529297, + "completions/mean_terminated_length": 148.2916717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8004136504653567, + "grad_norm": 4.229101545256886, + "kl": 0.09716796875, + "learning_rate": 9.606515053006347e-08, + "loss": 0.0039, + "num_tokens": 63305329.0, + "reward": 1.375, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 115.79167175292969, + "completions/mean_terminated_length": 115.79167175292969, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.8014477766287487, + "grad_norm": 4.537579562049641, + "kl": 0.050537109375, + "learning_rate": 9.510992476119962e-08, + "loss": 0.002, + "num_tokens": 63385604.0, + "reward": 0.6666666865348816, + "reward_std": 0.4446708858013153, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 152.0, + "completions/mean_terminated_length": 152.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.8024819027921406, + "grad_norm": 0.2165151880931561, + "kl": 0.06982421875, + "learning_rate": 9.415897249074478e-08, + "loss": 0.0028, + "num_tokens": 63474276.0, + "reward": 1.6666667461395264, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.6666666269302368, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 145.20834350585938, + "completions/mean_terminated_length": 145.20834350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.8035160289555325, + "grad_norm": 2.756431222536208, + "kl": 0.044189453125, + "learning_rate": 9.321230375572681e-08, + "loss": 0.0018, + "num_tokens": 63553017.0, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 166.83334350585938, + "completions/mean_terminated_length": 166.83334350585938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.8045501551189245, + "grad_norm": 3.3926466918323492, + "kl": 0.048095703125, + "learning_rate": 9.226992854796234e-08, + "loss": 0.0019, + "num_tokens": 63633093.0, + "reward": 1.1041667461395264, + "reward_std": 0.28302299976348877, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.416485458612442, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 164.375, + "completions/mean_terminated_length": 164.375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.8055842812823164, + "grad_norm": 3.0968300481941142, + "kl": 0.068359375, + "learning_rate": 9.133185681395072e-08, + "loss": 0.0027, + "num_tokens": 63716078.0, + "reward": 1.0416667461395264, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 153.625, + "completions/mean_terminated_length": 153.625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8066184074457083, + "grad_norm": 3.196506746978936, + "kl": 0.0810546875, + "learning_rate": 9.03980984547697e-08, + "loss": 0.0032, + "num_tokens": 63798485.0, + "reward": 0.7708333730697632, + "reward_std": 0.2644323706626892, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.3895137906074524, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.8076525336091003, + "grad_norm": 3.6585350966268906, + "kl": 0.048095703125, + "learning_rate": 8.946866332597064e-08, + "loss": 0.0019, + "num_tokens": 63885723.0, + "reward": 1.2083333730697632, + "reward_std": 0.31285393238067627, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5694518685340881, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 158.45834350585938, + "completions/mean_terminated_length": 158.45834350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8086866597724922, + "grad_norm": 3.5060979679162885, + "kl": 0.049560546875, + "learning_rate": 8.854356123747392e-08, + "loss": 0.002, + "num_tokens": 63964702.0, + "reward": 0.625, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 174.6666717529297, + "completions/mean_terminated_length": 174.6666717529297, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.8097207859358841, + "grad_norm": 3.1782626992547836, + "kl": 0.076171875, + "learning_rate": 8.762280195346655e-08, + "loss": 0.003, + "num_tokens": 64047942.0, + "reward": 1.1666667461395264, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.8107549120992761, + "grad_norm": 3.2114912520386825, + "kl": 0.08154296875, + "learning_rate": 8.6706395192298e-08, + "loss": 0.0033, + "num_tokens": 64128223.0, + "reward": 0.8333333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 150.375, + "completions/mean_terminated_length": 150.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.811789038262668, + "grad_norm": 2.4667511559934585, + "kl": 0.0576171875, + "learning_rate": 8.579435062637863e-08, + "loss": 0.0023, + "num_tokens": 64212144.0, + "reward": 0.875, + "reward_std": 0.07715167850255966, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.6954823136329651, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 140.375, + "completions/mean_terminated_length": 140.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.81282316442606, + "grad_norm": 2.527486671838808, + "kl": 0.08642578125, + "learning_rate": 8.488667788207642e-08, + "loss": 0.0035, + "num_tokens": 64296945.0, + "reward": 1.3125, + "reward_std": 0.13908717036247253, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.5479705333709717, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 147.25, + "completions/mean_terminated_length": 147.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8138572905894519, + "grad_norm": 4.580987567602128, + "kl": 0.09521484375, + "learning_rate": 8.398338653961673e-08, + "loss": 0.0038, + "num_tokens": 64384935.0, + "reward": 1.2083333730697632, + "reward_std": 0.3478729724884033, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.8670706748962402, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 190.0, + "completions/mean_terminated_length": 190.0, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8148914167528438, + "grad_norm": 3.6226405946498326, + "kl": 0.08935546875, + "learning_rate": 8.30844861329798e-08, + "loss": 0.0036, + "num_tokens": 64462951.0, + "reward": 1.152777910232544, + "reward_std": 0.237970232963562, + "rewards/reasoning_reward/mean": 1.1527777910232544, + "rewards/reasoning_reward/std": 0.30263206362724304, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.8159255429162358, + "grad_norm": 4.446754103716307, + "kl": 0.08447265625, + "learning_rate": 8.218998614980132e-08, + "loss": 0.0034, + "num_tokens": 64539215.0, + "reward": 0.375, + "reward_std": 0.47419947385787964, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 179.0416717529297, + "completions/mean_terminated_length": 179.0416717529297, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.8169596690796277, + "grad_norm": 3.921864271005609, + "kl": 0.061279296875, + "learning_rate": 8.12998960312718e-08, + "loss": 0.0024, + "num_tokens": 64622584.0, + "reward": 1.3402776718139648, + "reward_std": 0.38187703490257263, + "rewards/reasoning_reward/mean": 1.3402776718139648, + "rewards/reasoning_reward/std": 0.5804362297058105, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 164.95834350585938, + "completions/mean_terminated_length": 164.95834350585938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.8179937952430196, + "grad_norm": 4.340848550091104, + "kl": 0.076171875, + "learning_rate": 8.041422517203627e-08, + "loss": 0.003, + "num_tokens": 64711807.0, + "reward": 1.1666667461395264, + "reward_std": 0.4710209369659424, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.6197240948677063, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.8190279214064116, + "grad_norm": 2.649981760760686, + "kl": 0.0537109375, + "learning_rate": 7.953298292009658e-08, + "loss": 0.0021, + "num_tokens": 64792344.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 153.6666717529297, + "completions/mean_terminated_length": 153.6666717529297, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.8200620475698035, + "grad_norm": 3.2958211549645253, + "kl": 0.06689453125, + "learning_rate": 7.86561785767112e-08, + "loss": 0.0027, + "num_tokens": 64868632.0, + "reward": 0.7916666865348816, + "reward_std": 0.3020375669002533, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.3877657949924469, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 144.875, + "completions/mean_terminated_length": 144.875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.8210961737331954, + "grad_norm": 3.09060522662954, + "kl": 0.0693359375, + "learning_rate": 7.77838213962983e-08, + "loss": 0.0028, + "num_tokens": 64950501.0, + "reward": 1.1666667461395264, + "reward_std": 0.3666771650314331, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.5450701713562012, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 144.4166717529297, + "completions/mean_terminated_length": 144.4166717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8221302998965874, + "grad_norm": 3.8226975362183455, + "kl": 0.06787109375, + "learning_rate": 7.691592058633694e-08, + "loss": 0.0027, + "num_tokens": 65028095.0, + "reward": 0.9375, + "reward_std": 0.35495084524154663, + "rewards/reasoning_reward/mean": 0.9375, + "rewards/reasoning_reward/std": 0.5379611253738403, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 127.91667175292969, + "completions/mean_terminated_length": 127.91667175292969, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.8231644260599793, + "grad_norm": 2.3041894307690525, + "kl": 0.061767578125, + "learning_rate": 7.605248530727115e-08, + "loss": 0.0025, + "num_tokens": 65112941.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 167.83334350585938, + "completions/mean_terminated_length": 167.83334350585938, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.8241985522233712, + "grad_norm": 4.091470007656279, + "kl": 0.10986328125, + "learning_rate": 7.519352467241197e-08, + "loss": 0.0044, + "num_tokens": 65195697.0, + "reward": 1.0833333730697632, + "reward_std": 0.3794546127319336, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.6197241544723511, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.8252326783867632, + "grad_norm": 3.2064801888792744, + "kl": 0.07958984375, + "learning_rate": 7.433904774784216e-08, + "loss": 0.0032, + "num_tokens": 65275673.0, + "reward": 1.076388955116272, + "reward_std": 0.12554192543029785, + "rewards/reasoning_reward/mean": 1.076388955116272, + "rewards/reasoning_reward/std": 0.17706114053726196, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 149.0416717529297, + "completions/mean_terminated_length": 149.0416717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.8262668045501551, + "grad_norm": 2.331881060813706, + "kl": 0.06396484375, + "learning_rate": 7.348906355232027e-08, + "loss": 0.0026, + "num_tokens": 65353674.0, + "reward": 0.7291666865348816, + "reward_std": 0.08625819534063339, + "rewards/reasoning_reward/mean": 0.7291666865348816, + "rewards/reasoning_reward/std": 0.551266610622406, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 149.0, + "completions/mean_terminated_length": 149.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.827300930713547, + "grad_norm": 3.5190409760967327, + "kl": 0.09228515625, + "learning_rate": 7.264358105718505e-08, + "loss": 0.0037, + "num_tokens": 65443242.0, + "reward": 1.125, + "reward_std": 0.367926687002182, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.4484272003173828, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 135.83334350585938, + "completions/mean_terminated_length": 135.83334350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.828335056876939, + "grad_norm": 2.332152031715114, + "kl": 0.0703125, + "learning_rate": 7.180260918626152e-08, + "loss": 0.0028, + "num_tokens": 65530702.0, + "reward": 1.3125, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.4618605971336365, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 172.625, + "completions/mean_terminated_length": 172.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.8293691830403309, + "grad_norm": 3.0269063067385193, + "kl": 0.057373046875, + "learning_rate": 7.096615681576596e-08, + "loss": 0.0023, + "num_tokens": 65607861.0, + "reward": 0.5625, + "reward_std": 0.33768826723098755, + "rewards/reasoning_reward/mean": 0.5625, + "rewards/reasoning_reward/std": 0.5954993963241577, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 168.6666717529297, + "completions/mean_terminated_length": 168.6666717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8304033092037229, + "grad_norm": 3.273127666339797, + "kl": 0.055419921875, + "learning_rate": 7.013423277421299e-08, + "loss": 0.0022, + "num_tokens": 65686197.0, + "reward": 0.8541666865348816, + "reward_std": 0.1767766922712326, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.5985338091850281, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 198.95834350585938, + "completions/mean_terminated_length": 198.95834350585938, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8314374353671148, + "grad_norm": 3.550378331709515, + "kl": 0.07373046875, + "learning_rate": 6.93068458423216e-08, + "loss": 0.0029, + "num_tokens": 65774468.0, + "reward": 0.9722222685813904, + "reward_std": 0.3949992060661316, + "rewards/reasoning_reward/mean": 0.9722222685813904, + "rewards/reasoning_reward/std": 0.5660837888717651, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 157.5, + "completions/mean_terminated_length": 157.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8324715615305067, + "grad_norm": 2.947936195797767, + "kl": 0.06689453125, + "learning_rate": 6.848400475292343e-08, + "loss": 0.0027, + "num_tokens": 65854024.0, + "reward": 1.0486111640930176, + "reward_std": 0.06924575567245483, + "rewards/reasoning_reward/mean": 1.0486111640930176, + "rewards/reasoning_reward/std": 0.13440841436386108, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 183.58334350585938, + "completions/mean_terminated_length": 183.58334350585938, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.8335056876938987, + "grad_norm": 3.8760059999402343, + "kl": 0.08544921875, + "learning_rate": 6.766571819086941e-08, + "loss": 0.0034, + "num_tokens": 65936142.0, + "reward": 1.0625, + "reward_std": 0.44589531421661377, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.7658352255821228, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 183.4166717529297, + "completions/mean_terminated_length": 183.4166717529297, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.8345398138572906, + "grad_norm": 3.429218077443575, + "kl": 0.06787109375, + "learning_rate": 6.685199479293929e-08, + "loss": 0.0027, + "num_tokens": 66027528.0, + "reward": 1.2291667461395264, + "reward_std": 0.34140023589134216, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.8678538203239441, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 203.125, + "completions/mean_terminated_length": 203.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8355739400206825, + "grad_norm": 3.4009884082930597, + "kl": 0.049072265625, + "learning_rate": 6.604284314774983e-08, + "loss": 0.002, + "num_tokens": 66111411.0, + "reward": 1.3819444179534912, + "reward_std": 0.36571210622787476, + "rewards/reasoning_reward/mean": 1.3819442987442017, + "rewards/reasoning_reward/std": 0.5189155340194702, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 129.45834350585938, + "completions/mean_terminated_length": 129.45834350585938, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.8366080661840745, + "grad_norm": 6.175187456794133, + "kl": 0.1650390625, + "learning_rate": 6.523827179566394e-08, + "loss": 0.0066, + "num_tokens": 66195886.0, + "reward": 0.8333333730697632, + "reward_std": 0.48678088188171387, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7019641399383545, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 134.375, + "completions/mean_terminated_length": 134.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.8376421923474664, + "grad_norm": 3.893758481978179, + "kl": 0.058837890625, + "learning_rate": 6.443828922870127e-08, + "loss": 0.0024, + "num_tokens": 66277543.0, + "reward": 1.0416667461395264, + "reward_std": 0.3506905436515808, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.6240935921669006, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 152.08334350585938, + "completions/mean_terminated_length": 152.08334350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8386763185108583, + "grad_norm": 3.882217378171403, + "kl": 0.08642578125, + "learning_rate": 6.364290389044769e-08, + "loss": 0.0035, + "num_tokens": 66365673.0, + "reward": 1.4444444179534912, + "reward_std": 0.3385341763496399, + "rewards/reasoning_reward/mean": 1.4444442987442017, + "rewards/reasoning_reward/std": 0.5787431597709656, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 137.5416717529297, + "completions/mean_terminated_length": 138.3913116455078, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.8397104446742503, + "grad_norm": 3.974211152800969, + "kl": 0.06787109375, + "learning_rate": 6.285212417596719e-08, + "loss": 0.0027, + "num_tokens": 66448998.0, + "reward": 0.9166666865348816, + "reward_std": 0.4446708858013153, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.7172814607620239, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 159.83334350585938, + "completions/mean_terminated_length": 159.83334350585938, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8407445708376422, + "grad_norm": 3.374272533210629, + "kl": 0.0537109375, + "learning_rate": 6.206595843171225e-08, + "loss": 0.0021, + "num_tokens": 66525802.0, + "reward": 0.8541666865348816, + "reward_std": 0.4042079448699951, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.5413181781768799, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 188.7916717529297, + "completions/mean_terminated_length": 188.7916717529297, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8417786970010341, + "grad_norm": 2.707109875272163, + "kl": 0.07421875, + "learning_rate": 6.128441495543646e-08, + "loss": 0.003, + "num_tokens": 66605133.0, + "reward": 0.9166666865348816, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.8428128231644261, + "grad_norm": 11.687611255036002, + "kl": 0.46875, + "learning_rate": 6.050750199610682e-08, + "loss": 0.0189, + "num_tokens": 66684349.0, + "reward": 0.6666666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 179.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.843846949327818, + "grad_norm": 3.5189829393185375, + "kl": 0.06298828125, + "learning_rate": 5.973522775381618e-08, + "loss": 0.0025, + "num_tokens": 66762020.0, + "reward": 1.2083333730697632, + "reward_std": 0.4261821210384369, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 163.0, + "completions/mean_terminated_length": 163.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.8448810754912099, + "grad_norm": 3.111695745233412, + "kl": 0.0693359375, + "learning_rate": 5.896760037969739e-08, + "loss": 0.0028, + "num_tokens": 66846116.0, + "reward": 0.7777777910232544, + "reward_std": 0.2375655621290207, + "rewards/reasoning_reward/mean": 0.7777777314186096, + "rewards/reasoning_reward/std": 0.8493627309799194, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 144.9166717529297, + "completions/mean_terminated_length": 144.9166717529297, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.8459152016546019, + "grad_norm": 2.7736990622484976, + "kl": 0.054931640625, + "learning_rate": 5.8204627975836696e-08, + "loss": 0.0022, + "num_tokens": 66924266.0, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 170.625, + "completions/mean_terminated_length": 170.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8469493278179938, + "grad_norm": 4.60558678901362, + "kl": 0.068359375, + "learning_rate": 5.744631859518878e-08, + "loss": 0.0027, + "num_tokens": 67002865.0, + "reward": 1.1666667461395264, + "reward_std": 0.2840898931026459, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.35098204016685486, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 151.7916717529297, + "completions/mean_terminated_length": 151.7916717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8479834539813857, + "grad_norm": 3.9470723321035543, + "kl": 0.0556640625, + "learning_rate": 5.66926802414911e-08, + "loss": 0.0022, + "num_tokens": 67087588.0, + "reward": 0.8541666865348816, + "reward_std": 0.4386448860168457, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.6780058741569519, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 160.6666717529297, + "completions/mean_terminated_length": 160.6666717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8490175801447777, + "grad_norm": 3.2433337512879303, + "kl": 0.052978515625, + "learning_rate": 5.594372086918009e-08, + "loss": 0.0021, + "num_tokens": 67166756.0, + "reward": 0.9791666865348816, + "reward_std": 0.32520395517349243, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 155.95834350585938, + "completions/mean_terminated_length": 155.95834350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8500517063081696, + "grad_norm": 2.8260741217314123, + "kl": 0.07080078125, + "learning_rate": 5.519944838330659e-08, + "loss": 0.0028, + "num_tokens": 67249443.0, + "reward": 1.1875, + "reward_std": 0.27053868770599365, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.4848240315914154, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 164.58334350585938, + "completions/mean_terminated_length": 164.58334350585938, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.8510858324715616, + "grad_norm": 4.350268666915288, + "kl": 0.06494140625, + "learning_rate": 5.4459870639452897e-08, + "loss": 0.0026, + "num_tokens": 67327881.0, + "reward": 1.0208333730697632, + "reward_std": 0.494476854801178, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.580089271068573, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 151.20834350585938, + "completions/mean_terminated_length": 151.20834350585938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8521199586349535, + "grad_norm": 4.367925297416209, + "kl": 0.09228515625, + "learning_rate": 5.372499544364972e-08, + "loss": 0.0037, + "num_tokens": 67416214.0, + "reward": 1.625, + "reward_std": 0.4151468276977539, + "rewards/reasoning_reward/mean": 1.625, + "rewards/reasoning_reward/std": 0.42633703351020813, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8531540847983454, + "grad_norm": 3.804426970311705, + "kl": 0.083984375, + "learning_rate": 5.2994830552293365e-08, + "loss": 0.0033, + "num_tokens": 67502027.0, + "reward": 1.1666667461395264, + "reward_std": 0.4114243686199188, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.7172815203666687, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 164.6666717529297, + "completions/mean_terminated_length": 164.6666717529297, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.8541882109617374, + "grad_norm": 2.9609678472482113, + "kl": 0.046875, + "learning_rate": 5.2269383672064736e-08, + "loss": 0.0019, + "num_tokens": 67585763.0, + "reward": 1.0833333730697632, + "reward_std": 0.26726123690605164, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.6197240948677063, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 147.70834350585938, + "completions/mean_terminated_length": 147.70834350585938, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.8552223371251293, + "grad_norm": 3.152704440105035, + "kl": 0.08837890625, + "learning_rate": 5.154866245984696e-08, + "loss": 0.0035, + "num_tokens": 67663468.0, + "reward": 1.1041667461395264, + "reward_std": 0.43070170283317566, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.5103103518486023, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 154.33334350585938, + "completions/mean_terminated_length": 154.33334350585938, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8562564632885212, + "grad_norm": 0.17170167898547498, + "kl": 0.0634765625, + "learning_rate": 5.083267452264556e-08, + "loss": 0.0025, + "num_tokens": 67747188.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 131.2916717529297, + "completions/mean_terminated_length": 131.2916717529297, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.8572905894519132, + "grad_norm": 0.18506989130036433, + "kl": 0.053955078125, + "learning_rate": 5.012142741750725e-08, + "loss": 0.0022, + "num_tokens": 67824483.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.8583247156153051, + "grad_norm": 4.548140603940961, + "kl": 0.130859375, + "learning_rate": 4.941492865144115e-08, + "loss": 0.0052, + "num_tokens": 67906433.0, + "reward": 1.0208333730697632, + "reward_std": 0.41873571276664734, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.7144345045089722, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 165.0, + "completions/mean_terminated_length": 165.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.859358841778697, + "grad_norm": 3.507638044504915, + "kl": 0.080078125, + "learning_rate": 4.8713185681338477e-08, + "loss": 0.0032, + "num_tokens": 67994881.0, + "reward": 1.2083333730697632, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.6240935325622559, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 179.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.860392967942089, + "grad_norm": 2.847799628996861, + "kl": 0.06591796875, + "learning_rate": 4.801620591389477e-08, + "loss": 0.0026, + "num_tokens": 68074736.0, + "reward": 1.1666667461395264, + "reward_std": 0.08908708393573761, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 156.5416717529297, + "completions/mean_terminated_length": 156.5416717529297, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.8614270941054809, + "grad_norm": 3.2176346783915846, + "kl": 0.0439453125, + "learning_rate": 4.7323996705531335e-08, + "loss": 0.0018, + "num_tokens": 68152653.0, + "reward": 1.1875, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 1.1875, + "rewards/reasoning_reward/std": 0.4376940429210663, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 122.91667175292969, + "completions/mean_terminated_length": 122.91667175292969, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.8624612202688728, + "grad_norm": 2.119171203323999, + "kl": 0.05859375, + "learning_rate": 4.6636565362317304e-08, + "loss": 0.0023, + "num_tokens": 68230691.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 151.4166717529297, + "completions/mean_terminated_length": 151.4166717529297, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.8634953464322648, + "grad_norm": 2.9688355068863923, + "kl": 0.06591796875, + "learning_rate": 4.59539191398931e-08, + "loss": 0.0026, + "num_tokens": 68307461.0, + "reward": 1.1458333730697632, + "reward_std": 0.10681166499853134, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.2750164568424225, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 183.70834350585938, + "completions/mean_terminated_length": 183.70834350585938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.8645294725956567, + "grad_norm": 4.374431997930627, + "kl": 0.10595703125, + "learning_rate": 4.527606524339328e-08, + "loss": 0.0043, + "num_tokens": 68391278.0, + "reward": 1.3958333730697632, + "reward_std": 0.4973698854446411, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.5512666702270508, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 117.33333587646484, + "completions/mean_terminated_length": 117.33333587646484, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.8655635987590486, + "grad_norm": 3.368684441355164, + "kl": 0.12890625, + "learning_rate": 4.4603010827371224e-08, + "loss": 0.0052, + "num_tokens": 68475710.0, + "reward": 0.9583333730697632, + "reward_std": 0.21362332999706268, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.35864076018333435, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 110.04167175292969, + "completions/mean_terminated_length": 110.04167175292969, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8665977249224406, + "grad_norm": 0.4287350953005526, + "kl": 0.060546875, + "learning_rate": 4.393476299572263e-08, + "loss": 0.0024, + "num_tokens": 68552959.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 172.4166717529297, + "completions/mean_terminated_length": 172.4166717529297, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8676318510858325, + "grad_norm": 3.1249910497191165, + "kl": 0.0546875, + "learning_rate": 4.327132880161161e-08, + "loss": 0.0022, + "num_tokens": 68630249.0, + "reward": 0.4791666865348816, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.6507381796836853, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 168.70834350585938, + "completions/mean_terminated_length": 168.70834350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.8686659772492245, + "grad_norm": 3.2822755480432844, + "kl": 0.052490234375, + "learning_rate": 4.261271524739524e-08, + "loss": 0.0021, + "num_tokens": 68714258.0, + "reward": 1.4027776718139648, + "reward_std": 0.2818480134010315, + "rewards/reasoning_reward/mean": 1.4027776718139648, + "rewards/reasoning_reward/std": 0.44482171535491943, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 139.0416717529297, + "completions/mean_terminated_length": 139.0416717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8697001034126164, + "grad_norm": 3.768686239797151, + "kl": 0.052978515625, + "learning_rate": 4.195892928455047e-08, + "loss": 0.0021, + "num_tokens": 68792851.0, + "reward": 1.0416667461395264, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.3877657949924469, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 149.45834350585938, + "completions/mean_terminated_length": 149.45834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8707342295760083, + "grad_norm": 2.9671230713429453, + "kl": 0.043212890625, + "learning_rate": 4.130997781360035e-08, + "loss": 0.0017, + "num_tokens": 68872318.0, + "reward": 0.7916666865348816, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 150.1666717529297, + "completions/mean_terminated_length": 150.1666717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8717683557394003, + "grad_norm": 3.480205852106029, + "kl": 0.068359375, + "learning_rate": 4.0665867684041013e-08, + "loss": 0.0027, + "num_tokens": 68947754.0, + "reward": 0.7708333730697632, + "reward_std": 0.28302299976348877, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.416485458612442, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 161.9166717529297, + "completions/mean_terminated_length": 161.9166717529297, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.8728024819027922, + "grad_norm": 2.661968506889756, + "kl": 0.07373046875, + "learning_rate": 4.002660569426997e-08, + "loss": 0.003, + "num_tokens": 69031840.0, + "reward": 1.3541667461395264, + "reward_std": 0.0589255653321743, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 189.26087951660156, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.8738366080661841, + "grad_norm": 3.489151843177023, + "kl": 0.06689453125, + "learning_rate": 3.939219859151377e-08, + "loss": 0.0027, + "num_tokens": 69110189.0, + "reward": 0.7847222685813904, + "reward_std": 0.5546908974647522, + "rewards/reasoning_reward/mean": 0.7847222685813904, + "rewards/reasoning_reward/std": 0.699342668056488, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 150.2916717529297, + "completions/mean_terminated_length": 150.2916717529297, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.8748707342295761, + "grad_norm": 3.1194409662404183, + "kl": 0.0859375, + "learning_rate": 3.876265307175714e-08, + "loss": 0.0034, + "num_tokens": 69188180.0, + "reward": 0.5416666865348816, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 160.95834350585938, + "completions/mean_terminated_length": 160.95834350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.8759048603929679, + "grad_norm": 4.060590754998399, + "kl": 0.06396484375, + "learning_rate": 3.813797577967209e-08, + "loss": 0.0026, + "num_tokens": 69265083.0, + "reward": 1.2291667461395264, + "reward_std": 0.33108004927635193, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.5103103518486023, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 197.4166717529297, + "completions/mean_terminated_length": 197.4166717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.8769389865563598, + "grad_norm": 1.7203510455755477, + "kl": 0.07275390625, + "learning_rate": 3.751817330854806e-08, + "loss": 0.0029, + "num_tokens": 69346101.0, + "reward": 0.7083333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.5500329732894897, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 162.25, + "completions/mean_terminated_length": 162.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.8779731127197518, + "grad_norm": 3.1548182837694405, + "kl": 0.0751953125, + "learning_rate": 3.6903252200222e-08, + "loss": 0.003, + "num_tokens": 69430283.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.47204458713531494, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 153.5, + "completions/mean_terminated_length": 153.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8790072388831437, + "grad_norm": 3.229700554376665, + "kl": 0.0751953125, + "learning_rate": 3.6293218945009364e-08, + "loss": 0.003, + "num_tokens": 69508583.0, + "reward": 0.7708333730697632, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.5311833620071411, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 140.0416717529297, + "completions/mean_terminated_length": 140.0416717529297, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8800413650465356, + "grad_norm": 3.6374367800290743, + "kl": 0.08837890625, + "learning_rate": 3.56880799816362e-08, + "loss": 0.0035, + "num_tokens": 69597664.0, + "reward": 1.2916667461395264, + "reward_std": 0.252508282661438, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.8795173168182373, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 151.58334350585938, + "completions/mean_terminated_length": 151.58334350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.8810754912099276, + "grad_norm": 3.8095535645459964, + "kl": 0.07373046875, + "learning_rate": 3.50878416971701e-08, + "loss": 0.0029, + "num_tokens": 69679846.0, + "reward": 0.8541666865348816, + "reward_std": 0.42002925276756287, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.8531165719032288, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 130.58334350585938, + "completions/mean_terminated_length": 130.58334350585938, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.8821096173733195, + "grad_norm": 2.5914771322358585, + "kl": 0.06494140625, + "learning_rate": 3.449251042695378e-08, + "loss": 0.0026, + "num_tokens": 69766932.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 139.625, + "completions/mean_terminated_length": 139.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.8831437435367114, + "grad_norm": 2.3610444312496774, + "kl": 0.07373046875, + "learning_rate": 3.39020924545379e-08, + "loss": 0.0029, + "num_tokens": 69848611.0, + "reward": 1.25, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.8841778697001034, + "grad_norm": 3.907652167180534, + "kl": 0.07861328125, + "learning_rate": 3.331659401161435e-08, + "loss": 0.0032, + "num_tokens": 69936947.0, + "reward": 1.7083333730697632, + "reward_std": 0.3897872865200043, + "rewards/reasoning_reward/mean": 1.7083333730697632, + "rewards/reasoning_reward/std": 0.4402732849121094, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 163.5416717529297, + "completions/mean_terminated_length": 163.5416717529297, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.8852119958634953, + "grad_norm": 3.4271002980194485, + "kl": 0.072265625, + "learning_rate": 3.2736021277951055e-08, + "loss": 0.0029, + "num_tokens": 70016560.0, + "reward": 0.6666666865348816, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.5646597146987915, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 309.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 168.875, + "completions/mean_terminated_length": 162.78260803222656, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8862461220268872, + "grad_norm": 3.242036202397786, + "kl": 0.10400390625, + "learning_rate": 3.216038038132623e-08, + "loss": 0.0042, + "num_tokens": 70099381.0, + "reward": 0.75, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 124.16667175292969, + "completions/mean_terminated_length": 124.16667175292969, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.8872802481902792, + "grad_norm": 2.4008627320484064, + "kl": 0.039306640625, + "learning_rate": 3.1589677397464433e-08, + "loss": 0.0016, + "num_tokens": 70178001.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 144.9166717529297, + "completions/mean_terminated_length": 144.9166717529297, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.8883143743536711, + "grad_norm": 2.3968224744059543, + "kl": 0.06298828125, + "learning_rate": 3.102391834997142e-08, + "loss": 0.0025, + "num_tokens": 70260223.0, + "reward": 1.0416667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.20412413775920868, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 124.16667175292969, + "completions/mean_terminated_length": 124.16667175292969, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.889348500517063, + "grad_norm": 3.3305458427955363, + "kl": 0.08056640625, + "learning_rate": 3.0463109210271566e-08, + "loss": 0.0032, + "num_tokens": 70343299.0, + "reward": 1.2708333730697632, + "reward_std": 0.30318546295166016, + "rewards/reasoning_reward/mean": 1.2708333730697632, + "rewards/reasoning_reward/std": 0.5893837809562683, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 163.33334350585938, + "completions/mean_terminated_length": 163.33334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.890382626680455, + "grad_norm": 4.099541212176896, + "kl": 0.06494140625, + "learning_rate": 2.990725589754406e-08, + "loss": 0.0026, + "num_tokens": 70434139.0, + "reward": 1.3333333730697632, + "reward_std": 0.46631526947021484, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.6197241544723511, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 149.95834350585938, + "completions/mean_terminated_length": 149.95834350585938, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.8914167528438469, + "grad_norm": 2.2220524006143076, + "kl": 0.0732421875, + "learning_rate": 2.935636427866095e-08, + "loss": 0.0029, + "num_tokens": 70517314.0, + "reward": 0.9513888359069824, + "reward_std": 0.06924576312303543, + "rewards/reasoning_reward/mean": 0.9513888359069824, + "rewards/reasoning_reward/std": 0.7824759483337402, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 161.58334350585938, + "completions/mean_terminated_length": 161.58334350585938, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.8924508790072389, + "grad_norm": 3.2564376835733837, + "kl": 0.059814453125, + "learning_rate": 2.881044016812506e-08, + "loss": 0.0024, + "num_tokens": 70593264.0, + "reward": 0.4791666865348816, + "reward_std": 0.3704721927642822, + "rewards/reasoning_reward/mean": 0.4791666567325592, + "rewards/reasoning_reward/std": 0.5610387921333313, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 157.0416717529297, + "completions/mean_terminated_length": 157.0416717529297, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.8934850051706308, + "grad_norm": 3.445585242026353, + "kl": 0.07763671875, + "learning_rate": 2.8269489328008433e-08, + "loss": 0.0031, + "num_tokens": 70677257.0, + "reward": 1.2083333730697632, + "reward_std": 0.20693820714950562, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 124.54167175292969, + "completions/mean_terminated_length": 124.54167175292969, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.8945191313340227, + "grad_norm": 3.26944024523484, + "kl": 0.07421875, + "learning_rate": 2.7733517467891822e-08, + "loss": 0.003, + "num_tokens": 70761230.0, + "reward": 1.2291667461395264, + "reward_std": 0.21322892606258392, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.5893837213516235, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 143.08334350585938, + "completions/mean_terminated_length": 143.08334350585938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.8955532574974147, + "grad_norm": 3.6817862651467532, + "kl": 0.08154296875, + "learning_rate": 2.720253024480418e-08, + "loss": 0.0033, + "num_tokens": 70855008.0, + "reward": 1.6458333730697632, + "reward_std": 0.35878798365592957, + "rewards/reasoning_reward/mean": 1.6458333730697632, + "rewards/reasoning_reward/std": 0.5208514332771301, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 147.875, + "completions/mean_terminated_length": 147.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8965873836608066, + "grad_norm": 3.9516108836506296, + "kl": 0.06982421875, + "learning_rate": 2.6676533263163103e-08, + "loss": 0.0028, + "num_tokens": 70931629.0, + "reward": 0.6041666865348816, + "reward_std": 0.4130779206752777, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.5893837213516235, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 302.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 150.0416717529297, + "completions/mean_terminated_length": 143.43478393554688, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.8976215098241985, + "grad_norm": 5.483549708220734, + "kl": 0.21484375, + "learning_rate": 2.6155532074715548e-08, + "loss": 0.0086, + "num_tokens": 71011766.0, + "reward": 0.7083333730697632, + "reward_std": 0.46288391947746277, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 138.08334350585938, + "completions/mean_terminated_length": 138.08334350585938, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.8986556359875905, + "grad_norm": 3.8988737222080285, + "kl": 0.07373046875, + "learning_rate": 2.5639532178479417e-08, + "loss": 0.0029, + "num_tokens": 71096920.0, + "reward": 1.25, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.6079187393188477, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 154.70834350585938, + "completions/mean_terminated_length": 154.70834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8996897621509824, + "grad_norm": 3.83680323474309, + "kl": 0.044921875, + "learning_rate": 2.512853902068529e-08, + "loss": 0.0018, + "num_tokens": 71174081.0, + "reward": 0.875, + "reward_std": 0.46288391947746277, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 133.33334350585938, + "completions/mean_terminated_length": 133.33334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.9007238883143743, + "grad_norm": 4.143284467335127, + "kl": 0.08740234375, + "learning_rate": 2.462255799471913e-08, + "loss": 0.0035, + "num_tokens": 71258481.0, + "reward": 1.4583333730697632, + "reward_std": 0.42645785212516785, + "rewards/reasoning_reward/mean": 1.4583333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 141.9166717529297, + "completions/mean_terminated_length": 141.9166717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9017580144777663, + "grad_norm": 2.618618934323824, + "kl": 0.115234375, + "learning_rate": 2.412159444106543e-08, + "loss": 0.0046, + "num_tokens": 71340479.0, + "reward": 0.7083333730697632, + "reward_std": 0.21362332999706268, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.6240935921669006, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.9027921406411582, + "grad_norm": 2.942245207754384, + "kl": 0.06396484375, + "learning_rate": 2.3625653647250388e-08, + "loss": 0.0026, + "num_tokens": 71425653.0, + "reward": 1.125, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.5366967916488647, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 149.58334350585938, + "completions/mean_terminated_length": 149.58334350585938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9038262668045501, + "grad_norm": 3.0454811913733706, + "kl": 0.099609375, + "learning_rate": 2.3134740847786715e-08, + "loss": 0.004, + "num_tokens": 71513515.0, + "reward": 1.5416667461395264, + "reward_std": 0.20693820714950562, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 179.33334350585938, + "completions/mean_terminated_length": 179.33334350585938, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.9048603929679421, + "grad_norm": 2.930325510601697, + "kl": 0.054931640625, + "learning_rate": 2.2648861224117856e-08, + "loss": 0.0022, + "num_tokens": 71591131.0, + "reward": 1.3541667461395264, + "reward_std": 0.3433460593223572, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 179.20834350585938, + "completions/mean_terminated_length": 179.20834350585938, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.905894519131334, + "grad_norm": 4.432380095813641, + "kl": 0.0673828125, + "learning_rate": 2.2168019904563683e-08, + "loss": 0.0027, + "num_tokens": 71668936.0, + "reward": 0.5833333730697632, + "reward_std": 0.3900056481361389, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.5036101341247559, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 136.125, + "completions/mean_terminated_length": 136.125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.9069286452947259, + "grad_norm": 3.138554136523989, + "kl": 0.072265625, + "learning_rate": 2.1692221964266123e-08, + "loss": 0.0029, + "num_tokens": 71752219.0, + "reward": 1.2083333730697632, + "reward_std": 0.3535533845424652, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 146.875, + "completions/mean_terminated_length": 146.875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.9079627714581179, + "grad_norm": 2.9483481427069043, + "kl": 0.044677734375, + "learning_rate": 2.122147242513578e-08, + "loss": 0.0018, + "num_tokens": 71832360.0, + "reward": 0.7083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.7083333134651184, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 150.08334350585938, + "completions/mean_terminated_length": 150.08334350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.9089968976215098, + "grad_norm": 3.1073607500565905, + "kl": 0.048828125, + "learning_rate": 2.0755776255798718e-08, + "loss": 0.002, + "num_tokens": 71916330.0, + "reward": 0.8333333730697632, + "reward_std": 0.39000558853149414, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.637022078037262, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 175.5416717529297, + "completions/mean_terminated_length": 175.5416717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.9100310237849017, + "grad_norm": 3.8150186805318964, + "kl": 0.06884765625, + "learning_rate": 2.0295138371544228e-08, + "loss": 0.0028, + "num_tokens": 71994855.0, + "reward": 0.9583333730697632, + "reward_std": 0.46288391947746277, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.7506036162376404, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 107.95833587646484, + "completions/mean_terminated_length": 107.95833587646484, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.9110651499482937, + "grad_norm": 2.4581793568003634, + "kl": 0.05078125, + "learning_rate": 1.9839563634272972e-08, + "loss": 0.002, + "num_tokens": 72073422.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 156.0416717529297, + "completions/mean_terminated_length": 156.0416717529297, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.9120992761116856, + "grad_norm": 2.6154409593707686, + "kl": 0.0732421875, + "learning_rate": 1.938905685244513e-08, + "loss": 0.0029, + "num_tokens": 72157559.0, + "reward": 0.5416666865348816, + "reward_std": 0.24800793826580048, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5882299542427063, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 136.95834350585938, + "completions/mean_terminated_length": 136.95834350585938, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.9131334022750776, + "grad_norm": 3.5680063997463867, + "kl": 0.07763671875, + "learning_rate": 1.8943622781030564e-08, + "loss": 0.0031, + "num_tokens": 72236238.0, + "reward": 0.2916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.2916666567325592, + "rewards/reasoning_reward/std": 0.4643056094646454, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 149.45834350585938, + "completions/mean_terminated_length": 149.45834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.9141675284384695, + "grad_norm": 3.2270543081927117, + "kl": 0.072265625, + "learning_rate": 1.850326612145775e-08, + "loss": 0.0029, + "num_tokens": 72320209.0, + "reward": 1.1041667461395264, + "reward_std": 0.323208749294281, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.6753286719322205, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 169.83334350585938, + "completions/mean_terminated_length": 169.83334350585938, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.9152016546018614, + "grad_norm": 4.260735191605596, + "kl": 0.0556640625, + "learning_rate": 1.8067991521564852e-08, + "loss": 0.0022, + "num_tokens": 72399093.0, + "reward": 0.8958333730697632, + "reward_std": 0.3960779905319214, + "rewards/reasoning_reward/mean": 0.8958333134651184, + "rewards/reasoning_reward/std": 0.7220015525817871, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.9162357807652534, + "grad_norm": 3.3552268374877303, + "kl": 0.076171875, + "learning_rate": 1.7637803575550115e-08, + "loss": 0.003, + "num_tokens": 72476500.0, + "reward": 0.8333333730697632, + "reward_std": 0.5009793043136597, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.601929247379303, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 147.33334350585938, + "completions/mean_terminated_length": 147.33334350585938, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.9172699069286453, + "grad_norm": 2.054383529313265, + "kl": 0.0517578125, + "learning_rate": 1.7212706823923674e-08, + "loss": 0.0021, + "num_tokens": 72554484.0, + "reward": 0.5416666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.9183040330920372, + "grad_norm": 4.1619103374208235, + "kl": 0.08837890625, + "learning_rate": 1.6792705753459757e-08, + "loss": 0.0035, + "num_tokens": 72632119.0, + "reward": 0.875, + "reward_std": 0.5383754968643188, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.5757792592048645, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 152.95834350585938, + "completions/mean_terminated_length": 152.95834350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.9193381592554292, + "grad_norm": 2.5645639037488097, + "kl": 0.06494140625, + "learning_rate": 1.6377804797148788e-08, + "loss": 0.0026, + "num_tokens": 72718998.0, + "reward": 1.2291667461395264, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 1.2291666269302368, + "rewards/reasoning_reward/std": 0.48854637145996094, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 152.5416717529297, + "completions/mean_terminated_length": 152.5416717529297, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9203722854188211, + "grad_norm": 3.725569433437225, + "kl": 0.07177734375, + "learning_rate": 1.596800833415135e-08, + "loss": 0.0029, + "num_tokens": 72796523.0, + "reward": 1.0416667461395264, + "reward_std": 0.2616034746170044, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.4871538281440735, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 191.5416717529297, + "completions/mean_terminated_length": 191.5416717529297, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.921406411582213, + "grad_norm": 2.795098631521861, + "kl": 0.0556640625, + "learning_rate": 1.5563320689751192e-08, + "loss": 0.0022, + "num_tokens": 72874368.0, + "reward": 0.8402777910232544, + "reward_std": 0.38484740257263184, + "rewards/reasoning_reward/mean": 0.8402777314186096, + "rewards/reasoning_reward/std": 0.7824759483337402, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 165.58334350585938, + "completions/mean_terminated_length": 165.58334350585938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.922440537745605, + "grad_norm": 2.3803367727934823, + "kl": 0.0634765625, + "learning_rate": 1.5163746135310186e-08, + "loss": 0.0025, + "num_tokens": 72953078.0, + "reward": 0.875, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04166666666666663, + "completions/max_length": 355.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 193.33334350585938, + "completions/mean_terminated_length": 186.30435180664062, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.9234746639089969, + "grad_norm": 3.5618288413220407, + "kl": 0.057861328125, + "learning_rate": 1.4769288888222985e-08, + "loss": 0.0023, + "num_tokens": 73031438.0, + "reward": 0.7916666865348816, + "reward_std": 0.3917974829673767, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.5089774131774902, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 154.0416717529297, + "completions/mean_terminated_length": 154.0416717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.9245087900723888, + "grad_norm": 4.201487410221962, + "kl": 0.11279296875, + "learning_rate": 1.4379953111872456e-08, + "loss": 0.0045, + "num_tokens": 73114015.0, + "reward": 0.625, + "reward_std": 0.5280382037162781, + "rewards/reasoning_reward/mean": 0.625, + "rewards/reasoning_reward/std": 0.5943574905395508, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 190.75, + "completions/mean_terminated_length": 190.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.9255429162357808, + "grad_norm": 1.774495070242477, + "kl": 0.083984375, + "learning_rate": 1.3995742915585806e-08, + "loss": 0.0034, + "num_tokens": 73203993.0, + "reward": 1.5208333730697632, + "reward_std": 0.13908717036247253, + "rewards/reasoning_reward/mean": 1.5208333730697632, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 174.7916717529297, + "completions/mean_terminated_length": 174.7916717529297, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.9265770423991727, + "grad_norm": 1.9576728453570886, + "kl": 0.0517578125, + "learning_rate": 1.3616662354591356e-08, + "loss": 0.0021, + "num_tokens": 73284876.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 186.625, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.9276111685625646, + "grad_norm": 3.2466683150458957, + "kl": 0.07373046875, + "learning_rate": 1.3242715429975515e-08, + "loss": 0.003, + "num_tokens": 73365827.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.7019641399383545, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 160.9166717529297, + "completions/mean_terminated_length": 160.9166717529297, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.9286452947259566, + "grad_norm": 3.1708164450347276, + "kl": 0.09814453125, + "learning_rate": 1.2873906088640474e-08, + "loss": 0.0039, + "num_tokens": 73443857.0, + "reward": 0.7708333730697632, + "reward_std": 0.3116035461425781, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.46576645970344543, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 172.25, + "completions/mean_terminated_length": 172.25, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.9296794208893485, + "grad_norm": 2.0259653414483583, + "kl": 0.052001953125, + "learning_rate": 1.251023822326308e-08, + "loss": 0.0021, + "num_tokens": 73524719.0, + "reward": 0.375, + "reward_std": 0.07715167850255966, + "rewards/reasoning_reward/mean": 0.375, + "rewards/reasoning_reward/std": 0.5565811395645142, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 161.5416717529297, + "completions/mean_terminated_length": 161.5416717529297, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.9307135470527405, + "grad_norm": 3.3236866100891342, + "kl": 0.051025390625, + "learning_rate": 1.2151715672252983e-08, + "loss": 0.002, + "num_tokens": 73601612.0, + "reward": 0.9166666865348816, + "reward_std": 0.30416232347488403, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 147.6666717529297, + "completions/mean_terminated_length": 147.6666717529297, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.9317476732161324, + "grad_norm": 2.423969131690449, + "kl": 0.07470703125, + "learning_rate": 1.179834221971282e-08, + "loss": 0.003, + "num_tokens": 73686812.0, + "reward": 1.2916667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.2916666269302368, + "rewards/reasoning_reward/std": 0.550032913684845, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 140.1666717529297, + "completions/mean_terminated_length": 140.1666717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9327817993795243, + "grad_norm": 3.842346880234536, + "kl": 0.0615234375, + "learning_rate": 1.145012159539771e-08, + "loss": 0.0025, + "num_tokens": 73762864.0, + "reward": 0.6041666865348816, + "reward_std": 0.4130779504776001, + "rewards/reasoning_reward/mean": 0.6041666865348816, + "rewards/reasoning_reward/std": 0.6753286719322205, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 142.08334350585938, + "completions/mean_terminated_length": 142.08334350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.9338159255429163, + "grad_norm": 3.166928125853971, + "kl": 0.10595703125, + "learning_rate": 1.110705747467644e-08, + "loss": 0.0043, + "num_tokens": 73844906.0, + "reward": 0.9652777910232544, + "reward_std": 0.2400643527507782, + "rewards/reasoning_reward/mean": 0.9652777314186096, + "rewards/reasoning_reward/std": 0.7420743107795715, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 163.1666717529297, + "completions/mean_terminated_length": 163.1666717529297, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9348500517063082, + "grad_norm": 4.500549858418397, + "kl": 0.1015625, + "learning_rate": 1.076915347849211e-08, + "loss": 0.0041, + "num_tokens": 73922094.0, + "reward": 1.0, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.2948839068412781, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 142.6666717529297, + "completions/mean_terminated_length": 142.6666717529297, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9358841778697001, + "grad_norm": 4.169667991174683, + "kl": 0.10302734375, + "learning_rate": 1.0436413173324387e-08, + "loss": 0.0041, + "num_tokens": 74015702.0, + "reward": 1.5416667461395264, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.5416666269302368, + "rewards/reasoning_reward/std": 0.6412736773490906, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 170.58334350585938, + "completions/mean_terminated_length": 170.58334350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9369183040330921, + "grad_norm": 4.512863571654002, + "kl": 0.07861328125, + "learning_rate": 1.0108840071151648e-08, + "loss": 0.0031, + "num_tokens": 74091884.0, + "reward": 1.0625, + "reward_std": 0.6556500792503357, + "rewards/reasoning_reward/mean": 1.0625, + "rewards/reasoning_reward/std": 0.6964584589004517, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.937952430196484, + "grad_norm": 2.964181511090769, + "kl": 0.09423828125, + "learning_rate": 9.786437629413669e-09, + "loss": 0.0038, + "num_tokens": 74174724.0, + "reward": 1.4166667461395264, + "reward_std": 0.1346571445465088, + "rewards/reasoning_reward/mean": 1.4166666269302368, + "rewards/reasoning_reward/std": 0.39927470684051514, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.9389865563598759, + "grad_norm": 0.17194578664988994, + "kl": 0.06396484375, + "learning_rate": 9.469209250975774e-09, + "loss": 0.0026, + "num_tokens": 74255520.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 164.5, + "completions/mean_terminated_length": 164.5, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9400206825232679, + "grad_norm": 3.278300391654624, + "kl": 0.055419921875, + "learning_rate": 9.157158284092248e-09, + "loss": 0.0022, + "num_tokens": 74340668.0, + "reward": 0.7916666865348816, + "reward_std": 0.4082186222076416, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 153.5, + "completions/mean_terminated_length": 153.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.9410548086866598, + "grad_norm": 3.3566487108649325, + "kl": 0.055908203125, + "learning_rate": 8.850288022371478e-09, + "loss": 0.0022, + "num_tokens": 74419544.0, + "reward": 0.7916666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148510992527008, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 123.66667175292969, + "completions/mean_terminated_length": 123.66667175292969, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.9420889348500517, + "grad_norm": 2.6002648351979984, + "kl": 0.0478515625, + "learning_rate": 8.548601704740754e-09, + "loss": 0.0019, + "num_tokens": 74500136.0, + "reward": 0.7916666865348816, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.4148511290550232, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 149.875, + "completions/mean_terminated_length": 149.875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9431230610134437, + "grad_norm": 3.0400992365987864, + "kl": 0.044189453125, + "learning_rate": 8.25210251541264e-09, + "loss": 0.0018, + "num_tokens": 74576941.0, + "reward": 0.9583333730697632, + "reward_std": 0.29602527618408203, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.9441571871768356, + "grad_norm": 3.3459982295899047, + "kl": 0.0830078125, + "learning_rate": 7.960793583850767e-09, + "loss": 0.0033, + "num_tokens": 74667907.0, + "reward": 1.3194445371627808, + "reward_std": 0.38191652297973633, + "rewards/reasoning_reward/mean": 1.3194445371627808, + "rewards/reasoning_reward/std": 0.5580258965492249, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 147.20834350585938, + "completions/mean_terminated_length": 147.20834350585938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.9451913133402275, + "grad_norm": 3.428054229956691, + "kl": 0.08642578125, + "learning_rate": 7.674677984737255e-09, + "loss": 0.0035, + "num_tokens": 74750248.0, + "reward": 1.1666667461395264, + "reward_std": 0.3535081148147583, + "rewards/reasoning_reward/mean": 1.1666666269302368, + "rewards/reasoning_reward/std": 0.4340573847293854, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 154.20834350585938, + "completions/mean_terminated_length": 154.20834350585938, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.9462254395036195, + "grad_norm": 1.9872121372484721, + "kl": 0.042724609375, + "learning_rate": 7.393758737940126e-09, + "loss": 0.0017, + "num_tokens": 74829717.0, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 0.9583333134651184, + "rewards/reasoning_reward/std": 0.20412415266036987, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 132.0, + "completions/mean_terminated_length": 132.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.9472595656670114, + "grad_norm": 0.19738482494940646, + "kl": 0.0498046875, + "learning_rate": 7.1180388084811635e-09, + "loss": 0.002, + "num_tokens": 74909541.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 0.6666666865348816, + "rewards/reasoning_reward/std": 0.4815434217453003, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 156.08334350585938, + "completions/mean_terminated_length": 156.08334350585938, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9482936918304034, + "grad_norm": 3.2992755635594575, + "kl": 0.07373046875, + "learning_rate": 6.847521106505105e-09, + "loss": 0.0029, + "num_tokens": 74987655.0, + "reward": 1.277777910232544, + "reward_std": 0.33984312415122986, + "rewards/reasoning_reward/mean": 1.2777777910232544, + "rewards/reasoning_reward/std": 0.478187620639801, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 158.75, + "completions/mean_terminated_length": 158.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.9493278179937953, + "grad_norm": 4.365967227862626, + "kl": 0.07666015625, + "learning_rate": 6.582208487248497e-09, + "loss": 0.0031, + "num_tokens": 75070441.0, + "reward": 1.3333333730697632, + "reward_std": 0.24043500423431396, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.5582062602043152, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 170.4166717529297, + "completions/mean_terminated_length": 170.4166717529297, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.9503619441571872, + "grad_norm": 2.937333755573114, + "kl": 0.07470703125, + "learning_rate": 6.322103751009833e-09, + "loss": 0.003, + "num_tokens": 75154867.0, + "reward": 1.1041667461395264, + "reward_std": 0.2644323706626892, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.642332136631012, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.9513960703205792, + "grad_norm": 2.975341812717674, + "kl": 0.06591796875, + "learning_rate": 6.067209643119908e-09, + "loss": 0.0026, + "num_tokens": 75245144.0, + "reward": 1.2569444179534912, + "reward_std": 0.3012464940547943, + "rewards/reasoning_reward/mean": 1.2569444179534912, + "rewards/reasoning_reward/std": 0.8708637952804565, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 160.20834350585938, + "completions/mean_terminated_length": 160.20834350585938, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.9524301964839711, + "grad_norm": 4.142950589572342, + "kl": 0.06640625, + "learning_rate": 5.817528853912735e-09, + "loss": 0.0027, + "num_tokens": 75324685.0, + "reward": 0.5416666865348816, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 170.58334350585938, + "completions/mean_terminated_length": 170.58334350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.953464322647363, + "grad_norm": 3.416223841212196, + "kl": 0.0537109375, + "learning_rate": 5.573064018697393e-09, + "loss": 0.0021, + "num_tokens": 75404139.0, + "reward": 0.5416666865348816, + "reward_std": 0.3268197476863861, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 187.6666717529297, + "completions/mean_terminated_length": 187.6666717529297, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.954498448810755, + "grad_norm": 2.3838014741617886, + "kl": 0.07275390625, + "learning_rate": 5.333817717729894e-09, + "loss": 0.0029, + "num_tokens": 75481619.0, + "reward": 1.625, + "reward_std": 0.07715167850255966, + "rewards/reasoning_reward/mean": 1.625, + "rewards/reasoning_reward/std": 0.47204458713531494, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 132.7916717529297, + "completions/mean_terminated_length": 132.7916717529297, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.9555325749741469, + "grad_norm": 4.457264325946904, + "kl": 0.08251953125, + "learning_rate": 5.099792476186249e-09, + "loss": 0.0033, + "num_tokens": 75563334.0, + "reward": 1.125, + "reward_std": 0.47920867800712585, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.5160468220710754, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 170.20834350585938, + "completions/mean_terminated_length": 170.20834350585938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.9565667011375388, + "grad_norm": 5.647062715557137, + "kl": 0.09619140625, + "learning_rate": 4.870990764135552e-09, + "loss": 0.0039, + "num_tokens": 75647859.0, + "reward": 0.75, + "reward_std": 0.510651707649231, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.5316095352172852, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 126.04167175292969, + "completions/mean_terminated_length": 126.04167175292969, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.9576008273009308, + "grad_norm": 3.442181606982195, + "kl": 0.055908203125, + "learning_rate": 4.647414996514276e-09, + "loss": 0.0022, + "num_tokens": 75723900.0, + "reward": 1.3958333730697632, + "reward_std": 0.23144195973873138, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.4164854884147644, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 137.83334350585938, + "completions/mean_terminated_length": 137.83334350585938, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.9586349534643226, + "grad_norm": 2.5733431583919484, + "kl": 0.0498046875, + "learning_rate": 4.429067533100294e-09, + "loss": 0.002, + "num_tokens": 75802360.0, + "reward": 0.7916666865348816, + "reward_std": 0.3053751587867737, + "rewards/reasoning_reward/mean": 0.7916666865348816, + "rewards/reasoning_reward/std": 0.7790276408195496, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 166.9166717529297, + "completions/mean_terminated_length": 166.9166717529297, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9596690796277145, + "grad_norm": 3.220240618399756, + "kl": 0.08544921875, + "learning_rate": 4.2159506784884e-09, + "loss": 0.0034, + "num_tokens": 75885278.0, + "reward": 1.3125, + "reward_std": 0.22466278076171875, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.4848240315914154, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 174.08334350585938, + "completions/mean_terminated_length": 174.08334350585938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.9607032057911065, + "grad_norm": 3.6058446603983922, + "kl": 0.061279296875, + "learning_rate": 4.0080666820657135e-09, + "loss": 0.0024, + "num_tokens": 75970624.0, + "reward": 0.9548611044883728, + "reward_std": 0.18729600310325623, + "rewards/reasoning_reward/mean": 0.9548611044883728, + "rewards/reasoning_reward/std": 0.6269795298576355, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 173.5, + "completions/mean_terminated_length": 173.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.9617373319544984, + "grad_norm": 2.476675970867201, + "kl": 0.041748046875, + "learning_rate": 3.805417737988148e-09, + "loss": 0.0017, + "num_tokens": 76049588.0, + "reward": 0.5416666865348816, + "reward_std": 0.19416078925132751, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.5299029350280762, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 146.7916717529297, + "completions/mean_terminated_length": 146.7916717529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.9627714581178903, + "grad_norm": 3.988164632708771, + "kl": 0.12158203125, + "learning_rate": 3.6080059851570366e-09, + "loss": 0.0049, + "num_tokens": 76137775.0, + "reward": 1.375, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.375, + "rewards/reasoning_reward/std": 0.494535356760025, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 159.5416717529297, + "completions/mean_terminated_length": 159.5416717529297, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.9638055842812823, + "grad_norm": 1.9405645993516027, + "kl": 0.04052734375, + "learning_rate": 3.415833507196764e-09, + "loss": 0.0016, + "num_tokens": 76215860.0, + "reward": 0.5833333730697632, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.5833333134651184, + "rewards/reasoning_reward/std": 0.5036101937294006, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 177.08334350585938, + "completions/mean_terminated_length": 168.68182373046875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.9648397104446742, + "grad_norm": 1.732646593438274, + "kl": 0.045654296875, + "learning_rate": 3.2289023324325592e-09, + "loss": 0.0018, + "num_tokens": 76291502.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 150.2916717529297, + "completions/mean_terminated_length": 150.2916717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9658738366080661, + "grad_norm": 2.1551686750168, + "kl": 0.045166015625, + "learning_rate": 3.0472144338692386e-09, + "loss": 0.0018, + "num_tokens": 76368853.0, + "reward": 0.875, + "reward_std": 0.17251639068126678, + "rewards/reasoning_reward/mean": 0.875, + "rewards/reasoning_reward/std": 0.337831974029541, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 161.0416717529297, + "completions/mean_terminated_length": 161.0416717529297, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.9669079627714581, + "grad_norm": 4.651808954793325, + "kl": 0.064453125, + "learning_rate": 2.8707717291704405e-09, + "loss": 0.0026, + "num_tokens": 76447278.0, + "reward": 0.5416666865348816, + "reward_std": 0.42201346158981323, + "rewards/reasoning_reward/mean": 0.5416666865348816, + "rewards/reasoning_reward/std": 0.4871537983417511, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 151.70834350585938, + "completions/mean_terminated_length": 151.70834350585938, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.96794208893485, + "grad_norm": 0.17710222219951927, + "kl": 0.05078125, + "learning_rate": 2.6995760806381994e-09, + "loss": 0.002, + "num_tokens": 76526415.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 163.2916717529297, + "completions/mean_terminated_length": 163.2916717529297, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.9689762150982419, + "grad_norm": 4.476052750730717, + "kl": 0.10498046875, + "learning_rate": 2.5336292951933513e-09, + "loss": 0.0042, + "num_tokens": 76603662.0, + "reward": 1.1458333730697632, + "reward_std": 0.4981882572174072, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.5800893306732178, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 178.0416717529297, + "completions/mean_terminated_length": 178.0416717529297, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.9700103412616339, + "grad_norm": 3.9771878211904466, + "kl": 0.0654296875, + "learning_rate": 2.372933124356602e-09, + "loss": 0.0026, + "num_tokens": 76681175.0, + "reward": 0.8541666865348816, + "reward_std": 0.5118739604949951, + "rewards/reasoning_reward/mean": 0.8541666865348816, + "rewards/reasoning_reward/std": 0.6507381796836853, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 125.5, + "completions/mean_terminated_length": 125.5, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9710444674250258, + "grad_norm": 0.8056357466789215, + "kl": 0.08740234375, + "learning_rate": 2.2174892642298215e-09, + "loss": 0.0035, + "num_tokens": 76766011.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.3333333730697632, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 132.83334350585938, + "completions/mean_terminated_length": 132.83334350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.9720785935884177, + "grad_norm": 2.873333533007421, + "kl": 0.076171875, + "learning_rate": 2.0672993554783356e-09, + "loss": 0.003, + "num_tokens": 76851615.0, + "reward": 1.0416667461395264, + "reward_std": 0.1178511306643486, + "rewards/reasoning_reward/mean": 1.0416666269302368, + "rewards/reasoning_reward/std": 0.20412413775920868, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 167.20834350585938, + "completions/mean_terminated_length": 167.20834350585938, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9731127197518097, + "grad_norm": 2.6371055405899173, + "kl": 0.060546875, + "learning_rate": 1.9223649833133847e-09, + "loss": 0.0024, + "num_tokens": 76931660.0, + "reward": 0.6527777910232544, + "reward_std": 0.35072192549705505, + "rewards/reasoning_reward/mean": 0.6527777314186096, + "rewards/reasoning_reward/std": 0.533687949180603, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 171.375, + "completions/mean_terminated_length": 171.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.9741468459152016, + "grad_norm": 4.0770969322540385, + "kl": 0.10791015625, + "learning_rate": 1.782687677475747e-09, + "loss": 0.0043, + "num_tokens": 77014581.0, + "reward": 1.0347223281860352, + "reward_std": 0.5191335678100586, + "rewards/reasoning_reward/mean": 1.0347222089767456, + "rewards/reasoning_reward/std": 0.6862682700157166, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 136.20834350585938, + "completions/mean_terminated_length": 136.20834350585938, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.9751809720785936, + "grad_norm": 3.5475163041115847, + "kl": 0.091796875, + "learning_rate": 1.6482689122191418e-09, + "loss": 0.0037, + "num_tokens": 77096658.0, + "reward": 1.0208333730697632, + "reward_std": 0.24056154489517212, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.6672325730323792, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 164.625, + "completions/mean_terminated_length": 164.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.9762150982419855, + "grad_norm": 2.9003959757657087, + "kl": 0.05078125, + "learning_rate": 1.5191101062950186e-09, + "loss": 0.002, + "num_tokens": 77177521.0, + "reward": 0.9791666865348816, + "reward_std": 0.2041093111038208, + "rewards/reasoning_reward/mean": 0.9791666865348816, + "rewards/reasoning_reward/std": 0.7442411780357361, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.9772492244053774, + "grad_norm": 2.6967137886335943, + "kl": 0.052490234375, + "learning_rate": 1.3952126229375693e-09, + "loss": 0.0021, + "num_tokens": 77257004.0, + "reward": 0.75, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.75, + "rewards/reasoning_reward/std": 0.5316095352172852, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 144.4166717529297, + "completions/mean_terminated_length": 144.4166717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.9782833505687694, + "grad_norm": 3.0293411969549453, + "kl": 0.055908203125, + "learning_rate": 1.2765777698490188e-09, + "loss": 0.0022, + "num_tokens": 77339542.0, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.9166666865348816, + "rewards/reasoning_reward/std": 0.28232985734939575, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 161.9166717529297, + "completions/mean_terminated_length": 161.9166717529297, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9793174767321613, + "grad_norm": 3.148361986897516, + "kl": 0.044677734375, + "learning_rate": 1.163206799186245e-09, + "loss": 0.0018, + "num_tokens": 77423660.0, + "reward": 1.2083333730697632, + "reward_std": 0.2721545100212097, + "rewards/reasoning_reward/mean": 1.2083333730697632, + "rewards/reasoning_reward/std": 0.5089773535728455, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 134.20834350585938, + "completions/mean_terminated_length": 134.20834350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.9803516028955532, + "grad_norm": 2.1261463422153004, + "kl": 0.056396484375, + "learning_rate": 1.0551009075471795e-09, + "loss": 0.0023, + "num_tokens": 77509713.0, + "reward": 1.25, + "reward_std": 0.15430335700511932, + "rewards/reasoning_reward/mean": 1.25, + "rewards/reasoning_reward/std": 0.4423258602619171, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 122.29167175292969, + "completions/mean_terminated_length": 122.29167175292969, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9813857290589452, + "grad_norm": 2.9707407010412505, + "kl": 0.0703125, + "learning_rate": 9.522612359585402e-10, + "loss": 0.0028, + "num_tokens": 77594632.0, + "reward": 1.1458333730697632, + "reward_std": 0.16517187654972076, + "rewards/reasoning_reward/mean": 1.1458333730697632, + "rewards/reasoning_reward/std": 0.7144345045089722, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 134.45834350585938, + "completions/mean_terminated_length": 134.45834350585938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.9824198552223371, + "grad_norm": 3.7149348047540585, + "kl": 0.060302734375, + "learning_rate": 8.546888698634513e-10, + "loss": 0.0024, + "num_tokens": 77671787.0, + "reward": 1.4375, + "reward_std": 0.24232356250286102, + "rewards/reasoning_reward/mean": 1.4375, + "rewards/reasoning_reward/std": 0.47348156571388245, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.983453981385729, + "grad_norm": 3.0064934874302542, + "kl": 0.06640625, + "learning_rate": 7.623848391102305e-10, + "loss": 0.0027, + "num_tokens": 77750801.0, + "reward": 0.8333333730697632, + "reward_std": 0.30860671401023865, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 147.4166717529297, + "completions/mean_terminated_length": 147.4166717529297, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.984488107549121, + "grad_norm": 4.044111150534412, + "kl": 0.0732421875, + "learning_rate": 6.753501179413423e-10, + "loss": 0.0029, + "num_tokens": 77836395.0, + "reward": 1.125, + "reward_std": 0.36751919984817505, + "rewards/reasoning_reward/mean": 1.125, + "rewards/reasoning_reward/std": 0.6634888052940369, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 146.0, + "completions/mean_terminated_length": 146.0, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.9855222337125129, + "grad_norm": 3.045502458871555, + "kl": 0.0859375, + "learning_rate": 5.935856249833504e-10, + "loss": 0.0034, + "num_tokens": 77925307.0, + "reward": 1.3541667461395264, + "reward_std": 0.33324795961380005, + "rewards/reasoning_reward/mean": 1.3541666269302368, + "rewards/reasoning_reward/std": 0.47729232907295227, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 154.70834350585938, + "completions/mean_terminated_length": 154.70834350585938, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.9865563598759048, + "grad_norm": 0.2945822889932008, + "kl": 0.052978515625, + "learning_rate": 5.170922232369257e-10, + "loss": 0.0021, + "num_tokens": 78002764.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 130.75, + "completions/mean_terminated_length": 130.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.9875904860392968, + "grad_norm": 0.1833079894818197, + "kl": 0.046630859375, + "learning_rate": 4.4587072006796455e-10, + "loss": 0.0019, + "num_tokens": 78081614.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 152.75, + "completions/mean_terminated_length": 152.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.9886246122026887, + "grad_norm": 3.4542538672410337, + "kl": 0.051513671875, + "learning_rate": 3.7992186719892907e-10, + "loss": 0.0021, + "num_tokens": 78160656.0, + "reward": 0.3333333432674408, + "reward_std": 0.2357022613286972, + "rewards/reasoning_reward/mean": 0.3333333432674408, + "rewards/reasoning_reward/std": 0.4815433919429779, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 182.625, + "completions/mean_terminated_length": 182.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.9896587383660806, + "grad_norm": 3.6021539388001673, + "kl": 0.10205078125, + "learning_rate": 3.1924636070107535e-10, + "loss": 0.0041, + "num_tokens": 78238399.0, + "reward": 0.7708333730697632, + "reward_std": 0.31726133823394775, + "rewards/reasoning_reward/mean": 0.7708333134651184, + "rewards/reasoning_reward/std": 0.416485458612442, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 159.9166717529297, + "completions/mean_terminated_length": 159.9166717529297, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.9906928645294726, + "grad_norm": 3.4288036998953846, + "kl": 0.0771484375, + "learning_rate": 2.6384484098690427e-10, + "loss": 0.0031, + "num_tokens": 78317653.0, + "reward": 1.3958333730697632, + "reward_std": 0.29116004705429077, + "rewards/reasoning_reward/mean": 1.3958333730697632, + "rewards/reasoning_reward/std": 0.4657664895057678, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 157.6666717529297, + "completions/mean_terminated_length": 157.6666717529297, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9917269906928645, + "grad_norm": 3.8235718441583564, + "kl": 0.09716796875, + "learning_rate": 2.1371789280355547e-10, + "loss": 0.0039, + "num_tokens": 78400205.0, + "reward": 1.1041667461395264, + "reward_std": 0.3190067708492279, + "rewards/reasoning_reward/mean": 1.1041666269302368, + "rewards/reasoning_reward/std": 0.8072924017906189, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 145.7916717529297, + "completions/mean_terminated_length": 145.7916717529297, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.9927611168562565, + "grad_norm": 3.9856327424940687, + "kl": 0.083984375, + "learning_rate": 1.6886604522659e-10, + "loss": 0.0034, + "num_tokens": 78481128.0, + "reward": 1.0, + "reward_std": 0.5582748055458069, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.7939992547035217, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 163.9166717529297, + "completions/mean_terminated_length": 163.9166717529297, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.9937952430196484, + "grad_norm": 0.2081115188693481, + "kl": 0.06591796875, + "learning_rate": 1.292897716542729e-10, + "loss": 0.0026, + "num_tokens": 78560702.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reasoning_reward/mean": 1.0, + "rewards/reasoning_reward/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 168.70834350585938, + "completions/mean_terminated_length": 168.70834350585938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.9948293691830403, + "grad_norm": 2.780255312640307, + "kl": 0.062255859375, + "learning_rate": 9.498948980291021e-11, + "loss": 0.0025, + "num_tokens": 78643983.0, + "reward": 1.3125, + "reward_std": 0.20665977895259857, + "rewards/reasoning_reward/mean": 1.3125, + "rewards/reasoning_reward/std": 0.355469673871994, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 149.75, + "completions/mean_terminated_length": 149.75, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.9958634953464323, + "grad_norm": 3.0502524409622622, + "kl": 0.046142578125, + "learning_rate": 6.59655617020749e-11, + "loss": 0.0018, + "num_tokens": 78724921.0, + "reward": 1.0208333730697632, + "reward_std": 0.2965203523635864, + "rewards/reasoning_reward/mean": 1.0208333730697632, + "rewards/reasoning_reward/std": 0.6507381200790405, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 132.08334350585938, + "completions/mean_terminated_length": 132.08334350585938, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.9968976215098242, + "grad_norm": 2.2883893243678246, + "kl": 0.10888671875, + "learning_rate": 4.221829369094321e-11, + "loss": 0.0043, + "num_tokens": 78809019.0, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/reasoning_reward/mean": 0.8333333134651184, + "rewards/reasoning_reward/std": 0.3806934952735901, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 140.95834350585938, + "completions/mean_terminated_length": 140.95834350585938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.9979317476732161, + "grad_norm": 3.6433341964307564, + "kl": 0.08447265625, + "learning_rate": 2.374793641518602e-11, + "loss": 0.0034, + "num_tokens": 78883898.0, + "reward": 0.7013888359069824, + "reward_std": 0.3162800669670105, + "rewards/reasoning_reward/mean": 0.7013888359069824, + "rewards/reasoning_reward/std": 0.7222802639007568, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 190.4499969482422, + "completions/mean_terminated_length": 190.4499969482422, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.9989658738366081, + "grad_norm": 2.642107476748316, + "kl": 0.05419921875, + "learning_rate": 1.0554684824137794e-11, + "loss": 0.0024, + "num_tokens": 78951363.0, + "reward": 1.0833333730697632, + "reward_std": 0.2903675436973572, + "rewards/reasoning_reward/mean": 1.0833333730697632, + "rewards/reasoning_reward/std": 0.7172815203666687, + "step": 966 + }, + { + "epoch": 0.9989658738366081, + "step": 966, + "total_flos": 0.0, + "train_loss": 0.002488073633344592, + "train_runtime": 159825.8329, + "train_samples_per_second": 0.018, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1.0, + "max_steps": 967, + "num_input_tokens_seen": 78951363, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}