diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2547121752419766, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 321.578125, + "epoch": 0.0005094243504839531, + "grad_norm": 21.497011168465292, + "kl": 0.0, + "learning_rate": 9.997452878247579e-07, + "loss": -0.0, + "reward": -0.492842435836792, + "reward_std": 0.7784243226051331, + "rewards/accuracy_reward": -0.4125000238418579, + "rewards/cosine_rewards": -0.08018936403095722, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -0.0001530575300421333, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.796875, + "epoch": 0.0010188487009679063, + "grad_norm": 8.570878529351686, + "kl": 0.00115203857421875, + "learning_rate": 9.99490575649516e-07, + "loss": 0.0, + "reward": -0.2021125927567482, + "reward_std": 0.686398446559906, + "rewards/accuracy_reward": -0.18437501601874828, + "rewards/cosine_rewards": -0.01752197090536356, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -0.00021561131143243983, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.640625, + "epoch": 0.0015282730514518594, + "grad_norm": 7.698910727869972, + "kl": 0.0014190673828125, + "learning_rate": 9.99235863474274e-07, + "loss": 0.0001, + "reward": -0.6304773092269897, + "reward_std": 0.5950716435909271, + "rewards/accuracy_reward": -0.6093750298023224, + "rewards/cosine_rewards": -0.03664374351501465, + "rewards/format_reward": 0.015625, + "rewards/repetition_rewards": -8.355615136679262e-05, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.765625, + "epoch": 0.0020376974019358125, + "grad_norm": 8.264023776311538, + "kl": 0.00258636474609375, + "learning_rate": 9.98981151299032e-07, + "loss": 0.0001, + "reward": -0.4020528346300125, + "reward_std": 0.7227448225021362, + "rewards/accuracy_reward": -0.38750001788139343, + "rewards/cosine_rewards": -0.014348747674375772, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -0.00020408956333994865, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.953125, + "epoch": 0.0025471217524197657, + "grad_norm": 9.41735274952485, + "kl": 0.00286865234375, + "learning_rate": 9.9872643912379e-07, + "loss": 0.0001, + "reward": -0.45950669050216675, + "reward_std": 0.6219092607498169, + "rewards/accuracy_reward": -0.4343750476837158, + "rewards/cosine_rewards": -0.02503613755106926, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -9.553764903103001e-05, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.9375, + "epoch": 0.003056546102903719, + "grad_norm": 12.944765767909546, + "kl": 0.008697509765625, + "learning_rate": 9.984717269485481e-07, + "loss": 0.0003, + "reward": -0.42242346704006195, + "reward_std": 0.6794147342443466, + "rewards/accuracy_reward": -0.40937504172325134, + "rewards/cosine_rewards": -0.028209966607391834, + "rewards/format_reward": 0.015625, + "rewards/repetition_rewards": -0.0004634863289538771, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.859375, + "epoch": 0.003565970453387672, + "grad_norm": 10.259430825273313, + "kl": 0.013763427734375, + "learning_rate": 9.98217014773306e-07, + "loss": 0.0005, + "reward": -0.33318234980106354, + "reward_std": 0.7437820434570312, + "rewards/accuracy_reward": -0.35625000298023224, + "rewards/cosine_rewards": -0.00804880098439753, + "rewards/format_reward": 0.03125, + "rewards/repetition_rewards": -0.00013354701513890177, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0625, + "epoch": 0.004075394803871625, + "grad_norm": 8.664940595308508, + "kl": 0.01800537109375, + "learning_rate": 9.979623025980642e-07, + "loss": 0.0007, + "reward": -0.3353596553206444, + "reward_std": 0.7424190640449524, + "rewards/accuracy_reward": -0.32500002533197403, + "rewards/cosine_rewards": -0.010187382809817791, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -0.00017226976342499256, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.609375, + "epoch": 0.004584819154355578, + "grad_norm": 12.906962146752678, + "kl": 0.013641357421875, + "learning_rate": 9.977075904228221e-07, + "loss": 0.0005, + "reward": -0.5576262176036835, + "reward_std": 0.38936011493206024, + "rewards/accuracy_reward": -0.546875, + "rewards/cosine_rewards": -0.010545612312853336, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": -0.00020559210679493845, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.005094243504839531, + "grad_norm": 12.675664846435772, + "kl": 0.014129638671875, + "learning_rate": 9.974528782475803e-07, + "loss": 0.0006, + "reward": -0.5825353264808655, + "reward_std": 0.32141495356336236, + "rewards/accuracy_reward": -0.5750000178813934, + "rewards/cosine_rewards": -0.0075353041756898165, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.0056036678553234845, + "grad_norm": 83.14378275688269, + "kl": 0.011932373046875, + "learning_rate": 9.971981660723382e-07, + "loss": 0.0005, + "reward": -0.4973638355731964, + "reward_std": 0.6479763090610504, + "rewards/accuracy_reward": -0.4906250536441803, + "rewards/cosine_rewards": -0.006738818949088454, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.006113092205807438, + "grad_norm": 10.015051037156322, + "kl": 0.01776123046875, + "learning_rate": 9.969434538970963e-07, + "loss": 0.0007, + "reward": -0.5842953324317932, + "reward_std": 0.3923248201608658, + "rewards/accuracy_reward": -0.5750000029802322, + "rewards/cosine_rewards": -0.009295305702835321, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.984375, + "epoch": 0.006622516556291391, + "grad_norm": 11.394446741932766, + "kl": 0.018157958984375, + "learning_rate": 9.966887417218542e-07, + "loss": 0.0007, + "reward": -0.5545713007450104, + "reward_std": 0.5603736639022827, + "rewards/accuracy_reward": -0.5468750298023224, + "rewards/cosine_rewards": -0.007696274435147643, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.007131940906775344, + "grad_norm": 11.774338615514537, + "kl": 0.017730712890625, + "learning_rate": 9.964340295466124e-07, + "loss": 0.0007, + "reward": -0.24103393778204918, + "reward_std": 0.770084798336029, + "rewards/accuracy_reward": -0.23750002682209015, + "rewards/cosine_rewards": -0.0035339330206625164, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5, + "epoch": 0.007641365257259297, + "grad_norm": 12.461454822945, + "kl": 0.01995849609375, + "learning_rate": 9.961793173713703e-07, + "loss": 0.0008, + "reward": -0.7055607736110687, + "reward_std": 0.2303236834704876, + "rewards/accuracy_reward": -0.7156250178813934, + "rewards/cosine_rewards": -0.005560769001021981, + "rewards/format_reward": 0.015625, + "rewards/repetition_rewards": 0.0, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.53125, + "epoch": 0.00815078960774325, + "grad_norm": 16.951183982865736, + "kl": 0.0206298828125, + "learning_rate": 9.959246051961282e-07, + "loss": 0.0008, + "reward": -0.3540929928421974, + "reward_std": 0.7245323657989502, + "rewards/accuracy_reward": -0.3500000238418579, + "rewards/cosine_rewards": -0.004092983668670058, + "rewards/format_reward": 0.0, + "rewards/repetition_rewards": 0.0, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.578125, + "epoch": 0.008660213958227204, + "grad_norm": 9.458837096460513, + "kl": 0.025634765625, + "learning_rate": 9.956698930208864e-07, + "loss": 0.001, + "reward": -0.36599001288414, + "reward_std": 0.6569808125495911, + "rewards/accuracy_reward": -0.37812502682209015, + "rewards/cosine_rewards": -0.003489995375275612, + "rewards/format_reward": 0.015625, + "rewards/repetition_rewards": 0.0, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.625, + "epoch": 0.009169638308711156, + "grad_norm": 11.937426620152417, + "kl": 0.02801513671875, + "learning_rate": 9.954151808456443e-07, + "loss": 0.0011, + "reward": -0.40710097551345825, + "reward_std": 0.7412720322608948, + "rewards/accuracy_reward": -0.43437501788139343, + "rewards/cosine_rewards": -0.003975986503064632, + "rewards/format_reward": 0.03125, + "rewards/repetition_rewards": 0.0, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.21875, + "epoch": 0.00967906265919511, + "grad_norm": 12.317962197180934, + "kl": 0.03466796875, + "learning_rate": 9.951604686704024e-07, + "loss": 0.0014, + "reward": -0.25013431906700134, + "reward_std": 0.7123757898807526, + "rewards/accuracy_reward": -0.32500001788139343, + "rewards/cosine_rewards": -0.003259307239204645, + "rewards/format_reward": 0.078125, + "rewards/repetition_rewards": 0.0, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.109375, + "epoch": 0.010188487009679063, + "grad_norm": 24.27751022595849, + "kl": 0.037109375, + "learning_rate": 9.949057564951603e-07, + "loss": 0.0015, + "reward": -0.2632312625646591, + "reward_std": 0.6930468529462814, + "rewards/accuracy_reward": -0.4624999985098839, + "rewards/cosine_rewards": -0.003856247873045504, + "rewards/format_reward": 0.203125, + "rewards/repetition_rewards": 0.0, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.03125, + "epoch": 0.010697911360163017, + "grad_norm": 12.508736780907405, + "kl": 0.053955078125, + "learning_rate": 9.946510443199185e-07, + "loss": 0.0022, + "reward": -0.010567170567810535, + "reward_std": 0.7874742448329926, + "rewards/accuracy_reward": -0.4125000238418579, + "rewards/cosine_rewards": -0.004317150334827602, + "rewards/format_reward": 0.40625, + "rewards/repetition_rewards": 0.0, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.390625, + "epoch": 0.011207335710646969, + "grad_norm": 10.983519481785477, + "kl": 0.073974609375, + "learning_rate": 9.943963321446764e-07, + "loss": 0.003, + "reward": 0.5529356598854065, + "reward_std": 0.9540310502052307, + "rewards/accuracy_reward": -0.2093750163912773, + "rewards/cosine_rewards": -0.003314302652142942, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": 0.0, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.796875, + "epoch": 0.011716760061130923, + "grad_norm": 59.13650095831239, + "kl": 0.084716796875, + "learning_rate": 9.941416199694345e-07, + "loss": 0.0034, + "reward": 0.49799469113349915, + "reward_std": 0.6547213792800903, + "rewards/accuracy_reward": -0.43437501788139343, + "rewards/cosine_rewards": -0.005130313569679856, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.34375, + "epoch": 0.012226184411614875, + "grad_norm": 33.85321217925331, + "kl": 0.078369140625, + "learning_rate": 9.938869077941925e-07, + "loss": 0.0031, + "reward": 0.5440552532672882, + "reward_std": 0.4689805209636688, + "rewards/accuracy_reward": -0.43437501788139343, + "rewards/cosine_rewards": -0.005944762844592333, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.078125, + "epoch": 0.01273560876209883, + "grad_norm": 27.614415529764607, + "kl": 0.23876953125, + "learning_rate": 9.936321956189506e-07, + "loss": 0.0096, + "reward": 0.319291889667511, + "reward_std": 0.2991320895962417, + "rewards/accuracy_reward": -0.659375011920929, + "rewards/cosine_rewards": -0.005708091426640749, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.09375, + "epoch": 0.013245033112582781, + "grad_norm": 78.27860464199816, + "kl": 0.810546875, + "learning_rate": 9.933774834437085e-07, + "loss": 0.0324, + "reward": 0.758573591709137, + "reward_std": 0.8151377141475677, + "rewards/accuracy_reward": -0.24062500894069672, + "rewards/cosine_rewards": -0.0008013773494894849, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.75, + "epoch": 0.013754457463066735, + "grad_norm": 15.792726008582374, + "kl": 0.876953125, + "learning_rate": 9.931227712684667e-07, + "loss": 0.0351, + "reward": 0.5177058726549149, + "reward_std": 0.6054319739341736, + "rewards/accuracy_reward": -0.46562501788139343, + "rewards/cosine_rewards": -0.000942649960052222, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.000101461038866546, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.15625, + "epoch": 0.014263881813550688, + "grad_norm": 28.538085950991302, + "kl": 0.853515625, + "learning_rate": 9.928680590932246e-07, + "loss": 0.0342, + "reward": 0.30591557919979095, + "reward_std": 0.3449897766113281, + "rewards/accuracy_reward": -0.6625000238418579, + "rewards/cosine_rewards": -0.00033439824983361177, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.1875, + "epoch": 0.014773306164034642, + "grad_norm": 19.672166251005525, + "kl": 0.939453125, + "learning_rate": 9.926133469179825e-07, + "loss": 0.0375, + "reward": 0.4091247171163559, + "reward_std": 0.46140581369400024, + "rewards/accuracy_reward": -0.5750000476837158, + "rewards/cosine_rewards": -0.0002502501738490537, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.171875, + "epoch": 0.015282730514518594, + "grad_norm": 35.96869326683099, + "kl": 1.416015625, + "learning_rate": 9.923586347427406e-07, + "loss": 0.0566, + "reward": 0.5554585456848145, + "reward_std": 0.7011753022670746, + "rewards/accuracy_reward": -0.3812499940395355, + "rewards/cosine_rewards": -0.0007914370798971504, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.15625, + "epoch": 0.015792154865002548, + "grad_norm": 24.745191247130563, + "kl": 1.01171875, + "learning_rate": 9.921039225674986e-07, + "loss": 0.0405, + "reward": 0.6306657046079636, + "reward_std": 0.7620185613632202, + "rewards/accuracy_reward": -0.32187502086162567, + "rewards/cosine_rewards": -0.0005842609098181129, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": 0.0, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.140625, + "epoch": 0.0163015792154865, + "grad_norm": 12.709104159306316, + "kl": 0.7265625, + "learning_rate": 9.918492103922567e-07, + "loss": 0.0291, + "reward": 0.38606902956962585, + "reward_std": 0.8792209327220917, + "rewards/accuracy_reward": -0.39375001192092896, + "rewards/cosine_rewards": -0.001430943259038031, + "rewards/format_reward": 0.78125, + "rewards/repetition_rewards": 0.0, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.03125, + "epoch": 0.016811003565970453, + "grad_norm": 17.976804747397264, + "kl": 0.904296875, + "learning_rate": 9.915944982170146e-07, + "loss": 0.0361, + "reward": 0.4996982365846634, + "reward_std": 0.7649624943733215, + "rewards/accuracy_reward": -0.4687500149011612, + "rewards/cosine_rewards": -0.00030174180574249476, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.609375, + "epoch": 0.017320427916454408, + "grad_norm": 39.58286880123024, + "kl": 0.8828125, + "learning_rate": 9.913397860417728e-07, + "loss": 0.0353, + "reward": 0.4207390695810318, + "reward_std": 0.8459209501743317, + "rewards/accuracy_reward": -0.4375000298023224, + "rewards/cosine_rewards": -0.0011358977280906402, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": 0.0, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.328125, + "epoch": 0.01782985226693836, + "grad_norm": 19.17123986238676, + "kl": 0.955078125, + "learning_rate": 9.910850738665307e-07, + "loss": 0.0383, + "reward": 0.474868506193161, + "reward_std": 0.6449769139289856, + "rewards/accuracy_reward": -0.49375005066394806, + "rewards/cosine_rewards": -0.0001314536166319158, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.328125, + "epoch": 0.018339276617422313, + "grad_norm": 23.5658227799872, + "kl": 0.95703125, + "learning_rate": 9.908303616912888e-07, + "loss": 0.0382, + "reward": 0.4700201153755188, + "reward_std": 0.7454200983047485, + "rewards/accuracy_reward": -0.41875001788139343, + "rewards/cosine_rewards": -0.0018548529915278777, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": 0.0, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.921875, + "epoch": 0.018848700967906265, + "grad_norm": 11.872294898362206, + "kl": 1.001953125, + "learning_rate": 9.905756495160467e-07, + "loss": 0.0401, + "reward": 0.5029261708259583, + "reward_std": 0.7742039263248444, + "rewards/accuracy_reward": -0.4343750327825546, + "rewards/cosine_rewards": -0.0001988118929148186, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.40625, + "epoch": 0.01935812531839022, + "grad_norm": 14.39070050703297, + "kl": 0.978515625, + "learning_rate": 9.903209373408049e-07, + "loss": 0.0391, + "reward": 0.4616774320602417, + "reward_std": 0.7915183901786804, + "rewards/accuracy_reward": -0.4125000238418579, + "rewards/cosine_rewards": -0.000822544090624433, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": 0.0, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.1875, + "epoch": 0.019867549668874173, + "grad_norm": 9.423514240680602, + "kl": 0.9375, + "learning_rate": 9.900662251655628e-07, + "loss": 0.0376, + "reward": 0.5430571883916855, + "reward_std": 0.5502887666225433, + "rewards/accuracy_reward": -0.4062500447034836, + "rewards/cosine_rewards": -0.003600762978749117, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.00021701389050576836, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.4375, + "epoch": 0.020376974019358125, + "grad_norm": 25.806979588800377, + "kl": 0.9140625, + "learning_rate": 9.89811512990321e-07, + "loss": 0.0366, + "reward": 0.503069132566452, + "reward_std": 0.6060213148593903, + "rewards/accuracy_reward": -0.4656250327825546, + "rewards/cosine_rewards": -5.582944686466362e-05, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.46875, + "epoch": 0.020886398369842078, + "grad_norm": 20.104239930601235, + "kl": 0.9296875, + "learning_rate": 9.895568008150789e-07, + "loss": 0.0372, + "reward": 0.631201758980751, + "reward_std": 0.7148115336894989, + "rewards/accuracy_reward": -0.35312502086162567, + "rewards/cosine_rewards": -4.822800292458851e-05, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.828125, + "epoch": 0.021395822720326033, + "grad_norm": 7.720832433302504, + "kl": 0.841796875, + "learning_rate": 9.89302088639837e-07, + "loss": 0.0336, + "reward": 0.5936954319477081, + "reward_std": 0.4961870163679123, + "rewards/accuracy_reward": -0.4062500298023224, + "rewards/cosine_rewards": -5.458852319861762e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.984375, + "epoch": 0.021905247070809986, + "grad_norm": 9.831243087089065, + "kl": 0.76953125, + "learning_rate": 9.89047376464595e-07, + "loss": 0.0308, + "reward": 0.6499472558498383, + "reward_std": 0.7755721807479858, + "rewards/accuracy_reward": -0.3500000238418579, + "rewards/cosine_rewards": -5.273178430797998e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.022414671421293938, + "grad_norm": 15.405185965961632, + "kl": 0.79296875, + "learning_rate": 9.88792664289353e-07, + "loss": 0.0318, + "reward": 0.8749629557132721, + "reward_std": 0.8532125055789948, + "rewards/accuracy_reward": -0.1250000149011612, + "rewards/cosine_rewards": -3.706023017002735e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.02292409577177789, + "grad_norm": 73.99173140679622, + "kl": 0.814453125, + "learning_rate": 9.88537952114111e-07, + "loss": 0.0326, + "reward": 0.8468359708786011, + "reward_std": 0.6202812939882278, + "rewards/accuracy_reward": -0.15312501415610313, + "rewards/cosine_rewards": -3.904559889633674e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.09375, + "epoch": 0.023433520122261846, + "grad_norm": 57.01467559353994, + "kl": 0.802734375, + "learning_rate": 9.882832399388691e-07, + "loss": 0.0321, + "reward": 0.7187013626098633, + "reward_std": 0.7317405939102173, + "rewards/accuracy_reward": -0.26562502793967724, + "rewards/cosine_rewards": -4.8641444664099254e-05, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.8125, + "epoch": 0.023942944472745798, + "grad_norm": 49.94239079179156, + "kl": 0.8125, + "learning_rate": 9.88028527763627e-07, + "loss": 0.0325, + "reward": 0.7904289066791534, + "reward_std": 0.6411640644073486, + "rewards/accuracy_reward": -0.2093750163912773, + "rewards/cosine_rewards": -0.00019609702576417476, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.984375, + "epoch": 0.02445236882322975, + "grad_norm": 29.421172213478044, + "kl": 0.8046875, + "learning_rate": 9.877738155883852e-07, + "loss": 0.0322, + "reward": 0.7342777252197266, + "reward_std": 0.3429698422551155, + "rewards/accuracy_reward": -0.2656249850988388, + "rewards/cosine_rewards": -9.731029422255233e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.024961793173713703, + "grad_norm": 30.98665699286148, + "kl": 0.86328125, + "learning_rate": 9.87519103413143e-07, + "loss": 0.0346, + "reward": 1.0437248945236206, + "reward_std": 0.6164620369672775, + "rewards/accuracy_reward": 0.04374997317790985, + "rewards/cosine_rewards": -2.5148013037323835e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.02547121752419766, + "grad_norm": 25.805195350433394, + "kl": 0.787109375, + "learning_rate": 9.872643912379012e-07, + "loss": 0.0315, + "reward": 0.6499470472335815, + "reward_std": 0.4753982424736023, + "rewards/accuracy_reward": -0.3500000238418579, + "rewards/cosine_rewards": -5.2943185437470675e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.02598064187468161, + "grad_norm": 60.37753917289171, + "kl": 0.865234375, + "learning_rate": 9.870096790626592e-07, + "loss": 0.0347, + "reward": 1.0999788641929626, + "reward_std": 0.716822475194931, + "rewards/accuracy_reward": 0.09999999590218067, + "rewards/cosine_rewards": -2.117727399308933e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.026490066225165563, + "grad_norm": 60.29300770886218, + "kl": 0.859375, + "learning_rate": 9.867549668874173e-07, + "loss": 0.0343, + "reward": 1.3249947428703308, + "reward_std": 0.6325759440660477, + "rewards/accuracy_reward": 0.32499997690320015, + "rewards/cosine_rewards": -5.294318725646008e-06, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.859375, + "epoch": 0.026999490575649515, + "grad_norm": 33.49731491963465, + "kl": 0.96484375, + "learning_rate": 9.865002547121752e-07, + "loss": 0.0386, + "reward": 0.6497911810874939, + "reward_std": 0.23335448652505875, + "rewards/accuracy_reward": -0.3500000163912773, + "rewards/cosine_rewards": -0.00020882973694824614, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.02750891492613347, + "grad_norm": 21.106152145400298, + "kl": 0.85546875, + "learning_rate": 9.862455425369333e-07, + "loss": 0.0342, + "reward": 1.3812487125396729, + "reward_std": 0.26327238231897354, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": -1.3235799087851774e-06, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.028018339276617423, + "grad_norm": 50.8468456202969, + "kl": 0.767578125, + "learning_rate": 9.859908303616913e-07, + "loss": 0.0307, + "reward": 1.493756651878357, + "reward_std": 0.3182205259799957, + "rewards/accuracy_reward": 0.4937499910593033, + "rewards/cosine_rewards": 6.617898179683834e-06, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.028527763627101375, + "grad_norm": 45.22229728431362, + "kl": 0.833984375, + "learning_rate": 9.857361181864494e-07, + "loss": 0.0334, + "reward": 0.9030899405479431, + "reward_std": 0.2386654019355774, + "rewards/accuracy_reward": -0.09687501937150955, + "rewards/cosine_rewards": -3.507485962472856e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.029037187977585328, + "grad_norm": 251.48941881136554, + "kl": 0.828125, + "learning_rate": 9.854814060112073e-07, + "loss": 0.0331, + "reward": 1.5781376361846924, + "reward_std": 0.3039933070540428, + "rewards/accuracy_reward": 0.5781249701976776, + "rewards/cosine_rewards": 1.2574006632348755e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.453125, + "epoch": 0.029546612328069283, + "grad_norm": 36.77436472817121, + "kl": 0.939453125, + "learning_rate": 9.852266938359653e-07, + "loss": 0.0376, + "reward": 1.334273636341095, + "reward_std": 0.34448733925819397, + "rewards/accuracy_reward": 0.34999997913837433, + "rewards/cosine_rewards": -0.00010142281280423049, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.4375, + "epoch": 0.030056036678553236, + "grad_norm": 46.54068675299219, + "kl": 0.89453125, + "learning_rate": 9.849719816607234e-07, + "loss": 0.0358, + "reward": 0.9967500269412994, + "reward_std": 0.4488208740949631, + "rewards/accuracy_reward": 0.012499993667006493, + "rewards/cosine_rewards": -0.00012501747096393956, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.030565461029037188, + "grad_norm": 63.09725081890949, + "kl": 0.837890625, + "learning_rate": 9.847172694854813e-07, + "loss": 0.0335, + "reward": 0.9874708652496338, + "reward_std": 0.33707569539546967, + "rewards/accuracy_reward": -0.012500002980232239, + "rewards/cosine_rewards": -2.9118752991053043e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.0, + "epoch": 0.03107488537952114, + "grad_norm": 105.45023488529014, + "kl": 0.8359375, + "learning_rate": 9.844625573102394e-07, + "loss": 0.0334, + "reward": 1.1281058490276337, + "reward_std": 0.3039932996034622, + "rewards/accuracy_reward": 0.12812498584389687, + "rewards/cosine_rewards": -1.919190435728524e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.640625, + "epoch": 0.031584309730005096, + "grad_norm": 85.12929156788996, + "kl": 0.84375, + "learning_rate": 9.842078451349974e-07, + "loss": 0.0337, + "reward": 0.7748350501060486, + "reward_std": 0.5448895841836929, + "rewards/accuracy_reward": -0.2093750238418579, + "rewards/cosine_rewards": -0.00016493651855853386, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.078125, + "epoch": 0.032093734080489045, + "grad_norm": 8.426167428667783, + "kl": 0.814453125, + "learning_rate": 9.839531329597555e-07, + "loss": 0.0326, + "reward": 0.874523401260376, + "reward_std": 0.0010828198865056038, + "rewards/accuracy_reward": -0.1250000149011612, + "rewards/cosine_rewards": -0.0004766158472193638, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.34375, + "epoch": 0.032603158430973, + "grad_norm": 86.47647424288853, + "kl": 0.810546875, + "learning_rate": 9.836984207845134e-07, + "loss": 0.0324, + "reward": 1.6625866889953613, + "reward_std": 0.19662056118249893, + "rewards/accuracy_reward": 0.6625000238418579, + "rewards/cosine_rewards": 8.658922160975635e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.6875, + "epoch": 0.033112582781456956, + "grad_norm": 59.22678939682069, + "kl": 0.86328125, + "learning_rate": 9.834437086092716e-07, + "loss": 0.0345, + "reward": 0.915763258934021, + "reward_std": 0.082692209049128, + "rewards/accuracy_reward": -0.06875000894069672, + "rewards/cosine_rewards": 0.00013823993504047394, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.671875, + "epoch": 0.033622007131940905, + "grad_norm": 170.96252285475958, + "kl": 0.7734375, + "learning_rate": 9.831889964340295e-07, + "loss": 0.0309, + "reward": 1.2965829372406006, + "reward_std": 0.32569222897291183, + "rewards/accuracy_reward": 0.2968750074505806, + "rewards/cosine_rewards": -0.0002921203849837184, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.890625, + "epoch": 0.03413143148242486, + "grad_norm": 302.02752287161115, + "kl": 0.84765625, + "learning_rate": 9.829342842587876e-07, + "loss": 0.0339, + "reward": 1.2968038320541382, + "reward_std": 0.27610647678375244, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -7.123823161236942e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.984375, + "epoch": 0.034640855832908816, + "grad_norm": 617.6786375620823, + "kl": 0.77734375, + "learning_rate": 9.826795720835456e-07, + "loss": 0.0311, + "reward": 1.4656760096549988, + "reward_std": 0.2886117473244667, + "rewards/accuracy_reward": 0.46562498807907104, + "rewards/cosine_rewards": 5.1008202717639506e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.34375, + "epoch": 0.035150280183392765, + "grad_norm": 29.79960642054238, + "kl": 0.728515625, + "learning_rate": 9.824248599083037e-07, + "loss": 0.0292, + "reward": 1.309334635734558, + "reward_std": 0.20424916595220566, + "rewards/accuracy_reward": 0.32500000298023224, + "rewards/cosine_rewards": -4.041045031044632e-05, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.90625, + "epoch": 0.03565970453387672, + "grad_norm": 91.00871807242451, + "kl": 0.744140625, + "learning_rate": 9.821701477330616e-07, + "loss": 0.0298, + "reward": 1.2686043679714203, + "reward_std": 0.10558865318307653, + "rewards/accuracy_reward": 0.26874998211860657, + "rewards/cosine_rewards": -0.00014566810568794608, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.328125, + "epoch": 0.03616912888436067, + "grad_norm": 159.05100875041754, + "kl": 0.765625, + "learning_rate": 9.819154355578195e-07, + "loss": 0.0306, + "reward": 1.2812767028808594, + "reward_std": 0.6231541335582733, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": 2.6669338694773614e-05, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.4375, + "epoch": 0.036678553234844626, + "grad_norm": 97.83490373613579, + "kl": 0.666015625, + "learning_rate": 9.816607233825777e-07, + "loss": 0.0266, + "reward": 1.647216558456421, + "reward_std": 0.32463081181049347, + "rewards/accuracy_reward": 0.6625000089406967, + "rewards/cosine_rewards": 0.00034145097015425563, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.046875, + "epoch": 0.03718797758532858, + "grad_norm": 88.5872574021467, + "kl": 0.630859375, + "learning_rate": 9.814060112073356e-07, + "loss": 0.0253, + "reward": 1.7599374055862427, + "reward_std": 0.3454015702009201, + "rewards/accuracy_reward": 0.7750000059604645, + "rewards/cosine_rewards": 0.0005623315373668447, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.140625, + "epoch": 0.03769740193581253, + "grad_norm": 16.49274288281998, + "kl": 0.501953125, + "learning_rate": 9.811512990320937e-07, + "loss": 0.0201, + "reward": 1.7728378772735596, + "reward_std": 0.2738931328058243, + "rewards/accuracy_reward": 0.8031250238418579, + "rewards/cosine_rewards": 0.0009629083215259016, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.9375, + "epoch": 0.038206826286296486, + "grad_norm": 18.283090697254373, + "kl": 0.18359375, + "learning_rate": 9.808965868568517e-07, + "loss": 0.0074, + "reward": 1.437682330608368, + "reward_std": 0.19817885756492615, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/cosine_rewards": 0.00018233060836791992, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.34375, + "epoch": 0.03871625063678044, + "grad_norm": 21.656183371701722, + "kl": 0.13330078125, + "learning_rate": 9.806418746816098e-07, + "loss": 0.0054, + "reward": 1.2524056434631348, + "reward_std": 0.14976192265748978, + "rewards/accuracy_reward": 0.26874999701976776, + "rewards/cosine_rewards": -0.0007193188357632607, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.03922567498726439, + "grad_norm": 12.450187143986858, + "kl": 0.1328125, + "learning_rate": 9.803871625063677e-07, + "loss": 0.0053, + "reward": 1.535181999206543, + "reward_std": 0.04510992762516253, + "rewards/accuracy_reward": 0.5499999970197678, + "rewards/cosine_rewards": 0.0008070359472185373, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.96875, + "epoch": 0.039735099337748346, + "grad_norm": 96.22924405795379, + "kl": 0.12744140625, + "learning_rate": 9.801324503311258e-07, + "loss": 0.0051, + "reward": 1.4221445322036743, + "reward_std": 0.5387175530195236, + "rewards/accuracy_reward": 0.4374999850988388, + "rewards/cosine_rewards": 0.0002695363436941989, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.46875, + "epoch": 0.040244523688232295, + "grad_norm": 22.356744283314942, + "kl": 0.12451171875, + "learning_rate": 9.798777381558838e-07, + "loss": 0.005, + "reward": 0.9280500411987305, + "reward_std": 0.3032594621181488, + "rewards/accuracy_reward": -0.06875001452863216, + "rewards/cosine_rewards": -0.003199932281859219, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.09375, + "epoch": 0.04075394803871625, + "grad_norm": 11.587196398677966, + "kl": 0.12353515625, + "learning_rate": 9.79623025980642e-07, + "loss": 0.0049, + "reward": 1.0698014497756958, + "reward_std": 0.306557297706604, + "rewards/accuracy_reward": 0.07187498360872269, + "rewards/cosine_rewards": -0.0020735373545903713, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.359375, + "epoch": 0.041263372389200206, + "grad_norm": 15.552459183086345, + "kl": 0.115234375, + "learning_rate": 9.793683138053998e-07, + "loss": 0.0046, + "reward": 1.902881920337677, + "reward_std": 0.2866080105304718, + "rewards/accuracy_reward": 0.9156250059604645, + "rewards/cosine_rewards": 0.0028820185689255595, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.28125, + "epoch": 0.041772796739684155, + "grad_norm": 21.56424441435487, + "kl": 0.110107421875, + "learning_rate": 9.79113601630158e-07, + "loss": 0.0044, + "reward": 1.2677271366119385, + "reward_std": 0.10610348492627963, + "rewards/accuracy_reward": 0.26874999701976776, + "rewards/cosine_rewards": -0.0010228125611320138, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.484375, + "epoch": 0.04228222109016811, + "grad_norm": 11.018514876954287, + "kl": 0.125244140625, + "learning_rate": 9.788588894549159e-07, + "loss": 0.005, + "reward": 1.2675296068191528, + "reward_std": 0.16161296842619777, + "rewards/accuracy_reward": 0.26874999701976776, + "rewards/cosine_rewards": -0.0012203185469843447, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.078125, + "epoch": 0.04279164544065207, + "grad_norm": 20.603407343348504, + "kl": 0.1083984375, + "learning_rate": 9.78604177279674e-07, + "loss": 0.0043, + "reward": 1.1548139452934265, + "reward_std": 0.537171483039856, + "rewards/accuracy_reward": 0.1562499925494194, + "rewards/cosine_rewards": -0.0014360386412590742, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.640625, + "epoch": 0.043301069791136015, + "grad_norm": 60.83530697951784, + "kl": 0.18359375, + "learning_rate": 9.78349465104432e-07, + "loss": 0.0073, + "reward": 1.5197246074676514, + "reward_std": 0.5150813460350037, + "rewards/accuracy_reward": 0.518750011920929, + "rewards/cosine_rewards": 0.000974582158960402, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.015625, + "epoch": 0.04381049414161997, + "grad_norm": 13.721540678774238, + "kl": 0.12451171875, + "learning_rate": 9.780947529291899e-07, + "loss": 0.005, + "reward": 1.1832407712936401, + "reward_std": 0.18641822785139084, + "rewards/accuracy_reward": 0.18437500298023224, + "rewards/cosine_rewards": -0.001134182559326291, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.578125, + "epoch": 0.04431991849210392, + "grad_norm": 215.40977191184217, + "kl": 0.115478515625, + "learning_rate": 9.77840040753948e-07, + "loss": 0.0046, + "reward": 1.2237018644809723, + "reward_std": 0.23010382801294327, + "rewards/accuracy_reward": 0.24062499776482582, + "rewards/cosine_rewards": -0.0012981001054868102, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 0.044829342842587876, + "grad_norm": 11.570945117677702, + "kl": 0.110595703125, + "learning_rate": 9.77585328578706e-07, + "loss": 0.0044, + "reward": 1.5510605573654175, + "reward_std": 0.0015471973456442356, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.001060541602782905, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.953125, + "epoch": 0.04533876719307183, + "grad_norm": 9.451343704964001, + "kl": 0.104248046875, + "learning_rate": 9.77330616403464e-07, + "loss": 0.0042, + "reward": 1.5073344111442566, + "reward_std": 0.35981758683919907, + "rewards/accuracy_reward": 0.5218750052154064, + "rewards/cosine_rewards": 0.0010844313073903322, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.71875, + "epoch": 0.04584819154355578, + "grad_norm": 16.032910799390205, + "kl": 0.094970703125, + "learning_rate": 9.77075904228222e-07, + "loss": 0.0038, + "reward": 1.919905662536621, + "reward_std": 0.24129686888772994, + "rewards/accuracy_reward": 0.9156250059604645, + "rewards/cosine_rewards": 0.0042806623969227076, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.390625, + "epoch": 0.046357615894039736, + "grad_norm": 21.086614900693085, + "kl": 0.101806640625, + "learning_rate": 9.768211920529801e-07, + "loss": 0.0041, + "reward": 1.5919697284698486, + "reward_std": 0.2428576573729515, + "rewards/accuracy_reward": 0.6062500029802322, + "rewards/cosine_rewards": 0.0013447333476506174, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.734375, + "epoch": 0.04686704024452369, + "grad_norm": 8.093804874468816, + "kl": 0.095458984375, + "learning_rate": 9.76566479877738e-07, + "loss": 0.0038, + "reward": 1.6935226917266846, + "reward_std": 0.1869470328092575, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/cosine_rewards": 0.0028977063193451613, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.203125, + "epoch": 0.04737646459500764, + "grad_norm": 10.624707519768712, + "kl": 0.099609375, + "learning_rate": 9.763117677024962e-07, + "loss": 0.004, + "reward": 1.4030739068984985, + "reward_std": 0.3680836334824562, + "rewards/accuracy_reward": 0.43437500298023224, + "rewards/cosine_rewards": -5.1158247515559196e-05, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.90625, + "epoch": 0.047885888945491596, + "grad_norm": 25.156466743552734, + "kl": 0.101318359375, + "learning_rate": 9.760570555272541e-07, + "loss": 0.0041, + "reward": 1.5800000429153442, + "reward_std": 0.5084549486637115, + "rewards/accuracy_reward": 0.578125, + "rewards/cosine_rewards": 0.0018750545859802514, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.46875, + "epoch": 0.048395313295975545, + "grad_norm": 12.684719084813777, + "kl": 0.10205078125, + "learning_rate": 9.758023433520122e-07, + "loss": 0.0041, + "reward": 1.5234779119491577, + "reward_std": 0.18682076036930084, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.0016028713434934616, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.109375, + "epoch": 0.0489047376464595, + "grad_norm": 13.854458043447748, + "kl": 0.108154296875, + "learning_rate": 9.755476311767702e-07, + "loss": 0.0043, + "reward": 1.6655999422073364, + "reward_std": 0.4286635220050812, + "rewards/accuracy_reward": 0.6624999791383743, + "rewards/cosine_rewards": 0.0030998505535535514, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.609375, + "epoch": 0.049414161996943456, + "grad_norm": 15.632405914543359, + "kl": 0.098388671875, + "learning_rate": 9.752929190015283e-07, + "loss": 0.0039, + "reward": 1.1535860896110535, + "reward_std": 0.36188751459121704, + "rewards/accuracy_reward": 0.1562499888241291, + "rewards/cosine_rewards": -0.002663849270902574, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.453125, + "epoch": 0.049923586347427405, + "grad_norm": 8.747829969093605, + "kl": 0.112060546875, + "learning_rate": 9.750382068262862e-07, + "loss": 0.0045, + "reward": 1.3531205654144287, + "reward_std": 0.18947682529687881, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": -4.528439603745937e-06, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.71875, + "epoch": 0.05043301069791136, + "grad_norm": 12.880266278774922, + "kl": 0.112060546875, + "learning_rate": 9.747834946510442e-07, + "loss": 0.0045, + "reward": 1.619386613368988, + "reward_std": 0.5798123776912689, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.0037616335321217775, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": 0.0, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.859375, + "epoch": 0.05094243504839532, + "grad_norm": 18.637994264229288, + "kl": 0.109619140625, + "learning_rate": 9.745287824758023e-07, + "loss": 0.0044, + "reward": 1.448248565196991, + "reward_std": 0.4085986465215683, + "rewards/accuracy_reward": 0.4624999687075615, + "rewards/cosine_rewards": 0.0013735336251556873, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.375, + "epoch": 0.051451859398879266, + "grad_norm": 26.647397602965935, + "kl": 0.110107421875, + "learning_rate": 9.742740703005602e-07, + "loss": 0.0044, + "reward": 1.0668614506721497, + "reward_std": 0.35026729106903076, + "rewards/accuracy_reward": 0.07187499292194843, + "rewards/cosine_rewards": -0.004894306650385261, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00011927480954909697, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 0.05196128374936322, + "grad_norm": 12.612763516820905, + "kl": 0.112060546875, + "learning_rate": 9.740193581253183e-07, + "loss": 0.0045, + "reward": 1.4354371428489685, + "reward_std": 0.20804932340979576, + "rewards/accuracy_reward": 0.46562499552965164, + "rewards/cosine_rewards": 0.0010621265973895788, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.984375, + "epoch": 0.05247070809984717, + "grad_norm": 19.00540852037756, + "kl": 0.116455078125, + "learning_rate": 9.737646459500763e-07, + "loss": 0.0047, + "reward": 1.0997494161128998, + "reward_std": 0.5674505531787872, + "rewards/accuracy_reward": 0.1499999761581421, + "rewards/cosine_rewards": -0.003250634763389826, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0001250000059371814, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.3125, + "epoch": 0.052980132450331126, + "grad_norm": 8.993773014446798, + "kl": 0.115478515625, + "learning_rate": 9.735099337748344e-07, + "loss": 0.0046, + "reward": 1.547185480594635, + "reward_std": 0.5772347450256348, + "rewards/accuracy_reward": 0.5750000029802322, + "rewards/cosine_rewards": 0.0034354651579633355, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.09375, + "epoch": 0.05348955680081508, + "grad_norm": 21.83379704072219, + "kl": 0.11279296875, + "learning_rate": 9.732552215995923e-07, + "loss": 0.0045, + "reward": 0.9665651321411133, + "reward_std": 0.19240357726812363, + "rewards/accuracy_reward": -0.012500010430812836, + "rewards/cosine_rewards": -0.005309856729581952, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.296875, + "epoch": 0.05399898115129903, + "grad_norm": 15.74936299058057, + "kl": 0.1240234375, + "learning_rate": 9.730005094243505e-07, + "loss": 0.005, + "reward": 0.8526512682437897, + "reward_std": 0.45641621947288513, + "rewards/accuracy_reward": -0.1250000149011612, + "rewards/cosine_rewards": -0.006723731989040971, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.203125, + "epoch": 0.054508405501782986, + "grad_norm": 7.621263779056561, + "kl": 0.116455078125, + "learning_rate": 9.727457972491084e-07, + "loss": 0.0047, + "reward": 1.4213617444038391, + "reward_std": 0.42073580622673035, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": -0.0005132523947395384, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.234375, + "epoch": 0.05501782985226694, + "grad_norm": 14.098307573524853, + "kl": 0.119140625, + "learning_rate": 9.724910850738665e-07, + "loss": 0.0048, + "reward": 1.1232723593711853, + "reward_std": 0.45567604154348373, + "rewards/accuracy_reward": 0.12812499329447746, + "rewards/cosine_rewards": -0.004852580255828798, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.328125, + "epoch": 0.05552725420275089, + "grad_norm": 10.937546247684887, + "kl": 0.18994140625, + "learning_rate": 9.722363728986245e-07, + "loss": 0.0076, + "reward": 1.8381596803665161, + "reward_std": 0.28626738488674164, + "rewards/accuracy_reward": 0.831250011920929, + "rewards/cosine_rewards": 0.006909639807417989, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.375, + "epoch": 0.056036678553234846, + "grad_norm": 16.445027328915916, + "kl": 0.11181640625, + "learning_rate": 9.719816607233826e-07, + "loss": 0.0045, + "reward": 1.2096136808395386, + "reward_std": 0.36066293716430664, + "rewards/accuracy_reward": 0.21249999105930328, + "rewards/cosine_rewards": -0.002886334084905684, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.765625, + "epoch": 0.056546102903718795, + "grad_norm": 18.874204475299454, + "kl": 0.106689453125, + "learning_rate": 9.717269485481405e-07, + "loss": 0.0043, + "reward": 1.3519207835197449, + "reward_std": 0.08376272046007216, + "rewards/accuracy_reward": 0.3531250059604645, + "rewards/cosine_rewards": -0.0012042350135743618, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.796875, + "epoch": 0.05705552725420275, + "grad_norm": 9.104959106458736, + "kl": 0.121337890625, + "learning_rate": 9.714722363728986e-07, + "loss": 0.0049, + "reward": 1.381228744983673, + "reward_std": 0.16323383897542953, + "rewards/accuracy_reward": 0.3812500238418579, + "rewards/cosine_rewards": -2.1282234229147434e-05, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.171875, + "epoch": 0.05756495160468671, + "grad_norm": 13.28611466805594, + "kl": 0.10888671875, + "learning_rate": 9.712175241976566e-07, + "loss": 0.0044, + "reward": 1.3366525173187256, + "reward_std": 0.28694501193240285, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": -0.0008475282229483128, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.4375, + "epoch": 0.058074375955170655, + "grad_norm": 20.0350592953945, + "kl": 0.107666015625, + "learning_rate": 9.709628120224145e-07, + "loss": 0.0043, + "reward": 1.4107850790023804, + "reward_std": 0.18804995715618134, + "rewards/accuracy_reward": 0.40937501937150955, + "rewards/cosine_rewards": 0.001410042867064476, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6875, + "epoch": 0.05858380030565461, + "grad_norm": 9.526099983425397, + "kl": 0.10595703125, + "learning_rate": 9.707080998471726e-07, + "loss": 0.0042, + "reward": 1.4221826791763306, + "reward_std": 0.2918977811932564, + "rewards/accuracy_reward": 0.4374999925494194, + "rewards/cosine_rewards": 0.00030758429784327745, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.984375, + "epoch": 0.05909322465613857, + "grad_norm": 14.046929728525855, + "kl": 0.11572265625, + "learning_rate": 9.704533876719306e-07, + "loss": 0.0046, + "reward": 1.2389479279518127, + "reward_std": 0.4534989148378372, + "rewards/accuracy_reward": 0.24062498658895493, + "rewards/cosine_rewards": -0.00167706364300102, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.390625, + "epoch": 0.059602649006622516, + "grad_norm": 12.733865198375515, + "kl": 0.105224609375, + "learning_rate": 9.701986754966887e-07, + "loss": 0.0042, + "reward": 1.0669120252132416, + "reward_std": 0.319850392639637, + "rewards/accuracy_reward": 0.07187498360872269, + "rewards/cosine_rewards": -0.004963014740496874, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.140625, + "epoch": 0.06011207335710647, + "grad_norm": 11.670513012812588, + "kl": 0.093505859375, + "learning_rate": 9.699439633214466e-07, + "loss": 0.0038, + "reward": 1.665140986442566, + "reward_std": 0.12403370253741741, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.0026409668498672545, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.828125, + "epoch": 0.06062149770759042, + "grad_norm": 28.993909689663674, + "kl": 0.1015625, + "learning_rate": 9.696892511462047e-07, + "loss": 0.0041, + "reward": 1.0673952102661133, + "reward_std": 0.30907338857650757, + "rewards/accuracy_reward": 0.07187499105930328, + "rewards/cosine_rewards": -0.004479756113141775, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.40625, + "epoch": 0.061130922058074376, + "grad_norm": 27.685530130702478, + "kl": 0.104736328125, + "learning_rate": 9.694345389709627e-07, + "loss": 0.0042, + "reward": 1.4373126029968262, + "reward_std": 0.21315501490607858, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": -0.0001873411238193512, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0625, + "epoch": 0.06164034640855833, + "grad_norm": 12.6569610895899, + "kl": 0.121826171875, + "learning_rate": 9.691798267957208e-07, + "loss": 0.0049, + "reward": 1.3240773677825928, + "reward_std": 0.26775629818439484, + "rewards/accuracy_reward": 0.32499999552965164, + "rewards/cosine_rewards": -0.0009226472466252744, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.78125, + "epoch": 0.06214977075904228, + "grad_norm": 11.557058687556623, + "kl": 0.103759765625, + "learning_rate": 9.689251146204787e-07, + "loss": 0.0041, + "reward": 1.7503631114959717, + "reward_std": 0.08287379238754511, + "rewards/accuracy_reward": 0.7468750178813934, + "rewards/cosine_rewards": 0.0034881452447734773, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.578125, + "epoch": 0.06265919510952624, + "grad_norm": 19.676894670897823, + "kl": 0.1025390625, + "learning_rate": 9.686704024452369e-07, + "loss": 0.0041, + "reward": 1.3520426154136658, + "reward_std": 0.24371477961540222, + "rewards/accuracy_reward": 0.3531249761581421, + "rewards/cosine_rewards": -0.0010823981137946248, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.484375, + "epoch": 0.06316861946001019, + "grad_norm": 7.7934253879172894, + "kl": 0.10107421875, + "learning_rate": 9.684156902699948e-07, + "loss": 0.004, + "reward": 1.4387494623661041, + "reward_std": 0.26880691200494766, + "rewards/accuracy_reward": 0.4374999888241291, + "rewards/cosine_rewards": 0.0015281732194125652, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.000278731546131894, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.671875, + "epoch": 0.06367804381049415, + "grad_norm": 14.376740722468174, + "kl": 0.1064453125, + "learning_rate": 9.68160978094753e-07, + "loss": 0.0043, + "reward": 1.2107464671134949, + "reward_std": 0.20096861571073532, + "rewards/accuracy_reward": 0.21249999292194843, + "rewards/cosine_rewards": -0.0017535560764372349, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.25, + "epoch": 0.06418746816097809, + "grad_norm": 24.20355073697258, + "kl": 0.10693359375, + "learning_rate": 9.679062659195109e-07, + "loss": 0.0043, + "reward": 1.09614896774292, + "reward_std": 0.4781967103481293, + "rewards/accuracy_reward": 0.09999998658895493, + "rewards/cosine_rewards": -0.0038510175654664636, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.203125, + "epoch": 0.06469689251146205, + "grad_norm": 8.56510891629326, + "kl": 0.111083984375, + "learning_rate": 9.676515537442688e-07, + "loss": 0.0044, + "reward": 1.5521512031555176, + "reward_std": 0.46633191406726837, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.002151212247554213, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.421875, + "epoch": 0.065206316861946, + "grad_norm": 13.379306876395301, + "kl": 0.120849609375, + "learning_rate": 9.67396841569027e-07, + "loss": 0.0048, + "reward": 1.722820222377777, + "reward_std": 0.32213538885116577, + "rewards/accuracy_reward": 0.71875, + "rewards/cosine_rewards": 0.004070190014317632, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.703125, + "epoch": 0.06571574121242996, + "grad_norm": 153.40445361752842, + "kl": 0.1162109375, + "learning_rate": 9.67142129393785e-07, + "loss": 0.0047, + "reward": 1.2643532752990723, + "reward_std": 0.23417328391224146, + "rewards/accuracy_reward": 0.2656250037252903, + "rewards/cosine_rewards": -0.0012717264471575618, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5625, + "epoch": 0.06622516556291391, + "grad_norm": 9.275290295194138, + "kl": 0.10302734375, + "learning_rate": 9.66887417218543e-07, + "loss": 0.0041, + "reward": 1.0960015654563904, + "reward_std": 0.21426187455654144, + "rewards/accuracy_reward": 0.09999999403953552, + "rewards/cosine_rewards": -0.003998432832304388, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.265625, + "epoch": 0.06673458991339785, + "grad_norm": 19.81840956306445, + "kl": 0.105224609375, + "learning_rate": 9.66632705043301e-07, + "loss": 0.0042, + "reward": 1.6365814805030823, + "reward_std": 0.20723329484462738, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": 0.0022064344957470894, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.234375, + "epoch": 0.06724401426388181, + "grad_norm": 8.864924483945437, + "kl": 0.108642578125, + "learning_rate": 9.66377992868059e-07, + "loss": 0.0044, + "reward": 1.3077268600463867, + "reward_std": 0.3278057724237442, + "rewards/accuracy_reward": 0.32500000670552254, + "rewards/cosine_rewards": -0.0012397709069773555, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00040839536814019084, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6875, + "epoch": 0.06775343861436577, + "grad_norm": 14.31694516925494, + "kl": 0.112060546875, + "learning_rate": 9.661232806928172e-07, + "loss": 0.0045, + "reward": 1.3927981853485107, + "reward_std": 0.3101032227277756, + "rewards/accuracy_reward": 0.40937499701976776, + "rewards/cosine_rewards": -0.0007292817026609555, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00022258506942307577, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.515625, + "epoch": 0.06826286296484972, + "grad_norm": 13.206286933876525, + "kl": 0.110107421875, + "learning_rate": 9.65868568517575e-07, + "loss": 0.0044, + "reward": 1.4952284097671509, + "reward_std": 0.16513758851215243, + "rewards/accuracy_reward": 0.4937499910593033, + "rewards/cosine_rewards": 0.0014784452505409718, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.15625, + "epoch": 0.06877228731533368, + "grad_norm": 8.196723017225267, + "kl": 0.110107421875, + "learning_rate": 9.656138563423332e-07, + "loss": 0.0044, + "reward": 1.3523318767547607, + "reward_std": 0.19119784235954285, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": -0.0007931197178550065, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.375, + "epoch": 0.06928171166581763, + "grad_norm": 7.96588213615428, + "kl": 0.10400390625, + "learning_rate": 9.653591441670911e-07, + "loss": 0.0042, + "reward": 1.3809208273887634, + "reward_std": 0.16492938250303268, + "rewards/accuracy_reward": 0.3812500238418579, + "rewards/cosine_rewards": -0.00032906350679695606, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.78125, + "epoch": 0.06979113601630157, + "grad_norm": 15.45343619628501, + "kl": 0.1142578125, + "learning_rate": 9.651044319918493e-07, + "loss": 0.0046, + "reward": 1.5364066362380981, + "reward_std": 0.3309681713581085, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.002031611278653145, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.484375, + "epoch": 0.07030056036678553, + "grad_norm": 10.658123621710919, + "kl": 0.1142578125, + "learning_rate": 9.648497198166072e-07, + "loss": 0.0046, + "reward": 1.5228744149208069, + "reward_std": 0.08533496968448162, + "rewards/accuracy_reward": 0.5218750089406967, + "rewards/cosine_rewards": 0.0011002181563526392, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00010080645006382838, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.734375, + "epoch": 0.07080998471726949, + "grad_norm": 13.847520067525743, + "kl": 0.118408203125, + "learning_rate": 9.645950076413653e-07, + "loss": 0.0047, + "reward": 0.6919489502906799, + "reward_std": 0.29361478984355927, + "rewards/accuracy_reward": -0.29375001788139343, + "rewards/cosine_rewards": -0.014301038347184658, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.6875, + "epoch": 0.07131940906775344, + "grad_norm": 32.08146525993247, + "kl": 0.115234375, + "learning_rate": 9.643402954661233e-07, + "loss": 0.0046, + "reward": 1.3814507126808167, + "reward_std": 0.10938079445622861, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": 0.00031310925260186195, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00011241007450735196, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.890625, + "epoch": 0.0718288334182374, + "grad_norm": 13.580365765026054, + "kl": 0.12158203125, + "learning_rate": 9.640855832908814e-07, + "loss": 0.0049, + "reward": 1.2904618978500366, + "reward_std": 0.09482555650174618, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -0.006186658749356866, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00022644927958026528, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.5, + "epoch": 0.07233825776872134, + "grad_norm": 24.285612348709225, + "kl": 0.113525390625, + "learning_rate": 9.638308711156393e-07, + "loss": 0.0045, + "reward": 1.4387189745903015, + "reward_std": 0.30863603949546814, + "rewards/accuracy_reward": 0.4374999888241291, + "rewards/cosine_rewards": 0.0012189627159386873, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5, + "epoch": 0.0728476821192053, + "grad_norm": 22.094925488371874, + "kl": 0.11669921875, + "learning_rate": 9.635761589403972e-07, + "loss": 0.0047, + "reward": 1.495344638824463, + "reward_std": 0.46879828721284866, + "rewards/accuracy_reward": 0.4937499761581421, + "rewards/cosine_rewards": 0.0015945886261761189, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.125, + "epoch": 0.07335710646968925, + "grad_norm": 7.954348731599845, + "kl": 0.1259765625, + "learning_rate": 9.633214467651554e-07, + "loss": 0.005, + "reward": 1.5794875025749207, + "reward_std": 0.2703954949975014, + "rewards/accuracy_reward": 0.6062500178813934, + "rewards/cosine_rewards": 0.0044874417362734675, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.234375, + "epoch": 0.0738665308201732, + "grad_norm": 10.240352920236468, + "kl": 0.1240234375, + "learning_rate": 9.630667345899133e-07, + "loss": 0.005, + "reward": 1.323024868965149, + "reward_std": 0.3642221838235855, + "rewards/accuracy_reward": 0.32499999552965164, + "rewards/cosine_rewards": -0.0019751336076296866, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.96875, + "epoch": 0.07437595517065716, + "grad_norm": 7.757169620409243, + "kl": 0.1318359375, + "learning_rate": 9.628120224146714e-07, + "loss": 0.0053, + "reward": 1.4797114729881287, + "reward_std": 0.40076301991939545, + "rewards/accuracy_reward": 0.4937500078231096, + "rewards/cosine_rewards": 0.001799287972971797, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00021284000831656158, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.015625, + "epoch": 0.0748853795211411, + "grad_norm": 9.791138031684504, + "kl": 0.1123046875, + "learning_rate": 9.625573102394294e-07, + "loss": 0.0045, + "reward": 1.5510019659996033, + "reward_std": 0.27960680425167084, + "rewards/accuracy_reward": 0.5781249850988388, + "rewards/cosine_rewards": 0.0043075907160528, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0001806358341127634, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.6875, + "epoch": 0.07539480387162506, + "grad_norm": 7.852116556950569, + "kl": 0.12060546875, + "learning_rate": 9.623025980641875e-07, + "loss": 0.0048, + "reward": 1.2943141460418701, + "reward_std": 0.43064263463020325, + "rewards/accuracy_reward": 0.2968749701976776, + "rewards/cosine_rewards": -0.0023121244739741087, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0002487746678525582, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.8125, + "epoch": 0.07590422822210902, + "grad_norm": 19.19937094751421, + "kl": 0.1240234375, + "learning_rate": 9.620478858889454e-07, + "loss": 0.005, + "reward": 1.8400413990020752, + "reward_std": 0.39075249433517456, + "rewards/accuracy_reward": 0.8593749701976776, + "rewards/cosine_rewards": 0.011916308663785458, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.890625, + "epoch": 0.07641365257259297, + "grad_norm": 17.532821778348946, + "kl": 0.1376953125, + "learning_rate": 9.617931737137036e-07, + "loss": 0.0055, + "reward": 1.5498095750808716, + "reward_std": 0.29365313798189163, + "rewards/accuracy_reward": 0.5781250298023224, + "rewards/cosine_rewards": 0.0030243303044699132, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -8.979885024018586e-05, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.3125, + "epoch": 0.07692307692307693, + "grad_norm": 8.653969191960044, + "kl": 0.120361328125, + "learning_rate": 9.615384615384615e-07, + "loss": 0.0048, + "reward": 1.2203205227851868, + "reward_std": 0.5703159868717194, + "rewards/accuracy_reward": 0.24062499403953552, + "rewards/cosine_rewards": -0.004613903176505119, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -6.565126386703923e-05, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.484375, + "epoch": 0.07743250127356088, + "grad_norm": 56.921468540439704, + "kl": 0.119873046875, + "learning_rate": 9.612837493632196e-07, + "loss": 0.0048, + "reward": 1.2310086488723755, + "reward_std": 0.41925153136253357, + "rewards/accuracy_reward": 0.2656249925494194, + "rewards/cosine_rewards": -0.00321156473364681, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.00015470296784769744, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.0, + "epoch": 0.07794192562404482, + "grad_norm": 9.400079372239615, + "kl": 0.107666015625, + "learning_rate": 9.610290371879775e-07, + "loss": 0.0043, + "reward": 1.6124141216278076, + "reward_std": 0.487982913851738, + "rewards/accuracy_reward": 0.606249988079071, + "rewards/cosine_rewards": 0.006320342654362321, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00015624999650754035, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.90625, + "epoch": 0.07845134997452878, + "grad_norm": 16.320123460893743, + "kl": 0.125732421875, + "learning_rate": 9.607743250127357e-07, + "loss": 0.005, + "reward": 1.5100122094154358, + "reward_std": 0.4279818534851074, + "rewards/accuracy_reward": 0.5218749791383743, + "rewards/cosine_rewards": 0.0037621970986947417, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.125, + "epoch": 0.07896077432501274, + "grad_norm": 12.345482711993489, + "kl": 0.163330078125, + "learning_rate": 9.605196128374936e-07, + "loss": 0.0065, + "reward": 0.9318991005420685, + "reward_std": 0.23740804940462112, + "rewards/accuracy_reward": -0.04062497615814209, + "rewards/cosine_rewards": -0.011850890005007386, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.671875, + "epoch": 0.07947019867549669, + "grad_norm": 6.991107442023172, + "kl": 0.1240234375, + "learning_rate": 9.602649006622515e-07, + "loss": 0.005, + "reward": 0.9476701319217682, + "reward_std": 0.33865927904844284, + "rewards/accuracy_reward": -0.04062502086162567, + "rewards/cosine_rewards": -0.011704806645866483, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.46875, + "epoch": 0.07997962302598065, + "grad_norm": 11.299778697394407, + "kl": 0.117919921875, + "learning_rate": 9.600101884870097e-07, + "loss": 0.0047, + "reward": 1.3649136424064636, + "reward_std": 0.42557042837142944, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": -0.0007113651372492313, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.015625, + "epoch": 0.08048904737646459, + "grad_norm": 16.322542283478924, + "kl": 0.12255859375, + "learning_rate": 9.597554763117676e-07, + "loss": 0.0049, + "reward": 1.3941306471824646, + "reward_std": 0.4155275672674179, + "rewards/accuracy_reward": 0.40937498211860657, + "rewards/cosine_rewards": 0.000547687232028693, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00016711230273358524, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.3125, + "epoch": 0.08099847172694855, + "grad_norm": 6.393101239111367, + "kl": 0.11865234375, + "learning_rate": 9.595007641365257e-07, + "loss": 0.0047, + "reward": 1.2930153012275696, + "reward_std": 0.2976529533043504, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -0.0038597104139626026, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.125, + "epoch": 0.0815078960774325, + "grad_norm": 15.2053157125375, + "kl": 0.119140625, + "learning_rate": 9.592460519612836e-07, + "loss": 0.0048, + "reward": 1.2102863192558289, + "reward_std": 0.43368688225746155, + "rewards/accuracy_reward": 0.2124999836087227, + "rewards/cosine_rewards": -0.0022136420011520386, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.109375, + "epoch": 0.08201732042791646, + "grad_norm": 8.230345044429809, + "kl": 0.11376953125, + "learning_rate": 9.589913397860418e-07, + "loss": 0.0045, + "reward": 1.5247125625610352, + "reward_std": 0.313697911798954, + "rewards/accuracy_reward": 0.5218749791383743, + "rewards/cosine_rewards": 0.0028375727706588805, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.421875, + "epoch": 0.08252674477840041, + "grad_norm": 7.440050437747031, + "kl": 0.132568359375, + "learning_rate": 9.587366276107997e-07, + "loss": 0.0053, + "reward": 1.4958758354187012, + "reward_std": 0.2720055654644966, + "rewards/accuracy_reward": 0.4937499761581421, + "rewards/cosine_rewards": 0.002125886792782694, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.125, + "epoch": 0.08303616912888435, + "grad_norm": 44.81125981696038, + "kl": 0.119384765625, + "learning_rate": 9.584819154355578e-07, + "loss": 0.0048, + "reward": 1.5242316722869873, + "reward_std": 0.6014019548892975, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.0023566827294416726, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.046875, + "epoch": 0.08354559347936831, + "grad_norm": 13.052733655899614, + "kl": 0.119140625, + "learning_rate": 9.582272032603158e-07, + "loss": 0.0048, + "reward": 1.6689130067825317, + "reward_std": 0.2884200101252645, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.0064131125109270215, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.53125, + "epoch": 0.08405501782985227, + "grad_norm": 45.85990240379708, + "kl": 0.455078125, + "learning_rate": 9.57972491085074e-07, + "loss": 0.0181, + "reward": 1.72576242685318, + "reward_std": 0.48727013170719147, + "rewards/accuracy_reward": 0.7187499701976776, + "rewards/cosine_rewards": 0.007012464571744204, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.890625, + "epoch": 0.08456444218033622, + "grad_norm": 62.93710363981597, + "kl": 0.117431640625, + "learning_rate": 9.577177789098318e-07, + "loss": 0.0047, + "reward": 0.9781621694564819, + "reward_std": 0.20786645263433456, + "rewards/accuracy_reward": -0.012500008568167686, + "rewards/cosine_rewards": -0.009250549599528313, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -8.729050023248419e-05, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.109375, + "epoch": 0.08507386653082018, + "grad_norm": 11.046432411232663, + "kl": 0.13134765625, + "learning_rate": 9.5746306673459e-07, + "loss": 0.0052, + "reward": 1.3762089014053345, + "reward_std": 0.32985249161720276, + "rewards/accuracy_reward": 0.37812500819563866, + "rewards/cosine_rewards": -0.0019161199452355504, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.578125, + "epoch": 0.08558329088130413, + "grad_norm": 5.950394983602238, + "kl": 0.11279296875, + "learning_rate": 9.572083545593479e-07, + "loss": 0.0045, + "reward": 1.0190700888633728, + "reward_std": 0.6297826766967773, + "rewards/accuracy_reward": 0.04062497615814209, + "rewards/cosine_rewards": -0.005929919425398111, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.71875, + "epoch": 0.08609271523178808, + "grad_norm": 17.640294707452878, + "kl": 0.11572265625, + "learning_rate": 9.56953642384106e-07, + "loss": 0.0046, + "reward": 0.9798631221055984, + "reward_std": 0.20544240390881896, + "rewards/accuracy_reward": -0.012500017881393433, + "rewards/cosine_rewards": -0.007636879570782185, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.34375, + "epoch": 0.08660213958227203, + "grad_norm": 7.762505201079191, + "kl": 0.112548828125, + "learning_rate": 9.56698930208864e-07, + "loss": 0.0045, + "reward": 1.152494490146637, + "reward_std": 0.30975981056690216, + "rewards/accuracy_reward": 0.15625, + "rewards/cosine_rewards": -0.0037555836606770754, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.875, + "epoch": 0.08711156393275599, + "grad_norm": 11.506198147130426, + "kl": 0.111083984375, + "learning_rate": 9.564442180336219e-07, + "loss": 0.0045, + "reward": 0.9780029058456421, + "reward_std": 0.6867689490318298, + "rewards/accuracy_reward": -0.012500010430812836, + "rewards/cosine_rewards": -0.009497055783867836, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.125, + "epoch": 0.08762098828323994, + "grad_norm": 11.819517524957538, + "kl": 0.10546875, + "learning_rate": 9.5618950585838e-07, + "loss": 0.0042, + "reward": 1.3241556882858276, + "reward_std": 0.3773365914821625, + "rewards/accuracy_reward": 0.32499998807907104, + "rewards/cosine_rewards": -0.0008443233091384172, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.34375, + "epoch": 0.0881304126337239, + "grad_norm": 8.416955648144409, + "kl": 0.116943359375, + "learning_rate": 9.55934793683138e-07, + "loss": 0.0047, + "reward": 1.6661878824234009, + "reward_std": 0.20251824986189604, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.003687863936647773, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.921875, + "epoch": 0.08863983698420784, + "grad_norm": 7.131693684668596, + "kl": 0.12060546875, + "learning_rate": 9.55680081507896e-07, + "loss": 0.0048, + "reward": 1.0765551328659058, + "reward_std": 0.3972722738981247, + "rewards/accuracy_reward": 0.09999998845160007, + "rewards/cosine_rewards": -0.007819817401468754, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.75, + "epoch": 0.0891492613346918, + "grad_norm": 11.781793838489648, + "kl": 0.11083984375, + "learning_rate": 9.55425369332654e-07, + "loss": 0.0044, + "reward": 1.5515506863594055, + "reward_std": 0.3071342632174492, + "rewards/accuracy_reward": 0.5500000268220901, + "rewards/cosine_rewards": 0.0015506702475249767, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.609375, + "epoch": 0.08965868568517575, + "grad_norm": 40.566147710572594, + "kl": 0.109130859375, + "learning_rate": 9.551706571574121e-07, + "loss": 0.0044, + "reward": 1.5527549982070923, + "reward_std": 0.39126846194267273, + "rewards/accuracy_reward": 0.5499999895691872, + "rewards/cosine_rewards": 0.0027549704536795616, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.53125, + "epoch": 0.09016811003565971, + "grad_norm": 12.817541000164553, + "kl": 0.10595703125, + "learning_rate": 9.5491594498217e-07, + "loss": 0.0042, + "reward": 1.9808745980262756, + "reward_std": 0.08480274910107255, + "rewards/accuracy_reward": 0.971875011920929, + "rewards/cosine_rewards": 0.009283588267862797, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0002840909000951797, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.75, + "epoch": 0.09067753438614366, + "grad_norm": 7.914504626786531, + "kl": 0.103759765625, + "learning_rate": 9.546612328069282e-07, + "loss": 0.0041, + "reward": 1.5245178937911987, + "reward_std": 0.34963520616292953, + "rewards/accuracy_reward": 0.5218750089406967, + "rewards/cosine_rewards": 0.0026429439894855022, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.9375, + "epoch": 0.0911869587366276, + "grad_norm": 8.759963126963402, + "kl": 0.13037109375, + "learning_rate": 9.544065206316861e-07, + "loss": 0.0052, + "reward": 1.638785481452942, + "reward_std": 0.2284149518236518, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": 0.0044105148408561945, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.90625, + "epoch": 0.09169638308711156, + "grad_norm": 6.418887244829745, + "kl": 0.116943359375, + "learning_rate": 9.541518084564442e-07, + "loss": 0.0047, + "reward": 1.3810052275657654, + "reward_std": 0.4032685235142708, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": -0.0001616678200662136, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -8.311169949593022e-05, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.078125, + "epoch": 0.09220580743759552, + "grad_norm": 6.304857280986285, + "kl": 0.12890625, + "learning_rate": 9.538970962812022e-07, + "loss": 0.0051, + "reward": 1.2630045115947723, + "reward_std": 0.17365956178400666, + "rewards/accuracy_reward": 0.2656250037252903, + "rewards/cosine_rewards": -0.002620481769554317, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.265625, + "epoch": 0.09271523178807947, + "grad_norm": 11.35970580998742, + "kl": 0.11181640625, + "learning_rate": 9.536423841059602e-07, + "loss": 0.0045, + "reward": 1.6366259455680847, + "reward_std": 0.2085256204009056, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": 0.002349784132093191, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -9.889240755001083e-05, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.96875, + "epoch": 0.09322465613856343, + "grad_norm": 33.125698049494765, + "kl": 0.118408203125, + "learning_rate": 9.533876719307182e-07, + "loss": 0.0048, + "reward": 1.553468942642212, + "reward_std": 0.16592675540596247, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.003468883689492941, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": 0.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.171875, + "epoch": 0.09373408048904738, + "grad_norm": 15.866784423418947, + "kl": 0.115234375, + "learning_rate": 9.531329597554763e-07, + "loss": 0.0046, + "reward": 1.1207141280174255, + "reward_std": 0.19428733736276627, + "rewards/accuracy_reward": 0.12812499701976776, + "rewards/cosine_rewards": -0.007209272123873234, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.00020161290012765676, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.640625, + "epoch": 0.09424350483953133, + "grad_norm": 20.352317051449248, + "kl": 0.3115234375, + "learning_rate": 9.528782475802343e-07, + "loss": 0.0124, + "reward": 1.6525439023971558, + "reward_std": 0.38362888991832733, + "rewards/accuracy_reward": 0.6625000238418579, + "rewards/cosine_rewards": 0.005779681145213544, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00011081559932790697, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.234375, + "epoch": 0.09475292919001528, + "grad_norm": 9.691736527471202, + "kl": 0.124755859375, + "learning_rate": 9.526235354049923e-07, + "loss": 0.005, + "reward": 0.9626118838787079, + "reward_std": 0.40054861456155777, + "rewards/accuracy_reward": -0.015625011175870895, + "rewards/cosine_rewards": -0.006138101452961564, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.578125, + "epoch": 0.09526235354049924, + "grad_norm": 9.437447186660206, + "kl": 0.123046875, + "learning_rate": 9.523688232297503e-07, + "loss": 0.0049, + "reward": 1.569740116596222, + "reward_std": 0.1397167220711708, + "rewards/accuracy_reward": 0.5781250298023224, + "rewards/cosine_rewards": 0.007240177597850561, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": 0.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.625, + "epoch": 0.09577177789098319, + "grad_norm": 14.943967064784786, + "kl": 0.15087890625, + "learning_rate": 9.521141110545084e-07, + "loss": 0.006, + "reward": 1.0288785099983215, + "reward_std": 0.2980290725827217, + "rewards/accuracy_reward": 0.07187499105930328, + "rewards/cosine_rewards": -0.011638639261946082, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.00010775862028822303, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.921875, + "epoch": 0.09628120224146715, + "grad_norm": 15.000304282598657, + "kl": 0.12744140625, + "learning_rate": 9.518593988792664e-07, + "loss": 0.0051, + "reward": 1.3486477732658386, + "reward_std": 0.30507488548755646, + "rewards/accuracy_reward": 0.34999997913837433, + "rewards/cosine_rewards": -0.0010681524872779846, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0002840909000951797, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.96875, + "epoch": 0.09679062659195109, + "grad_norm": 5.941418391788204, + "kl": 0.13916015625, + "learning_rate": 9.516046867040244e-07, + "loss": 0.0056, + "reward": 1.6924657821655273, + "reward_std": 0.3630830645561218, + "rewards/accuracy_reward": 0.7187499850988388, + "rewards/cosine_rewards": 0.004965720232576132, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.421875, + "epoch": 0.09730005094243505, + "grad_norm": 7.708020896616392, + "kl": 0.14453125, + "learning_rate": 9.513499745287824e-07, + "loss": 0.0058, + "reward": 1.250920683145523, + "reward_std": 0.4801155626773834, + "rewards/accuracy_reward": 0.2968749776482582, + "rewards/cosine_rewards": 0.0009206933900713921, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": 0.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.3125, + "epoch": 0.097809475292919, + "grad_norm": 13.916574397802314, + "kl": 0.13671875, + "learning_rate": 9.510952623535404e-07, + "loss": 0.0055, + "reward": 1.129820704460144, + "reward_std": 0.717576265335083, + "rewards/accuracy_reward": 0.21249999850988388, + "rewards/cosine_rewards": -0.0045542995212599635, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.21875, + "epoch": 0.09831889964340296, + "grad_norm": 5.201799859391353, + "kl": 0.13427734375, + "learning_rate": 9.508405501782984e-07, + "loss": 0.0054, + "reward": 1.3024629950523376, + "reward_std": 0.4307016432285309, + "rewards/accuracy_reward": 0.37812498956918716, + "rewards/cosine_rewards": 0.0024630045518279076, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.203125, + "epoch": 0.09882832399388691, + "grad_norm": 9.365246983313067, + "kl": 0.12939453125, + "learning_rate": 9.505858380030564e-07, + "loss": 0.0052, + "reward": 0.7090668827295303, + "reward_std": 0.5288920998573303, + "rewards/accuracy_reward": -0.23750002309679985, + "rewards/cosine_rewards": -0.022183137945830822, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.8125, + "epoch": 0.09933774834437085, + "grad_norm": 12.601377217707842, + "kl": 0.14501953125, + "learning_rate": 9.503311258278145e-07, + "loss": 0.0058, + "reward": 1.413894236087799, + "reward_std": 0.7254346013069153, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.001518724486231804, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.00012443749437807128, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.1875, + "epoch": 0.09984717269485481, + "grad_norm": 6.45339117400833, + "kl": 0.1376953125, + "learning_rate": 9.500764136525725e-07, + "loss": 0.0055, + "reward": 1.5756230354309082, + "reward_std": 0.48759835958480835, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.007316130446270108, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0004430353583302349, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.796875, + "epoch": 0.10035659704533877, + "grad_norm": 11.359096408502863, + "kl": 0.2138671875, + "learning_rate": 9.498217014773305e-07, + "loss": 0.0086, + "reward": 1.2498727440834045, + "reward_std": 0.5983296632766724, + "rewards/accuracy_reward": 0.37812499701976776, + "rewards/cosine_rewards": -0.0031815596157684922, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -7.070136052789167e-05, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.234375, + "epoch": 0.10086602139582272, + "grad_norm": 13.920508776432966, + "kl": 0.12255859375, + "learning_rate": 9.495669893020886e-07, + "loss": 0.0049, + "reward": 0.6862081587314606, + "reward_std": 0.7485357820987701, + "rewards/accuracy_reward": -0.2656250223517418, + "rewards/cosine_rewards": -0.01686593284830451, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -5.089576370664872e-05, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.140625, + "epoch": 0.10137544574630668, + "grad_norm": 8.671114765474545, + "kl": 0.124267578125, + "learning_rate": 9.493122771268466e-07, + "loss": 0.005, + "reward": 1.1818422079086304, + "reward_std": 0.5584293901920319, + "rewards/accuracy_reward": 0.29375000298023224, + "rewards/cosine_rewards": -0.002258662148960866, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0002741228090599179, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.875, + "epoch": 0.10188487009679063, + "grad_norm": 10.078840755345926, + "kl": 0.126708984375, + "learning_rate": 9.490575649516046e-07, + "loss": 0.0051, + "reward": 1.3181660771369934, + "reward_std": 0.6019489467144012, + "rewards/accuracy_reward": 0.40937498211860657, + "rewards/cosine_rewards": 0.002541057765483856, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": 0.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.515625, + "epoch": 0.10239429444727458, + "grad_norm": 5.555372749575627, + "kl": 0.130859375, + "learning_rate": 9.488028527763627e-07, + "loss": 0.0052, + "reward": 1.5905040502548218, + "reward_std": 0.41512130200862885, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.006129102781414986, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.03125, + "epoch": 0.10290371879775853, + "grad_norm": 10.052848791937414, + "kl": 0.12890625, + "learning_rate": 9.485481406011207e-07, + "loss": 0.0051, + "reward": 1.1239948272705078, + "reward_std": 0.9119550585746765, + "rewards/accuracy_reward": 0.26875001192092896, + "rewards/cosine_rewards": -0.004009488970041275, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.00012065636838087812, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.5, + "epoch": 0.10341314314824249, + "grad_norm": 11.024825341246498, + "kl": 0.115966796875, + "learning_rate": 9.482934284258787e-07, + "loss": 0.0046, + "reward": 0.9447762966156006, + "reward_std": 0.800986647605896, + "rewards/accuracy_reward": 0.12812498584389687, + "rewards/cosine_rewards": -0.011282204184681177, + "rewards/format_reward": 0.828125, + "rewards/repetition_rewards": -0.00019145716942148283, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.375, + "epoch": 0.10392256749872644, + "grad_norm": 7.842670844504004, + "kl": 0.118408203125, + "learning_rate": 9.480387162506367e-07, + "loss": 0.0047, + "reward": 1.2975149750709534, + "reward_std": 0.6046717762947083, + "rewards/accuracy_reward": 0.40937497094273567, + "rewards/cosine_rewards": -0.0023666354827582836, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.00011837121564894915, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.234375, + "epoch": 0.1044319918492104, + "grad_norm": 20.462186918743633, + "kl": 0.125732421875, + "learning_rate": 9.477840040753947e-07, + "loss": 0.005, + "reward": 1.0097321271896362, + "reward_std": 0.4872446656227112, + "rewards/accuracy_reward": 0.1250000149011612, + "rewards/cosine_rewards": -0.005829372443258762, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -6.351625779643655e-05, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.328125, + "epoch": 0.10494141619969434, + "grad_norm": 18.578745554695768, + "kl": 0.1298828125, + "learning_rate": 9.475292919001527e-07, + "loss": 0.0052, + "reward": 0.8584832549095154, + "reward_std": 0.5744369626045227, + "rewards/accuracy_reward": 0.040624991059303284, + "rewards/cosine_rewards": -0.010225818026810884, + "rewards/format_reward": 0.828125, + "rewards/repetition_rewards": -4.101049853488803e-05, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.234375, + "epoch": 0.1054508405501783, + "grad_norm": 20.30723423804259, + "kl": 0.11376953125, + "learning_rate": 9.472745797249107e-07, + "loss": 0.0045, + "reward": 1.0917281210422516, + "reward_std": 0.4984763488173485, + "rewards/accuracy_reward": 0.24062497913837433, + "rewards/cosine_rewards": -0.008111415430903435, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0001604560275154654, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.28125, + "epoch": 0.10596026490066225, + "grad_norm": 12.813170138938949, + "kl": 0.130859375, + "learning_rate": 9.470198675496688e-07, + "loss": 0.0052, + "reward": 1.258288562297821, + "reward_std": 0.4630318433046341, + "rewards/accuracy_reward": 0.3812500238418579, + "rewards/cosine_rewards": 0.0022279657423496246, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.00018934992840513587, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.0, + "epoch": 0.10646968925114621, + "grad_norm": 10.281274330223528, + "kl": 0.15185546875, + "learning_rate": 9.467651553744268e-07, + "loss": 0.0061, + "reward": 1.2817729711532593, + "reward_std": 0.4518425017595291, + "rewards/accuracy_reward": 0.40937501937150955, + "rewards/cosine_rewards": -0.0026020415825769305, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": 0.0, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.90625, + "epoch": 0.10697911360163016, + "grad_norm": 5.991490542584776, + "kl": 0.118896484375, + "learning_rate": 9.465104431991848e-07, + "loss": 0.0047, + "reward": 1.537351131439209, + "reward_std": 0.5478895753622055, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/cosine_rewards": 0.00610114517621696, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": 0.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.015625, + "epoch": 0.1074885379521141, + "grad_norm": 60.00536554673066, + "kl": 0.115966796875, + "learning_rate": 9.462557310239428e-07, + "loss": 0.0046, + "reward": 0.7785031795501709, + "reward_std": 0.42832519114017487, + "rewards/accuracy_reward": -0.1250000223517418, + "rewards/cosine_rewards": -0.018284045159816742, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -8.778089977568015e-05, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.21875, + "epoch": 0.10799796230259806, + "grad_norm": 21.72003474650607, + "kl": 0.1220703125, + "learning_rate": 9.460010188487009e-07, + "loss": 0.0049, + "reward": 1.0551989674568176, + "reward_std": 0.46487441658973694, + "rewards/accuracy_reward": 0.18437497317790985, + "rewards/cosine_rewards": -0.004139983095228672, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -3.600230411393568e-05, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.609375, + "epoch": 0.10850738665308202, + "grad_norm": 13.593112035734373, + "kl": 0.118896484375, + "learning_rate": 9.457463066734589e-07, + "loss": 0.0048, + "reward": 1.4092811346054077, + "reward_std": 0.6851305663585663, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": 0.009402429801411927, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": -0.00012127523950766772, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.65625, + "epoch": 0.10901681100356597, + "grad_norm": 102.50188740817565, + "kl": 0.114013671875, + "learning_rate": 9.45491594498217e-07, + "loss": 0.0046, + "reward": 1.3722986578941345, + "reward_std": 0.5870523750782013, + "rewards/accuracy_reward": 0.5218750089406967, + "rewards/cosine_rewards": -0.008539619389921427, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.000411735316447448, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.796875, + "epoch": 0.10952623535404993, + "grad_norm": 17.000809347446246, + "kl": 0.114013671875, + "learning_rate": 9.452368823229751e-07, + "loss": 0.0046, + "reward": 1.182218611240387, + "reward_std": 0.6600647866725922, + "rewards/accuracy_reward": 0.265625, + "rewards/cosine_rewards": -0.02086095977574587, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -4.542151145869866e-05, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.796875, + "epoch": 0.11003565970453388, + "grad_norm": 21.15698667263886, + "kl": 0.107666015625, + "learning_rate": 9.449821701477331e-07, + "loss": 0.0043, + "reward": 1.1834356784820557, + "reward_std": 0.686463937163353, + "rewards/accuracy_reward": 0.2968749925494194, + "rewards/cosine_rewards": -0.019689313136041164, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": 0.0, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.296875, + "epoch": 0.11054508405501783, + "grad_norm": 7.9150578217370535, + "kl": 0.09814453125, + "learning_rate": 9.447274579724911e-07, + "loss": 0.0039, + "reward": 1.0662736892700195, + "reward_std": 0.8222787380218506, + "rewards/accuracy_reward": 0.18437499552965164, + "rewards/cosine_rewards": -0.0396728478372097, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.00030338978831423447, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.671875, + "epoch": 0.11105450840550178, + "grad_norm": 10.397918174345362, + "kl": 0.1357421875, + "learning_rate": 9.444727457972492e-07, + "loss": 0.0054, + "reward": 1.6620882153511047, + "reward_std": 0.6701975017786026, + "rewards/accuracy_reward": 0.71875, + "rewards/cosine_rewards": 0.021556629799306393, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -9.343791316496208e-05, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.984375, + "epoch": 0.11156393275598574, + "grad_norm": 16.664810091620872, + "kl": 0.09619140625, + "learning_rate": 9.442180336220072e-07, + "loss": 0.0038, + "reward": 0.6033791899681091, + "reward_std": 0.47761378437280655, + "rewards/accuracy_reward": -0.2656250149011612, + "rewards/cosine_rewards": -0.0521757323294878, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0006950152310309932, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.359375, + "epoch": 0.11207335710646969, + "grad_norm": 6.030171938320145, + "kl": 0.09423828125, + "learning_rate": 9.439633214467651e-07, + "loss": 0.0038, + "reward": 1.0732125043869019, + "reward_std": 0.532948449254036, + "rewards/accuracy_reward": 0.18437499180436134, + "rewards/cosine_rewards": -0.017268475145101547, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.00014401252064999426, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.921875, + "epoch": 0.11258278145695365, + "grad_norm": 13.578372073106125, + "kl": 0.08740234375, + "learning_rate": 9.437086092715231e-07, + "loss": 0.0035, + "reward": 1.089949607849121, + "reward_std": 0.7014666199684143, + "rewards/accuracy_reward": 0.18437499180436134, + "rewards/cosine_rewards": -0.04696316970512271, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0005872593028470874, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.15625, + "epoch": 0.11309220580743759, + "grad_norm": 9.243544591708364, + "kl": 0.099609375, + "learning_rate": 9.434538970962812e-07, + "loss": 0.004, + "reward": 1.25474151968956, + "reward_std": 0.42495501041412354, + "rewards/accuracy_reward": 0.2968750074505806, + "rewards/cosine_rewards": -0.010176160372793674, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0007073541928548366, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.9375, + "epoch": 0.11360163015792155, + "grad_norm": 11.854145682727308, + "kl": 0.09423828125, + "learning_rate": 9.431991849210392e-07, + "loss": 0.0038, + "reward": 1.336020827293396, + "reward_std": 0.5929334163665771, + "rewards/accuracy_reward": 0.3812499977648258, + "rewards/cosine_rewards": 0.0018316814675927162, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0001858295945567079, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.109375, + "epoch": 0.1141110545084055, + "grad_norm": 9.830726171785136, + "kl": 0.13330078125, + "learning_rate": 9.429444727457972e-07, + "loss": 0.0053, + "reward": 0.9984832406044006, + "reward_std": 0.45606285333633423, + "rewards/accuracy_reward": 0.043749988079071045, + "rewards/cosine_rewards": -0.014016739558428526, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.640625, + "epoch": 0.11462047885888946, + "grad_norm": 9.176338530222994, + "kl": 0.115234375, + "learning_rate": 9.426897605705553e-07, + "loss": 0.0046, + "reward": 1.2019062638282776, + "reward_std": 0.7189642786979675, + "rewards/accuracy_reward": 0.29374999925494194, + "rewards/cosine_rewards": -0.01371871994342655, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.71875, + "epoch": 0.11512990320937341, + "grad_norm": 11.349158424596924, + "kl": 0.110107421875, + "learning_rate": 9.424350483953133e-07, + "loss": 0.0044, + "reward": 1.3144216537475586, + "reward_std": 0.4914311468601227, + "rewards/accuracy_reward": 0.32499998807907104, + "rewards/cosine_rewards": 0.005097148037748411, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -5.056634472566657e-05, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.390625, + "epoch": 0.11563932755985736, + "grad_norm": 8.61981591225416, + "kl": 0.105712890625, + "learning_rate": 9.421803362200713e-07, + "loss": 0.0042, + "reward": 1.064522534608841, + "reward_std": 0.3509945422410965, + "rewards/accuracy_reward": 0.15312501788139343, + "rewards/cosine_rewards": -0.010458544362336397, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -1.8939394067274407e-05, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.75, + "epoch": 0.11614875191034131, + "grad_norm": 12.524545820572607, + "kl": 0.106689453125, + "learning_rate": 9.419256240448294e-07, + "loss": 0.0043, + "reward": 1.3913479149341583, + "reward_std": 0.2862061709165573, + "rewards/accuracy_reward": 0.40937498956918716, + "rewards/cosine_rewards": -0.0021625147201120853, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00023957982193678617, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5, + "epoch": 0.11665817626082527, + "grad_norm": 17.83927533701907, + "kl": 0.13232421875, + "learning_rate": 9.416709118695874e-07, + "loss": 0.0053, + "reward": 1.5334136486053467, + "reward_std": 0.45667168498039246, + "rewards/accuracy_reward": 0.6062500029802322, + "rewards/cosine_rewards": 0.005288586835376918, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.671875, + "epoch": 0.11716760061130922, + "grad_norm": 23.81656431934888, + "kl": 0.108642578125, + "learning_rate": 9.414161996943454e-07, + "loss": 0.0043, + "reward": 1.1264008283615112, + "reward_std": 0.6892756521701813, + "rewards/accuracy_reward": 0.21249999105930328, + "rewards/cosine_rewards": -0.02350334101356566, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -9.586199303157628e-05, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.59375, + "epoch": 0.11767702496179318, + "grad_norm": 28.467466655884312, + "kl": 0.12109375, + "learning_rate": 9.411614875191034e-07, + "loss": 0.0048, + "reward": 1.4944193363189697, + "reward_std": 0.3786798119544983, + "rewards/accuracy_reward": 0.518750011920929, + "rewards/cosine_rewards": 0.0069862306118011475, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -6.69164874125272e-05, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.40625, + "epoch": 0.11818644931227713, + "grad_norm": 9.248494107592876, + "kl": 0.12060546875, + "learning_rate": 9.409067753438615e-07, + "loss": 0.0048, + "reward": 1.2941021919250488, + "reward_std": 0.5259552597999573, + "rewards/accuracy_reward": 0.3749999776482582, + "rewards/cosine_rewards": 0.012875130865722895, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -2.2944199372432195e-05, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.15625, + "epoch": 0.11869587366276108, + "grad_norm": 14.194855529249866, + "kl": 0.107666015625, + "learning_rate": 9.406520631686195e-07, + "loss": 0.0043, + "reward": 1.4532509446144104, + "reward_std": 0.47306837141513824, + "rewards/accuracy_reward": 0.46562501788139343, + "rewards/cosine_rewards": 0.003419560845941305, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00016866176156327128, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.875, + "epoch": 0.11920529801324503, + "grad_norm": 24.881210594835412, + "kl": 0.0986328125, + "learning_rate": 9.403973509933774e-07, + "loss": 0.0039, + "reward": 0.9971878528594971, + "reward_std": 0.8903799057006836, + "rewards/accuracy_reward": 0.09375, + "rewards/cosine_rewards": -0.018273995257914066, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.00016315293032675982, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.28125, + "epoch": 0.11971472236372899, + "grad_norm": 4.4996951306965975, + "kl": 0.08544921875, + "learning_rate": 9.401426388181355e-07, + "loss": 0.0034, + "reward": 1.3378186225891113, + "reward_std": 0.8512288331985474, + "rewards/accuracy_reward": 0.4593750089406967, + "rewards/cosine_rewards": -0.027322867885231972, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.000483501615235582, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.765625, + "epoch": 0.12022414671421294, + "grad_norm": 5.402908709299499, + "kl": 0.080322265625, + "learning_rate": 9.398879266428935e-07, + "loss": 0.0032, + "reward": 1.4937435388565063, + "reward_std": 0.35082364082336426, + "rewards/accuracy_reward": 0.5499999821186066, + "rewards/cosine_rewards": -0.024816589895635843, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0001899001763376873, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.796875, + "epoch": 0.1207335710646969, + "grad_norm": 11.178072528468537, + "kl": 0.0947265625, + "learning_rate": 9.396332144676515e-07, + "loss": 0.0038, + "reward": 1.1548867225646973, + "reward_std": 0.8218154907226562, + "rewards/accuracy_reward": 0.23749998956918716, + "rewards/cosine_rewards": -0.0042152018286287785, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0002730985652306117, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.640625, + "epoch": 0.12124299541518084, + "grad_norm": 3.9623481852584983, + "kl": 0.078857421875, + "learning_rate": 9.393785022924095e-07, + "loss": 0.0032, + "reward": 1.2667301297187805, + "reward_std": 0.8781076371669769, + "rewards/accuracy_reward": 0.40937498211860657, + "rewards/cosine_rewards": -0.04850983805954456, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0003849874483421445, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.515625, + "epoch": 0.1217524197656648, + "grad_norm": 5.742693044990549, + "kl": 0.094482421875, + "learning_rate": 9.391237901171676e-07, + "loss": 0.0038, + "reward": 0.6565631031990051, + "reward_std": 0.7225559949874878, + "rewards/accuracy_reward": -0.15312501043081284, + "rewards/cosine_rewards": -0.08046763762831688, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.00046927113726269454, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.265625, + "epoch": 0.12226184411614875, + "grad_norm": 7.476762059683771, + "kl": 0.08984375, + "learning_rate": 9.388690779419256e-07, + "loss": 0.0036, + "reward": 1.2646641731262207, + "reward_std": 0.35142165422439575, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -0.016350463964045048, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00023539320682175457, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.125, + "epoch": 0.12277126846663271, + "grad_norm": 7.077545719384053, + "kl": 0.101318359375, + "learning_rate": 9.386143657666836e-07, + "loss": 0.0041, + "reward": 0.9919856488704681, + "reward_std": 0.6525652855634689, + "rewards/accuracy_reward": 0.043749988079071045, + "rewards/cosine_rewards": -0.020295456051826477, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.00021890102652832866, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.765625, + "epoch": 0.12328069281711666, + "grad_norm": 8.441107538365218, + "kl": 0.1044921875, + "learning_rate": 9.383596535914417e-07, + "loss": 0.0042, + "reward": 1.616421401500702, + "reward_std": 0.3022947758436203, + "rewards/accuracy_reward": 0.6593749970197678, + "rewards/cosine_rewards": 0.004121019504964352, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.00019959894416388124, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.796875, + "epoch": 0.1237901171676006, + "grad_norm": 9.380134906229868, + "kl": 0.11083984375, + "learning_rate": 9.381049414161997e-07, + "loss": 0.0044, + "reward": 1.3176445960998535, + "reward_std": 0.3997122645378113, + "rewards/accuracy_reward": 0.32499998807907104, + "rewards/cosine_rewards": -0.0073250585701316595, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -3.028100763913244e-05, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.53125, + "epoch": 0.12429954151808456, + "grad_norm": 5.4472083052212765, + "kl": 0.109375, + "learning_rate": 9.378502292409577e-07, + "loss": 0.0044, + "reward": 1.6422365307807922, + "reward_std": 0.2728146519511938, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.01141381449997425, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.00042724609375, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.828125, + "epoch": 0.12480896586856852, + "grad_norm": 8.442731808091501, + "kl": 0.1083984375, + "learning_rate": 9.375955170657157e-07, + "loss": 0.0043, + "reward": 1.3157773613929749, + "reward_std": 0.4320952445268631, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": 0.009643017314374447, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.00011561772407731041, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.6875, + "epoch": 0.12531839021905247, + "grad_norm": 14.294535330077375, + "kl": 0.1171875, + "learning_rate": 9.373408048904738e-07, + "loss": 0.0047, + "reward": 1.331631362438202, + "reward_std": 0.4216170907020569, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": 0.00982090923935175, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -6.454958565882407e-05, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.984375, + "epoch": 0.12582781456953643, + "grad_norm": 9.55932148178992, + "kl": 0.108642578125, + "learning_rate": 9.370860927152318e-07, + "loss": 0.0043, + "reward": 1.340530276298523, + "reward_std": 0.4534093588590622, + "rewards/accuracy_reward": 0.3531250096857548, + "rewards/cosine_rewards": 0.003091069171205163, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -6.0797665355494246e-05, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.234375, + "epoch": 0.12633723892002038, + "grad_norm": 17.80951371391722, + "kl": 0.110595703125, + "learning_rate": 9.368313805399897e-07, + "loss": 0.0044, + "reward": 1.2751246690750122, + "reward_std": 0.520209550857544, + "rewards/accuracy_reward": 0.2968749850988388, + "rewards/cosine_rewards": 0.010451191570609808, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0009515111669315957, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.625, + "epoch": 0.12684666327050434, + "grad_norm": 12.086775377390172, + "kl": 0.111083984375, + "learning_rate": 9.365766683647478e-07, + "loss": 0.0044, + "reward": 0.8402246385812759, + "reward_std": 0.6177513003349304, + "rewards/accuracy_reward": -0.04062502086162567, + "rewards/cosine_rewards": -0.04077841015532613, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0002469850951456465, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.125, + "epoch": 0.1273560876209883, + "grad_norm": 11.748922331365623, + "kl": 0.0869140625, + "learning_rate": 9.363219561895058e-07, + "loss": 0.0035, + "reward": 1.7429784536361694, + "reward_std": 0.6669142842292786, + "rewards/accuracy_reward": 0.746874988079071, + "rewards/cosine_rewards": 0.02791230659931898, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.000558776329853572, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.34375, + "epoch": 0.12786551197147222, + "grad_norm": 5.635036318066103, + "kl": 0.073974609375, + "learning_rate": 9.360672440142638e-07, + "loss": 0.003, + "reward": 1.3756027221679688, + "reward_std": 0.3430413454771042, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": -0.01460547186434269, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0004167625156696886, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.78125, + "epoch": 0.12837493632195618, + "grad_norm": 3.7721855945126013, + "kl": 0.071533203125, + "learning_rate": 9.358125318390219e-07, + "loss": 0.0029, + "reward": 1.1054343283176422, + "reward_std": 0.9506143927574158, + "rewards/accuracy_reward": 0.23749998211860657, + "rewards/cosine_rewards": -0.06930245459079742, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.00026325164799345657, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.8125, + "epoch": 0.12888436067244013, + "grad_norm": 4.515905173052182, + "kl": 0.06494140625, + "learning_rate": 9.355578196637799e-07, + "loss": 0.0026, + "reward": 1.1395662426948547, + "reward_std": 1.164560616016388, + "rewards/accuracy_reward": 0.24062499403953552, + "rewards/cosine_rewards": -0.06932513415813446, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0004836731095565483, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 750.703125, + "epoch": 0.1293937850229241, + "grad_norm": 3.759822074251192, + "kl": 0.0609130859375, + "learning_rate": 9.353031074885379e-07, + "loss": 0.0024, + "reward": 1.3212904930114746, + "reward_std": 0.9738726019859314, + "rewards/accuracy_reward": 0.4625000059604645, + "rewards/cosine_rewards": -0.03070250153541565, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0011320026533212513, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.40625, + "epoch": 0.12990320937340805, + "grad_norm": 4.697641184697458, + "kl": 0.082763671875, + "learning_rate": 9.350483953132959e-07, + "loss": 0.0033, + "reward": 1.1115484535694122, + "reward_std": 0.7478219866752625, + "rewards/accuracy_reward": 0.23125000298023224, + "rewards/cosine_rewards": -0.05675292294472456, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.00044860908383270726, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.84375, + "epoch": 0.130412633723892, + "grad_norm": 4.028619362240205, + "kl": 0.09765625, + "learning_rate": 9.34793683138054e-07, + "loss": 0.0039, + "reward": 1.457118034362793, + "reward_std": 0.788001298904419, + "rewards/accuracy_reward": 0.5218749791383743, + "rewards/cosine_rewards": -0.0015127966180443764, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0007440973713528365, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.40625, + "epoch": 0.13092205807437596, + "grad_norm": 5.63417318829876, + "kl": 0.07275390625, + "learning_rate": 9.34538970962812e-07, + "loss": 0.0029, + "reward": 1.4154019951820374, + "reward_std": 0.815990686416626, + "rewards/accuracy_reward": 0.518750011920929, + "rewards/cosine_rewards": -0.02476619742810726, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.00045683811185881495, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 676.984375, + "epoch": 0.1314314824248599, + "grad_norm": 6.021417699991294, + "kl": 0.065673828125, + "learning_rate": 9.3428425878757e-07, + "loss": 0.0026, + "reward": 0.6927553117275238, + "reward_std": 0.8777336776256561, + "rewards/accuracy_reward": -0.0781250074505806, + "rewards/cosine_rewards": -0.15041033178567886, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0005843567778356373, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.203125, + "epoch": 0.13194090677534387, + "grad_norm": 8.306056976533926, + "kl": 0.082763671875, + "learning_rate": 9.340295466123281e-07, + "loss": 0.0033, + "reward": 1.2112269699573517, + "reward_std": 0.9674933552742004, + "rewards/accuracy_reward": 0.43437499552965164, + "rewards/cosine_rewards": -0.08218972198665142, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0003333477216074243, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.484375, + "epoch": 0.13245033112582782, + "grad_norm": 6.232070420425315, + "kl": 0.06689453125, + "learning_rate": 9.337748344370861e-07, + "loss": 0.0027, + "reward": 1.0117461681365967, + "reward_std": 0.7802118062973022, + "rewards/accuracy_reward": 0.21249999478459358, + "rewards/cosine_rewards": -0.09079772233963013, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0005810301227029413, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.5625, + "epoch": 0.13295975547631178, + "grad_norm": 6.334798789966088, + "kl": 0.08544921875, + "learning_rate": 9.335201222618441e-07, + "loss": 0.0034, + "reward": 1.0542153716087341, + "reward_std": 0.8291297852993011, + "rewards/accuracy_reward": 0.18124999105930328, + "rewards/cosine_rewards": -0.032575659453868866, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0007089868013281375, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.015625, + "epoch": 0.1334691798267957, + "grad_norm": 10.460916199726386, + "kl": 0.098388671875, + "learning_rate": 9.33265410086602e-07, + "loss": 0.0039, + "reward": 0.6651052087545395, + "reward_std": 0.9073293209075928, + "rewards/accuracy_reward": -0.09687501192092896, + "rewards/cosine_rewards": -0.03463773522526026, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.000257108491496183, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.109375, + "epoch": 0.13397860417727966, + "grad_norm": 105.36035227056938, + "kl": 0.10546875, + "learning_rate": 9.330106979113601e-07, + "loss": 0.0042, + "reward": 1.6959076523780823, + "reward_std": 0.6433850526809692, + "rewards/accuracy_reward": 0.7374999523162842, + "rewards/cosine_rewards": 0.036790573969483376, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0002579164138296619, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.671875, + "epoch": 0.13448802852776362, + "grad_norm": 12.269079038415155, + "kl": 0.1044921875, + "learning_rate": 9.327559857361181e-07, + "loss": 0.0042, + "reward": 1.3304521441459656, + "reward_std": 0.7337057292461395, + "rewards/accuracy_reward": 0.40312500298023224, + "rewards/cosine_rewards": -0.009734044317156076, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0004387954395497218, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.421875, + "epoch": 0.13499745287824758, + "grad_norm": 5.0495702848173805, + "kl": 0.12451171875, + "learning_rate": 9.325012735608761e-07, + "loss": 0.005, + "reward": 1.5114508867263794, + "reward_std": 0.4991532266139984, + "rewards/accuracy_reward": 0.6031249910593033, + "rewards/cosine_rewards": 0.0021946561755612493, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0001188212918350473, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.890625, + "epoch": 0.13550687722873153, + "grad_norm": 8.380147213080475, + "kl": 0.11376953125, + "learning_rate": 9.322465613856342e-07, + "loss": 0.0046, + "reward": 1.3185867071151733, + "reward_std": 0.5081266015768051, + "rewards/accuracy_reward": 0.37812499701976776, + "rewards/cosine_rewards": 0.0029779861215502024, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -1.6225338185904548e-05, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.4375, + "epoch": 0.1360163015792155, + "grad_norm": 6.7670554711392725, + "kl": 0.1259765625, + "learning_rate": 9.319918492103922e-07, + "loss": 0.005, + "reward": 1.917210876941681, + "reward_std": 0.2323581874370575, + "rewards/accuracy_reward": 0.96875, + "rewards/cosine_rewards": 0.011114767286926508, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.00015385003644041717, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 0.13652572592969944, + "grad_norm": 7.285618988190272, + "kl": 0.119873046875, + "learning_rate": 9.317371370351502e-07, + "loss": 0.0048, + "reward": 1.2626032829284668, + "reward_std": 0.6921159029006958, + "rewards/accuracy_reward": 0.34687499701976776, + "rewards/cosine_rewards": -0.006146675441414118, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": 0.0, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.953125, + "epoch": 0.1370351502801834, + "grad_norm": 10.577916333197056, + "kl": 0.140625, + "learning_rate": 9.314824248599083e-07, + "loss": 0.0056, + "reward": 1.2036974430084229, + "reward_std": 0.5991593599319458, + "rewards/accuracy_reward": 0.2968749850988388, + "rewards/cosine_rewards": 0.0007108037825673819, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0001382743357680738, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.34375, + "epoch": 0.13754457463066735, + "grad_norm": 14.293416719573916, + "kl": 0.1201171875, + "learning_rate": 9.312277126846663e-07, + "loss": 0.0048, + "reward": 1.2185573279857635, + "reward_std": 0.43015679717063904, + "rewards/accuracy_reward": 0.24062500149011612, + "rewards/cosine_rewards": -0.006263321032747626, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00017934850984602235, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.46875, + "epoch": 0.1380539989811513, + "grad_norm": 9.393354579467495, + "kl": 0.1240234375, + "learning_rate": 9.309730005094243e-07, + "loss": 0.005, + "reward": 1.5472444295883179, + "reward_std": 0.5279964953660965, + "rewards/accuracy_reward": 0.606249988079071, + "rewards/cosine_rewards": 0.003494387026876211, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.609375, + "epoch": 0.13856342333163527, + "grad_norm": 7.200843784037537, + "kl": 0.117431640625, + "learning_rate": 9.307182883341823e-07, + "loss": 0.0047, + "reward": 1.3769221901893616, + "reward_std": 0.4806235730648041, + "rewards/accuracy_reward": 0.40937501937150955, + "rewards/cosine_rewards": -0.0011419787188060582, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -6.0797665355494246e-05, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.828125, + "epoch": 0.1390728476821192, + "grad_norm": 11.003082967675637, + "kl": 0.18359375, + "learning_rate": 9.304635761589404e-07, + "loss": 0.0073, + "reward": 1.3800683617591858, + "reward_std": 0.4095611423254013, + "rewards/accuracy_reward": 0.40937498211860657, + "rewards/cosine_rewards": 0.0019433526322245598, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": 0.0, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.453125, + "epoch": 0.13958227203260315, + "grad_norm": 6.9562429559567285, + "kl": 0.130859375, + "learning_rate": 9.302088639836984e-07, + "loss": 0.0052, + "reward": 1.424567699432373, + "reward_std": 0.2551300157792866, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": 0.0028896235453430563, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0001969077275134623, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.296875, + "epoch": 0.1400916963830871, + "grad_norm": 9.330566123712217, + "kl": 0.1240234375, + "learning_rate": 9.299541518084564e-07, + "loss": 0.005, + "reward": 1.2650930285453796, + "reward_std": 0.42955365777015686, + "rewards/accuracy_reward": 0.32499999552965164, + "rewards/cosine_rewards": 0.0025930306874215603, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.75, + "epoch": 0.14060112073357106, + "grad_norm": 8.495983071057866, + "kl": 0.11962890625, + "learning_rate": 9.296994396332144e-07, + "loss": 0.0048, + "reward": 1.8627826571464539, + "reward_std": 0.2839447557926178, + "rewards/accuracy_reward": 0.859375, + "rewards/cosine_rewards": 0.019193909130990505, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0001612851265235804, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.515625, + "epoch": 0.14111054508405502, + "grad_norm": 12.855580556594866, + "kl": 0.14306640625, + "learning_rate": 9.294447274579724e-07, + "loss": 0.0057, + "reward": 1.5162805318832397, + "reward_std": 0.6588033437728882, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": -0.008702149149030447, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -1.7208149074576795e-05, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.484375, + "epoch": 0.14161996943453897, + "grad_norm": 10.357622467045976, + "kl": 0.102783203125, + "learning_rate": 9.291900152827304e-07, + "loss": 0.0041, + "reward": 1.1203789710998535, + "reward_std": 0.6814777851104736, + "rewards/accuracy_reward": 0.17812500894069672, + "rewards/cosine_rewards": -0.010577938985079527, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0002930604387074709, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.390625, + "epoch": 0.14212939378502293, + "grad_norm": 23.483530147678458, + "kl": 0.114013671875, + "learning_rate": 9.289353031074884e-07, + "loss": 0.0046, + "reward": 1.3522316813468933, + "reward_std": 0.28182537853717804, + "rewards/accuracy_reward": 0.3812500238418579, + "rewards/cosine_rewards": 0.0023158364929258823, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -8.418447396252304e-05, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.96875, + "epoch": 0.14263881813550688, + "grad_norm": 5.804128123317947, + "kl": 0.109619140625, + "learning_rate": 9.286805909322465e-07, + "loss": 0.0044, + "reward": 1.2828457355499268, + "reward_std": 0.5574119389057159, + "rewards/accuracy_reward": 0.3500000163912773, + "rewards/cosine_rewards": -0.004654169548302889, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": 0.0, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.984375, + "epoch": 0.14314824248599084, + "grad_norm": 9.062411976412948, + "kl": 0.09130859375, + "learning_rate": 9.284258787570045e-07, + "loss": 0.0037, + "reward": 1.9385767579078674, + "reward_std": 0.3151838555932045, + "rewards/accuracy_reward": 0.9437500238418579, + "rewards/cosine_rewards": 0.04203657992184162, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0003348248792462982, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.875, + "epoch": 0.1436576668364748, + "grad_norm": 8.221976106115973, + "kl": 0.104248046875, + "learning_rate": 9.281711665817625e-07, + "loss": 0.0042, + "reward": 1.323907494544983, + "reward_std": 0.6098371148109436, + "rewards/accuracy_reward": 0.40312501788139343, + "rewards/cosine_rewards": 0.014727211673744023, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.00019470852021186147, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.21875, + "epoch": 0.14416709118695872, + "grad_norm": 7.812929807725803, + "kl": 0.084228515625, + "learning_rate": 9.279164544065206e-07, + "loss": 0.0034, + "reward": 1.3660696744918823, + "reward_std": 0.6207956671714783, + "rewards/accuracy_reward": 0.46562500298023224, + "rewards/cosine_rewards": -0.005538210505619645, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0002670584217412397, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.6875, + "epoch": 0.14467651553744268, + "grad_norm": 8.491615227525342, + "kl": 0.08056640625, + "learning_rate": 9.276617422312786e-07, + "loss": 0.0032, + "reward": 1.3353699743747711, + "reward_std": 0.5946642160415649, + "rewards/accuracy_reward": 0.40937500447034836, + "rewards/cosine_rewards": -0.02690817415714264, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0002218634108430706, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.984375, + "epoch": 0.14518593988792663, + "grad_norm": 16.68462964732329, + "kl": 0.077880859375, + "learning_rate": 9.274070300560366e-07, + "loss": 0.0031, + "reward": 0.9605185687541962, + "reward_std": 0.7793702185153961, + "rewards/accuracy_reward": 0.09999999031424522, + "rewards/cosine_rewards": -0.06100003980100155, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0003564156068023294, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.3125, + "epoch": 0.1456953642384106, + "grad_norm": 10.597989534798653, + "kl": 0.068115234375, + "learning_rate": 9.271523178807946e-07, + "loss": 0.0027, + "reward": 1.1927469968795776, + "reward_std": 1.0099957585334778, + "rewards/accuracy_reward": 0.34999997913837433, + "rewards/cosine_rewards": -0.04736426845192909, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0005137407861184329, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.234375, + "epoch": 0.14620478858889455, + "grad_norm": 5.399955402557674, + "kl": 0.072265625, + "learning_rate": 9.268976057055527e-07, + "loss": 0.0029, + "reward": 0.821646511554718, + "reward_std": 0.9464232325553894, + "rewards/accuracy_reward": 0.03749999776482582, + "rewards/cosine_rewards": -0.10573448240756989, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0007440397967002355, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.796875, + "epoch": 0.1467142129393785, + "grad_norm": 5.9108976297695355, + "kl": 0.075439453125, + "learning_rate": 9.266428935303107e-07, + "loss": 0.003, + "reward": 1.8053097128868103, + "reward_std": 0.5278272330760956, + "rewards/accuracy_reward": 0.7749999761581421, + "rewards/cosine_rewards": 0.061956772580742836, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0003969733224948868, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.484375, + "epoch": 0.14722363728986246, + "grad_norm": 4.280094122642851, + "kl": 0.0692138671875, + "learning_rate": 9.263881813550687e-07, + "loss": 0.0028, + "reward": 0.7580513060092926, + "reward_std": 0.9215057492256165, + "rewards/accuracy_reward": -0.04062502086162567, + "rewards/cosine_rewards": -0.1223737820982933, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0008249446109402925, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.03125, + "epoch": 0.1477330616403464, + "grad_norm": 4.704795726585343, + "kl": 0.068359375, + "learning_rate": 9.261334691798267e-07, + "loss": 0.0027, + "reward": 1.0915009379386902, + "reward_std": 0.6004486382007599, + "rewards/accuracy_reward": 0.21249999105930328, + "rewards/cosine_rewards": -0.05759305879473686, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0009060115553438663, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.375, + "epoch": 0.14824248599083037, + "grad_norm": 5.468968593755717, + "kl": 0.065185546875, + "learning_rate": 9.258787570045847e-07, + "loss": 0.0026, + "reward": 1.328648567199707, + "reward_std": 0.8502229452133179, + "rewards/accuracy_reward": 0.40312500298023224, + "rewards/cosine_rewards": -0.027191368862986565, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.00041003923979587853, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 800.390625, + "epoch": 0.14875191034131433, + "grad_norm": 2.644913201802289, + "kl": 0.07861328125, + "learning_rate": 9.256240448293427e-07, + "loss": 0.0031, + "reward": 1.5775163769721985, + "reward_std": 0.6978716552257538, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/cosine_rewards": 0.031187113374471664, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0005458263913169503, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.328125, + "epoch": 0.14926133469179828, + "grad_norm": 3.858694298808609, + "kl": 0.0548095703125, + "learning_rate": 9.253693326541008e-07, + "loss": 0.0022, + "reward": 0.39561687409877777, + "reward_std": 1.1356619894504547, + "rewards/accuracy_reward": -0.1625000238418579, + "rewards/cosine_rewards": -0.23805859684944153, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.0006995665607973933, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 1044.703125, + "epoch": 0.1497707590422822, + "grad_norm": 2.0660275837501643, + "kl": 0.0902099609375, + "learning_rate": 9.251146204788588e-07, + "loss": 0.0036, + "reward": 1.0626700818538666, + "reward_std": 1.1662874221801758, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": -0.055325835943222046, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": -0.0007540385995525867, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.28125, + "epoch": 0.15028018339276616, + "grad_norm": 7.548230617974183, + "kl": 0.0538330078125, + "learning_rate": 9.248599083036168e-07, + "loss": 0.0022, + "reward": 1.2535955309867859, + "reward_std": 1.0525287985801697, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/cosine_rewards": -0.04287016252055764, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.00040922046173363924, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 790.09375, + "epoch": 0.15078960774325012, + "grad_norm": 3.7401563979919654, + "kl": 0.0584716796875, + "learning_rate": 9.246051961283748e-07, + "loss": 0.0023, + "reward": 1.1489249467849731, + "reward_std": 0.5376773178577423, + "rewards/accuracy_reward": 0.2937499899417162, + "rewards/cosine_rewards": -0.08189126010984182, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0004337812424637377, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 836.375, + "epoch": 0.15129903209373408, + "grad_norm": 2.8153067499507105, + "kl": 0.0618896484375, + "learning_rate": 9.243504839531329e-07, + "loss": 0.0025, + "reward": 1.3525272011756897, + "reward_std": 0.8126451969146729, + "rewards/accuracy_reward": 0.4906250238418579, + "rewards/cosine_rewards": -0.012518584728240967, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.000579186889808625, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 890.625, + "epoch": 0.15180845644421803, + "grad_norm": 5.316949650260697, + "kl": 0.0552978515625, + "learning_rate": 9.240957717778909e-07, + "loss": 0.0022, + "reward": 1.2640092372894287, + "reward_std": 0.8870376944541931, + "rewards/accuracy_reward": 0.4062499850988388, + "rewards/cosine_rewards": -0.0009442958980798721, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0006714609917253256, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 813.953125, + "epoch": 0.152317880794702, + "grad_norm": 3.8825478721674953, + "kl": 0.0574951171875, + "learning_rate": 9.23841059602649e-07, + "loss": 0.0023, + "reward": 1.2717376947402954, + "reward_std": 0.830648809671402, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": -0.03994514420628548, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0008170758956111968, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 752.140625, + "epoch": 0.15282730514518594, + "grad_norm": 5.06920521582769, + "kl": 0.059814453125, + "learning_rate": 9.235863474274071e-07, + "loss": 0.0024, + "reward": 1.1217154264450073, + "reward_std": 0.8524642586708069, + "rewards/accuracy_reward": 0.24062498286366463, + "rewards/cosine_rewards": -0.04011305421590805, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0006714656192343682, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.625, + "epoch": 0.1533367294956699, + "grad_norm": 8.555507127767159, + "kl": 0.0672607421875, + "learning_rate": 9.233316352521651e-07, + "loss": 0.0027, + "reward": 1.1471417546272278, + "reward_std": 0.7909112870693207, + "rewards/accuracy_reward": 0.2656249962747097, + "rewards/cosine_rewards": -0.039835451170802116, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0005227623041719198, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.78125, + "epoch": 0.15384615384615385, + "grad_norm": 2.9040824114504877, + "kl": 0.064697265625, + "learning_rate": 9.230769230769231e-07, + "loss": 0.0026, + "reward": 0.9261243343353271, + "reward_std": 0.668161928653717, + "rewards/accuracy_reward": 0.1281249988824129, + "rewards/cosine_rewards": -0.06077958270907402, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0005960852140560746, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 702.484375, + "epoch": 0.1543555781966378, + "grad_norm": 4.4461209381275655, + "kl": 0.06298828125, + "learning_rate": 9.228222109016812e-07, + "loss": 0.0025, + "reward": 1.506935715675354, + "reward_std": 0.6653757691383362, + "rewards/accuracy_reward": 0.5468749850988388, + "rewards/cosine_rewards": 0.03871871158480644, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0005329845298547298, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.140625, + "epoch": 0.15486500254712177, + "grad_norm": 3.9254091150548933, + "kl": 0.069091796875, + "learning_rate": 9.225674987264391e-07, + "loss": 0.0028, + "reward": 1.3886016011238098, + "reward_std": 0.9017740190029144, + "rewards/accuracy_reward": 0.5749999731779099, + "rewards/cosine_rewards": -0.02933959849178791, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.000808820070233196, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.625, + "epoch": 0.1553744268976057, + "grad_norm": 6.070022774209878, + "kl": 0.068115234375, + "learning_rate": 9.223127865511971e-07, + "loss": 0.0027, + "reward": 1.6925800442695618, + "reward_std": 0.6231902837753296, + "rewards/accuracy_reward": 0.6625000238418579, + "rewards/cosine_rewards": 0.06164960749447346, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0003195497556589544, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.15625, + "epoch": 0.15588385124808965, + "grad_norm": 11.091865062468658, + "kl": 0.317138671875, + "learning_rate": 9.220580743759551e-07, + "loss": 0.0127, + "reward": 1.5423057079315186, + "reward_std": 0.3847469687461853, + "rewards/accuracy_reward": 0.5468749962747097, + "rewards/cosine_rewards": 0.05880427733063698, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0008736126183066517, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.578125, + "epoch": 0.1563932755985736, + "grad_norm": 3.103459103182676, + "kl": 0.0673828125, + "learning_rate": 9.218033622007132e-07, + "loss": 0.0027, + "reward": 1.6744784712791443, + "reward_std": 0.659433513879776, + "rewards/accuracy_reward": 0.6875, + "rewards/cosine_rewards": 0.06587037723511457, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0007668640464544296, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.265625, + "epoch": 0.15690269994905756, + "grad_norm": 4.50781660421839, + "kl": 0.068115234375, + "learning_rate": 9.215486500254712e-07, + "loss": 0.0027, + "reward": 1.145881563425064, + "reward_std": 1.0458006858825684, + "rewards/accuracy_reward": 0.34062499180436134, + "rewards/cosine_rewards": -0.03727734461426735, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0012161528575234115, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 771.5625, + "epoch": 0.15741212429954152, + "grad_norm": 6.920330329994176, + "kl": 0.064208984375, + "learning_rate": 9.212939378502292e-07, + "loss": 0.0026, + "reward": 0.8615269958972931, + "reward_std": 0.9165626764297485, + "rewards/accuracy_reward": 0.140625, + "rewards/cosine_rewards": -0.07544910162687302, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.0005239159800112247, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 758.96875, + "epoch": 0.15792154865002547, + "grad_norm": 11.706103376756111, + "kl": 0.056396484375, + "learning_rate": 9.210392256749873e-07, + "loss": 0.0023, + "reward": 1.567901074886322, + "reward_std": 1.1157508492469788, + "rewards/accuracy_reward": 0.6437499821186066, + "rewards/cosine_rewards": 0.08178849518299103, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0013874000869691372, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.390625, + "epoch": 0.15843097300050943, + "grad_norm": 4.42128730127276, + "kl": 0.062255859375, + "learning_rate": 9.207845134997453e-07, + "loss": 0.0025, + "reward": 0.944963201880455, + "reward_std": 0.9125352203845978, + "rewards/accuracy_reward": 0.16249998658895493, + "rewards/cosine_rewards": -0.04496639594435692, + "rewards/format_reward": 0.828125, + "rewards/repetition_rewards": -0.0006953877746127546, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 812.46875, + "epoch": 0.15894039735099338, + "grad_norm": 5.888648418898334, + "kl": 0.0587158203125, + "learning_rate": 9.205298013245033e-07, + "loss": 0.0023, + "reward": 0.6825668215751648, + "reward_std": 1.0514086484909058, + "rewards/accuracy_reward": 0.04999999701976776, + "rewards/cosine_rewards": -0.13240730948746204, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": -0.0006508340884465724, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.703125, + "epoch": 0.15944982170147734, + "grad_norm": 5.201587660434957, + "kl": 0.0626220703125, + "learning_rate": 9.202750891492613e-07, + "loss": 0.0025, + "reward": 0.849999725818634, + "reward_std": 1.2490254640579224, + "rewards/accuracy_reward": 0.16249999590218067, + "rewards/cosine_rewards": -0.04634671099483967, + "rewards/format_reward": 0.734375, + "rewards/repetition_rewards": -0.0005285786173772067, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 732.84375, + "epoch": 0.1599592460519613, + "grad_norm": 41.79369545195822, + "kl": 0.0654296875, + "learning_rate": 9.200203769740194e-07, + "loss": 0.0026, + "reward": 1.359117031097412, + "reward_std": 1.1281075477600098, + "rewards/accuracy_reward": 0.49687501788139343, + "rewards/cosine_rewards": 0.06599474605172873, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.0006276974454522133, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.609375, + "epoch": 0.16046867040244522, + "grad_norm": 5.659472242303819, + "kl": 0.090087890625, + "learning_rate": 9.197656647987774e-07, + "loss": 0.0036, + "reward": 1.149334043264389, + "reward_std": 1.1551178693771362, + "rewards/accuracy_reward": 0.3593749962747097, + "rewards/cosine_rewards": 0.025284748524427414, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": -0.0009507373906672001, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 704.09375, + "epoch": 0.16097809475292918, + "grad_norm": 5.702114455603425, + "kl": 0.071044921875, + "learning_rate": 9.195109526235354e-07, + "loss": 0.0028, + "reward": 1.3667227029800415, + "reward_std": 0.6237545907497406, + "rewards/accuracy_reward": 0.4031249713152647, + "rewards/cosine_rewards": 0.01146969199180603, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0009969472303055227, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.984375, + "epoch": 0.16148751910341314, + "grad_norm": 7.41926766137556, + "kl": 0.072998046875, + "learning_rate": 9.192562404482935e-07, + "loss": 0.0029, + "reward": 1.2655977010726929, + "reward_std": 0.7071200311183929, + "rewards/accuracy_reward": 0.37187498807907104, + "rewards/cosine_rewards": -0.0119027029722929, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.00062458252068609, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.65625, + "epoch": 0.1619969434538971, + "grad_norm": 6.476802877604566, + "kl": 0.072265625, + "learning_rate": 9.190015282730514e-07, + "loss": 0.0029, + "reward": 1.4926868677139282, + "reward_std": 0.5651115030050278, + "rewards/accuracy_reward": 0.4906250089406967, + "rewards/cosine_rewards": 0.05030408315360546, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0013672530185431242, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.4375, + "epoch": 0.16250636780438105, + "grad_norm": 5.651262358287829, + "kl": 0.078125, + "learning_rate": 9.187468160978094e-07, + "loss": 0.0031, + "reward": 1.6467930674552917, + "reward_std": 0.6130897700786591, + "rewards/accuracy_reward": 0.6343749761581421, + "rewards/cosine_rewards": 0.060116952285170555, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008239042945206165, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.1875, + "epoch": 0.163015792154865, + "grad_norm": 8.752414113774137, + "kl": 0.087890625, + "learning_rate": 9.184921039225674e-07, + "loss": 0.0035, + "reward": 1.2824658155441284, + "reward_std": 0.6804981231689453, + "rewards/accuracy_reward": 0.4281250089406967, + "rewards/cosine_rewards": -0.004211767576634884, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0008223777404054999, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.421875, + "epoch": 0.16352521650534896, + "grad_norm": 24.555397071153298, + "kl": 0.10791015625, + "learning_rate": 9.182373917473255e-07, + "loss": 0.0043, + "reward": 1.5703404545783997, + "reward_std": 0.6466428339481354, + "rewards/accuracy_reward": 0.5781249850988388, + "rewards/cosine_rewards": 0.023975687101483345, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0005103159819555003, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.453125, + "epoch": 0.1640346408558329, + "grad_norm": 10.65571265954387, + "kl": 0.0751953125, + "learning_rate": 9.179826795720835e-07, + "loss": 0.003, + "reward": 1.6510714292526245, + "reward_std": 1.0072646141052246, + "rewards/accuracy_reward": 0.7062499523162842, + "rewards/cosine_rewards": 0.0703657679259777, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0005443187110358849, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 815.84375, + "epoch": 0.16454406520631687, + "grad_norm": 3.0291641455139042, + "kl": 0.0577392578125, + "learning_rate": 9.177279673968415e-07, + "loss": 0.0023, + "reward": 0.7281904220581055, + "reward_std": 0.7364227771759033, + "rewards/accuracy_reward": -0.07187500596046448, + "rewards/cosine_rewards": -0.15170371532440186, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0013558552600443363, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.84375, + "epoch": 0.16505348955680083, + "grad_norm": 4.950572350987556, + "kl": 0.081787109375, + "learning_rate": 9.174732552215996e-07, + "loss": 0.0033, + "reward": 1.5456467270851135, + "reward_std": 0.3990190625190735, + "rewards/accuracy_reward": 0.5750000178813934, + "rewards/cosine_rewards": 0.03384638950228691, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.000699635551427491, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.96875, + "epoch": 0.16556291390728478, + "grad_norm": 5.615028773473959, + "kl": 0.0675048828125, + "learning_rate": 9.172185430463576e-07, + "loss": 0.0027, + "reward": 1.4643962979316711, + "reward_std": 0.538501039147377, + "rewards/accuracy_reward": 0.4906250089406967, + "rewards/cosine_rewards": 0.021510865539312363, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.000864640751387924, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 748.71875, + "epoch": 0.1660723382577687, + "grad_norm": 12.207464085563803, + "kl": 0.071533203125, + "learning_rate": 9.169638308711156e-07, + "loss": 0.0029, + "reward": 1.1908642947673798, + "reward_std": 0.8150831162929535, + "rewards/accuracy_reward": 0.3156250100582838, + "rewards/cosine_rewards": 0.0008599106222391129, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.000620643695583567, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.828125, + "epoch": 0.16658176260825266, + "grad_norm": 4.633110149052728, + "kl": 0.067626953125, + "learning_rate": 9.167091186958737e-07, + "loss": 0.0027, + "reward": 1.3975687623023987, + "reward_std": 0.6602180898189545, + "rewards/accuracy_reward": 0.40937499701976776, + "rewards/cosine_rewards": 0.020020989701151848, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0005772198055638, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.234375, + "epoch": 0.16709118695873662, + "grad_norm": 11.480819900317456, + "kl": 0.0567626953125, + "learning_rate": 9.164544065206317e-07, + "loss": 0.0023, + "reward": 1.3322511315345764, + "reward_std": 0.7808408439159393, + "rewards/accuracy_reward": 0.3812499865889549, + "rewards/cosine_rewards": -0.0007541030645370483, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0013698027469217777, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 869.515625, + "epoch": 0.16760061130922058, + "grad_norm": 7.997306620734284, + "kl": 0.0577392578125, + "learning_rate": 9.161996943453897e-07, + "loss": 0.0023, + "reward": 1.1871361136436462, + "reward_std": 0.9155566692352295, + "rewards/accuracy_reward": 0.3218750059604645, + "rewards/cosine_rewards": -0.0396097619086504, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0013791794190183282, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 879.125, + "epoch": 0.16811003565970453, + "grad_norm": 3.8086916601970175, + "kl": 0.05810546875, + "learning_rate": 9.159449821701477e-07, + "loss": 0.0023, + "reward": 1.3816418051719666, + "reward_std": 0.8457719385623932, + "rewards/accuracy_reward": 0.43437500298023224, + "rewards/cosine_rewards": 0.026831649709492922, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0014398820349015296, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 1077.125, + "epoch": 0.1686194600101885, + "grad_norm": 3.3635095209387, + "kl": 0.05029296875, + "learning_rate": 9.156902699949058e-07, + "loss": 0.002, + "reward": 1.4233552813529968, + "reward_std": 0.8923040926456451, + "rewards/accuracy_reward": 0.5718750357627869, + "rewards/cosine_rewards": 0.05618499033153057, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.0015796992811374366, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 1025.625, + "epoch": 0.16912888436067244, + "grad_norm": 2.6795836472345886, + "kl": 0.053955078125, + "learning_rate": 9.154355578196637e-07, + "loss": 0.0022, + "reward": 1.5009884238243103, + "reward_std": 0.7616147696971893, + "rewards/accuracy_reward": 0.46562496945261955, + "rewards/cosine_rewards": 0.08306753821671009, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008290903642773628, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 1130.390625, + "epoch": 0.1696383087111564, + "grad_norm": 2.8897374461041068, + "kl": 0.05615234375, + "learning_rate": 9.151808456444217e-07, + "loss": 0.0022, + "reward": 0.9568201899528503, + "reward_std": 0.883324146270752, + "rewards/accuracy_reward": 0.18437499552965164, + "rewards/cosine_rewards": -0.14675537310540676, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0026744193164631724, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 1140.796875, + "epoch": 0.17014773306164035, + "grad_norm": 3.24780768226595, + "kl": 0.053955078125, + "learning_rate": 9.149261334691798e-07, + "loss": 0.0022, + "reward": 0.4763996750116348, + "reward_std": 1.3050541877746582, + "rewards/accuracy_reward": -0.07187501713633537, + "rewards/cosine_rewards": -0.26269275695085526, + "rewards/format_reward": 0.8125, + "rewards/repetition_rewards": -0.0015325736021623015, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 1162.21875, + "epoch": 0.1706571574121243, + "grad_norm": 7.96370989509143, + "kl": 0.0509033203125, + "learning_rate": 9.146714212939378e-07, + "loss": 0.002, + "reward": 1.0168579816818237, + "reward_std": 1.0622537732124329, + "rewards/accuracy_reward": 0.23749998211860657, + "rewards/cosine_rewards": -0.06289426982402802, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0014976929523982108, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 1223.59375, + "epoch": 0.17116658176260827, + "grad_norm": 5.774523253643062, + "kl": 0.083251953125, + "learning_rate": 9.144167091186958e-07, + "loss": 0.0033, + "reward": 0.9260146915912628, + "reward_std": 1.3471828699111938, + "rewards/accuracy_reward": 0.26249998807907104, + "rewards/cosine_rewards": -0.11641103774309158, + "rewards/format_reward": 0.78125, + "rewards/repetition_rewards": -0.0013242715504020452, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 1069.484375, + "epoch": 0.1716760061130922, + "grad_norm": 7.732047491992381, + "kl": 0.0555419921875, + "learning_rate": 9.141619969434538e-07, + "loss": 0.0022, + "reward": 1.0389263331890106, + "reward_std": 0.9250738620758057, + "rewards/accuracy_reward": 0.20937499403953552, + "rewards/cosine_rewards": -0.09034883230924606, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.001974849379621446, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 846.234375, + "epoch": 0.17218543046357615, + "grad_norm": 6.146520612031918, + "kl": 0.06689453125, + "learning_rate": 9.139072847682119e-07, + "loss": 0.0027, + "reward": 1.5287657380104065, + "reward_std": 0.7281034886837006, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.055092147551476955, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0013264745939522982, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 878.59375, + "epoch": 0.1726948548140601, + "grad_norm": 5.859040533770109, + "kl": 0.059814453125, + "learning_rate": 9.136525725929699e-07, + "loss": 0.0024, + "reward": 1.309591829776764, + "reward_std": 0.8282720148563385, + "rewards/accuracy_reward": 0.3781249839812517, + "rewards/cosine_rewards": 0.02607971802353859, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0008628710638731718, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.234375, + "epoch": 0.17320427916454406, + "grad_norm": 4.068095402441544, + "kl": 0.066162109375, + "learning_rate": 9.133978604177279e-07, + "loss": 0.0026, + "reward": 1.1101016998291016, + "reward_std": 0.7019257247447968, + "rewards/accuracy_reward": 0.20624998956918716, + "rewards/cosine_rewards": -0.03270102944225073, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0009472573874518275, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 782.09375, + "epoch": 0.17371370351502802, + "grad_norm": 8.092723935778833, + "kl": 0.07080078125, + "learning_rate": 9.13143148242486e-07, + "loss": 0.0028, + "reward": 1.3624014258384705, + "reward_std": 0.6876442432403564, + "rewards/accuracy_reward": 0.40937499701976776, + "rewards/cosine_rewards": 0.0012194328010082245, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0013180217938497663, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 725.109375, + "epoch": 0.17422312786551197, + "grad_norm": 8.756999335905311, + "kl": 0.130126953125, + "learning_rate": 9.12888436067244e-07, + "loss": 0.0052, + "reward": 1.1084296703338623, + "reward_std": 1.0551597476005554, + "rewards/accuracy_reward": 0.2343750037252903, + "rewards/cosine_rewards": -0.062263866886496544, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0011814486351795495, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.5625, + "epoch": 0.17473255221599593, + "grad_norm": 4.670760280414413, + "kl": 0.07275390625, + "learning_rate": 9.12633723892002e-07, + "loss": 0.0029, + "reward": 1.379169523715973, + "reward_std": 0.6884946823120117, + "rewards/accuracy_reward": 0.40937499701976776, + "rewards/cosine_rewards": -0.014050468802452087, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0005300141347106546, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.359375, + "epoch": 0.17524197656647988, + "grad_norm": 27.098795999332967, + "kl": 0.08056640625, + "learning_rate": 9.123790117167601e-07, + "loss": 0.0032, + "reward": 1.6130830645561218, + "reward_std": 0.44565099477767944, + "rewards/accuracy_reward": 0.5781249701976776, + "rewards/cosine_rewards": 0.051279583014547825, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0006965193606447428, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.65625, + "epoch": 0.17575140091696384, + "grad_norm": 10.133659329005896, + "kl": 0.075439453125, + "learning_rate": 9.121242995415181e-07, + "loss": 0.003, + "reward": 1.6888669729232788, + "reward_std": 0.506424754858017, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/cosine_rewards": 0.06190674379467964, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0011647465871647, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.6875, + "epoch": 0.1762608252674478, + "grad_norm": 5.129606334927334, + "kl": 0.07958984375, + "learning_rate": 9.11869587366276e-07, + "loss": 0.0032, + "reward": 1.2593636512756348, + "reward_std": 0.41098763048648834, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -0.005569446831941605, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0006919201114214957, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.46875, + "epoch": 0.17677024961793172, + "grad_norm": 11.616711482358948, + "kl": 0.074462890625, + "learning_rate": 9.11614875191034e-07, + "loss": 0.003, + "reward": 1.4469356536865234, + "reward_std": 0.6090122163295746, + "rewards/accuracy_reward": 0.46562500298023224, + "rewards/cosine_rewards": -0.0018481542356312275, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.001216164615470916, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.0, + "epoch": 0.17727967396841568, + "grad_norm": 16.70982931235053, + "kl": 0.092041015625, + "learning_rate": 9.113601630157921e-07, + "loss": 0.0037, + "reward": 1.3860605359077454, + "reward_std": 0.5821886360645294, + "rewards/accuracy_reward": 0.40625, + "rewards/cosine_rewards": 0.01206381805241108, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0010032225982286036, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 819.25, + "epoch": 0.17778909831889964, + "grad_norm": 10.312325223777075, + "kl": 0.0694580078125, + "learning_rate": 9.111054508405501e-07, + "loss": 0.0028, + "reward": 1.3597615957260132, + "reward_std": 0.5677385032176971, + "rewards/accuracy_reward": 0.4375000074505806, + "rewards/cosine_rewards": 0.0019306838512420654, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0015441215364262462, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 816.125, + "epoch": 0.1782985226693836, + "grad_norm": 3.8656926374469416, + "kl": 0.07080078125, + "learning_rate": 9.108507386653081e-07, + "loss": 0.0028, + "reward": 1.1428874135017395, + "reward_std": 0.40452495217323303, + "rewards/accuracy_reward": 0.21249999105930328, + "rewards/cosine_rewards": -0.05335182696580887, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0006357444362947717, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 839.09375, + "epoch": 0.17880794701986755, + "grad_norm": 10.545025641089767, + "kl": 0.062744140625, + "learning_rate": 9.105960264900662e-07, + "loss": 0.0025, + "reward": 1.440682828426361, + "reward_std": 0.7122917473316193, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": 0.004494791850447655, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.001311894680839032, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 745.34375, + "epoch": 0.1793173713703515, + "grad_norm": 5.236449549563, + "kl": 0.081787109375, + "learning_rate": 9.103413143148242e-07, + "loss": 0.0033, + "reward": 1.7106852531433105, + "reward_std": 0.4475601017475128, + "rewards/accuracy_reward": 0.6875, + "rewards/cosine_rewards": 0.07085046917200089, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0007901439967099577, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 782.65625, + "epoch": 0.17982679572083546, + "grad_norm": 4.40390803368756, + "kl": 0.07568359375, + "learning_rate": 9.100866021395822e-07, + "loss": 0.003, + "reward": 1.321226179599762, + "reward_std": 0.5729265064001083, + "rewards/accuracy_reward": 0.3812500238418579, + "rewards/cosine_rewards": -0.04316529631614685, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.001233478484209627, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 911.4375, + "epoch": 0.18033622007131941, + "grad_norm": 4.585368026245459, + "kl": 0.083740234375, + "learning_rate": 9.098318899643402e-07, + "loss": 0.0034, + "reward": 1.2610972821712494, + "reward_std": 0.5936008393764496, + "rewards/accuracy_reward": 0.3812499828636646, + "rewards/cosine_rewards": -0.02527322620153427, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0011294231517240405, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.59375, + "epoch": 0.18084564442180337, + "grad_norm": 7.408796362493787, + "kl": 0.0693359375, + "learning_rate": 9.095771777890983e-07, + "loss": 0.0028, + "reward": 1.2509925812482834, + "reward_std": 0.5566798448562622, + "rewards/accuracy_reward": 0.3499999940395355, + "rewards/cosine_rewards": -0.034961797297000885, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.001545542269013822, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.421875, + "epoch": 0.18135506877228733, + "grad_norm": 6.46350391356739, + "kl": 0.08251953125, + "learning_rate": 9.093224656138563e-07, + "loss": 0.0033, + "reward": 1.1181039810180664, + "reward_std": 0.674926146864891, + "rewards/accuracy_reward": 0.23749998956918716, + "rewards/cosine_rewards": -0.05567748658359051, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0012184783699922264, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 1032.265625, + "epoch": 0.18186449312277128, + "grad_norm": 6.831510020206218, + "kl": 0.0609130859375, + "learning_rate": 9.090677534386143e-07, + "loss": 0.0024, + "reward": 1.559360921382904, + "reward_std": 0.6407117247581482, + "rewards/accuracy_reward": 0.518750011920929, + "rewards/cosine_rewards": 0.057725198566913605, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0014893330517224967, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 1343.859375, + "epoch": 0.1823739174732552, + "grad_norm": 4.523279934522272, + "kl": 0.05419921875, + "learning_rate": 9.088130412633724e-07, + "loss": 0.0022, + "reward": 1.3108936548233032, + "reward_std": 1.3749122023582458, + "rewards/accuracy_reward": 0.4843749850988388, + "rewards/cosine_rewards": -0.015468426048755646, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0017629386857151985, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 1283.875, + "epoch": 0.18288334182373917, + "grad_norm": 2.9278358224956733, + "kl": 0.046875, + "learning_rate": 9.085583290881304e-07, + "loss": 0.0019, + "reward": 0.899000346660614, + "reward_std": 1.2357721328735352, + "rewards/accuracy_reward": 0.20000001043081284, + "rewards/cosine_rewards": -0.12764177471399307, + "rewards/format_reward": 0.828125, + "rewards/repetition_rewards": -0.0014829274150542915, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 1351.765625, + "epoch": 0.18339276617422312, + "grad_norm": 5.747269025294299, + "kl": 0.05029296875, + "learning_rate": 9.083036169128883e-07, + "loss": 0.002, + "reward": 0.685440868139267, + "reward_std": 1.0775729417800903, + "rewards/accuracy_reward": 0.062499986961483955, + "rewards/cosine_rewards": -0.23488027602434158, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0015538162551820278, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 1328.359375, + "epoch": 0.18390219052470708, + "grad_norm": 5.5666444239146, + "kl": 0.046630859375, + "learning_rate": 9.080489047376463e-07, + "loss": 0.0019, + "reward": 1.440912902355194, + "reward_std": 1.3688839673995972, + "rewards/accuracy_reward": 0.546875, + "rewards/cosine_rewards": 0.05198000371456146, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0016921277856454253, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 1222.359375, + "epoch": 0.18441161487519103, + "grad_norm": 3.8469994374841496, + "kl": 0.063720703125, + "learning_rate": 9.077941925624044e-07, + "loss": 0.0025, + "reward": 1.2567678689956665, + "reward_std": 1.0096549689769745, + "rewards/accuracy_reward": 0.3531249761581421, + "rewards/cosine_rewards": -0.06352230161428452, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.001584898098371923, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 1071.90625, + "epoch": 0.184921039225675, + "grad_norm": 10.748025785151507, + "kl": 0.080810546875, + "learning_rate": 9.075394803871624e-07, + "loss": 0.0032, + "reward": 1.4773434400558472, + "reward_std": 0.7611989676952362, + "rewards/accuracy_reward": 0.518750011920929, + "rewards/cosine_rewards": 0.022458821535110474, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0013653661007992923, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 1068.84375, + "epoch": 0.18543046357615894, + "grad_norm": 6.961531569684862, + "kl": 0.0966796875, + "learning_rate": 9.072847682119204e-07, + "loss": 0.0039, + "reward": 1.3342331051826477, + "reward_std": 0.9612607657909393, + "rewards/accuracy_reward": 0.4906249940395355, + "rewards/cosine_rewards": -0.029875734820961952, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0015161921037361026, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 1151.765625, + "epoch": 0.1859398879266429, + "grad_norm": 4.787495962824196, + "kl": 0.0526123046875, + "learning_rate": 9.070300560366785e-07, + "loss": 0.0021, + "reward": 0.44851796329021454, + "reward_std": 0.6482652425765991, + "rewards/accuracy_reward": -0.18125002831220627, + "rewards/cosine_rewards": -0.3225611299276352, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0007959024223964661, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 1087.453125, + "epoch": 0.18644931227712686, + "grad_norm": 3.5882611622656704, + "kl": 0.05517578125, + "learning_rate": 9.067753438614365e-07, + "loss": 0.0022, + "reward": 1.0280417203903198, + "reward_std": 0.8365518152713776, + "rewards/accuracy_reward": 0.2093750163912773, + "rewards/cosine_rewards": -0.10154062137007713, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0016676230588927865, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 921.3125, + "epoch": 0.1869587366276108, + "grad_norm": 10.03430669886217, + "kl": 0.07080078125, + "learning_rate": 9.065206316861945e-07, + "loss": 0.0028, + "reward": 1.1769609451293945, + "reward_std": 0.880241334438324, + "rewards/accuracy_reward": 0.2656249925494194, + "rewards/cosine_rewards": -0.04036855325102806, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0014205531333573163, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 841.6875, + "epoch": 0.18746816097809477, + "grad_norm": 27.23508906311087, + "kl": 0.07861328125, + "learning_rate": 9.062659195109526e-07, + "loss": 0.0031, + "reward": 1.685433030128479, + "reward_std": 0.49864277243614197, + "rewards/accuracy_reward": 0.6875, + "rewards/cosine_rewards": 0.1084844060242176, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0011764070368371904, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 737.03125, + "epoch": 0.1879775853285787, + "grad_norm": 18.41490109995637, + "kl": 0.08740234375, + "learning_rate": 9.060112073357106e-07, + "loss": 0.0035, + "reward": 1.3734083771705627, + "reward_std": 0.4119359850883484, + "rewards/accuracy_reward": 0.37812499701976776, + "rewards/cosine_rewards": 0.01141296117566526, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.000504543146234937, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.796875, + "epoch": 0.18848700967906265, + "grad_norm": 35.58611363543668, + "kl": 0.084716796875, + "learning_rate": 9.057564951604686e-07, + "loss": 0.0034, + "reward": 1.6782256960868835, + "reward_std": 0.5156250298023224, + "rewards/accuracy_reward": 0.6343749761581421, + "rewards/cosine_rewards": 0.07835755217820406, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0032568235765211284, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.015625, + "epoch": 0.1889964340295466, + "grad_norm": 7.982503076191807, + "kl": 0.086669921875, + "learning_rate": 9.055017829852266e-07, + "loss": 0.0035, + "reward": 1.760904848575592, + "reward_std": 0.49070215225219727, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/cosine_rewards": 0.0864610131829977, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0005561279249377549, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 716.359375, + "epoch": 0.18950585838003056, + "grad_norm": 10.914576943859945, + "kl": 0.077880859375, + "learning_rate": 9.052470708099847e-07, + "loss": 0.0031, + "reward": 1.9701185822486877, + "reward_std": 0.40086938440799713, + "rewards/accuracy_reward": 0.831250011920929, + "rewards/cosine_rewards": 0.1397455483675003, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0008769762353040278, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.546875, + "epoch": 0.19001528273051452, + "grad_norm": 6.034059111318338, + "kl": 0.08544921875, + "learning_rate": 9.049923586347427e-07, + "loss": 0.0034, + "reward": 1.8058127164840698, + "reward_std": 0.41330619156360626, + "rewards/accuracy_reward": 0.7468750178813934, + "rewards/cosine_rewards": 0.1067701168358326, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0009574841533321887, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.0, + "epoch": 0.19052470708099847, + "grad_norm": 8.532753416320839, + "kl": 0.07861328125, + "learning_rate": 9.047376464595006e-07, + "loss": 0.0031, + "reward": 1.0842646658420563, + "reward_std": 0.44039003551006317, + "rewards/accuracy_reward": 0.15312500298023224, + "rewards/cosine_rewards": -0.052397772669792175, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0008375749748665839, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 804.21875, + "epoch": 0.19103413143148243, + "grad_norm": 7.4599789196405375, + "kl": 0.078125, + "learning_rate": 9.044829342842587e-07, + "loss": 0.0031, + "reward": 0.974018394947052, + "reward_std": 0.5849625766277313, + "rewards/accuracy_reward": 0.09999999403953552, + "rewards/cosine_rewards": -0.10963174607604742, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0007248484616866335, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 767.84375, + "epoch": 0.19154355578196638, + "grad_norm": 7.073657321582574, + "kl": 0.0703125, + "learning_rate": 9.042282221090167e-07, + "loss": 0.0028, + "reward": 0.914261519908905, + "reward_std": 0.7159627079963684, + "rewards/accuracy_reward": 0.09999998658895493, + "rewards/cosine_rewards": -0.13804471492767334, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008187246276065707, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 898.65625, + "epoch": 0.19205298013245034, + "grad_norm": 5.152812978669099, + "kl": 0.060791015625, + "learning_rate": 9.039735099337747e-07, + "loss": 0.0024, + "reward": 1.3070060014724731, + "reward_std": 0.5369542390108109, + "rewards/accuracy_reward": 0.3531250059604645, + "rewards/cosine_rewards": 0.01811320334672928, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0017322039348073304, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 824.65625, + "epoch": 0.1925624044829343, + "grad_norm": 4.4264228290413055, + "kl": 0.071044921875, + "learning_rate": 9.037187977585327e-07, + "loss": 0.0028, + "reward": 1.9969289302825928, + "reward_std": 0.36238182336091995, + "rewards/accuracy_reward": 0.887499988079071, + "rewards/cosine_rewards": 0.1572401076555252, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0009361990523757413, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 1041.234375, + "epoch": 0.19307182883341822, + "grad_norm": 2.966764644317296, + "kl": 0.0531005859375, + "learning_rate": 9.034640855832908e-07, + "loss": 0.0021, + "reward": 1.9203879237174988, + "reward_std": 0.6297050192952156, + "rewards/accuracy_reward": 0.831250011920929, + "rewards/cosine_rewards": 0.1525670364499092, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0009290309972129762, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 1066.578125, + "epoch": 0.19358125318390218, + "grad_norm": 6.45607299399273, + "kl": 0.0604248046875, + "learning_rate": 9.032093734080488e-07, + "loss": 0.0024, + "reward": 1.5978580713272095, + "reward_std": 0.7550583779811859, + "rewards/accuracy_reward": 0.546875, + "rewards/cosine_rewards": 0.0831909030675888, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0009578557801432908, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 1159.84375, + "epoch": 0.19409067753438614, + "grad_norm": 15.218618980246333, + "kl": 0.0557861328125, + "learning_rate": 9.029546612328068e-07, + "loss": 0.0022, + "reward": 1.495898723602295, + "reward_std": 0.8071758449077606, + "rewards/accuracy_reward": 0.5187499970197678, + "rewards/cosine_rewards": 0.04102367162704468, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0013749129138886929, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 1371.671875, + "epoch": 0.1946001018848701, + "grad_norm": 2.5671889777891415, + "kl": 0.0416259765625, + "learning_rate": 9.026999490575649e-07, + "loss": 0.0017, + "reward": 1.4654145240783691, + "reward_std": 0.9137448668479919, + "rewards/accuracy_reward": 0.5468749925494194, + "rewards/cosine_rewards": 0.029642254114151, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0017276888247579336, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 1303.015625, + "epoch": 0.19510952623535405, + "grad_norm": 3.460987727073742, + "kl": 0.0421142578125, + "learning_rate": 9.024452368823229e-07, + "loss": 0.0017, + "reward": 1.348323106765747, + "reward_std": 0.42551596462726593, + "rewards/accuracy_reward": 0.40937499701976776, + "rewards/cosine_rewards": 0.0025482475757598877, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0011001455131918192, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.828125, + "epoch": 0.195618950585838, + "grad_norm": 2.4731324069110165, + "kl": 0.040771484375, + "learning_rate": 9.021905247070809e-07, + "loss": 0.0016, + "reward": 1.2840899229049683, + "reward_std": 1.3389369249343872, + "rewards/accuracy_reward": 0.43437498807907104, + "rewards/cosine_rewards": 0.007216873578727245, + "rewards/format_reward": 0.84375, + "rewards/repetition_rewards": -0.0012519625015556812, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 1392.5625, + "epoch": 0.19612837493632196, + "grad_norm": 3.9847528096417464, + "kl": 0.0401611328125, + "learning_rate": 9.019358125318391e-07, + "loss": 0.0016, + "reward": 0.952269122004509, + "reward_std": 1.110903412103653, + "rewards/accuracy_reward": 0.21249999478459358, + "rewards/cosine_rewards": -0.16534814983606339, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.001132699428126216, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.515625, + "epoch": 0.19663779928680591, + "grad_norm": 1.3917929153423205, + "kl": 0.0386962890625, + "learning_rate": 9.016811003565971e-07, + "loss": 0.0015, + "reward": 1.3951207399368286, + "reward_std": 1.3895853757858276, + "rewards/accuracy_reward": 0.49062497913837433, + "rewards/cosine_rewards": 0.030975546687841415, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0014798620832152665, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 1453.75, + "epoch": 0.19714722363728987, + "grad_norm": 4.5123123100553215, + "kl": 0.040283203125, + "learning_rate": 9.014263881813551e-07, + "loss": 0.0016, + "reward": 1.0250075459480286, + "reward_std": 1.1121925115585327, + "rewards/accuracy_reward": 0.2656249925494194, + "rewards/cosine_rewards": -0.06678299978375435, + "rewards/format_reward": 0.828125, + "rewards/repetition_rewards": -0.001959475106559694, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.953125, + "epoch": 0.19765664798777383, + "grad_norm": 6.3687413391252665, + "kl": 0.0384521484375, + "learning_rate": 9.011716760061131e-07, + "loss": 0.0015, + "reward": 0.6134699061512947, + "reward_std": 0.8420631885528564, + "rewards/accuracy_reward": 0.015624985098838806, + "rewards/cosine_rewards": -0.2909963075071573, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0017838198109529912, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 1398.71875, + "epoch": 0.19816607233825778, + "grad_norm": 3.252398002176852, + "kl": 0.04052734375, + "learning_rate": 9.009169638308711e-07, + "loss": 0.0016, + "reward": 0.6156338006258011, + "reward_std": 1.171474575996399, + "rewards/accuracy_reward": -0.012500010430812836, + "rewards/cosine_rewards": -0.2769355773925781, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0011806105903815478, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 1356.796875, + "epoch": 0.1986754966887417, + "grad_norm": 2.9564973631467386, + "kl": 0.0411376953125, + "learning_rate": 9.006622516556291e-07, + "loss": 0.0016, + "reward": 1.4809187650680542, + "reward_std": 0.4241075813770294, + "rewards/accuracy_reward": 0.4656249899417162, + "rewards/cosine_rewards": 0.0635819137096405, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0014131638454273343, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 1314.8125, + "epoch": 0.19918492103922567, + "grad_norm": 1.9868954496027869, + "kl": 0.040283203125, + "learning_rate": 9.004075394803871e-07, + "loss": 0.0016, + "reward": 0.3289404660463333, + "reward_std": 0.6832451522350311, + "rewards/accuracy_reward": -0.23750004172325134, + "rewards/cosine_rewards": -0.38559940457344055, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0010851426632143557, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 1211.65625, + "epoch": 0.19969434538970962, + "grad_norm": 2.3384012636409524, + "kl": 0.0426025390625, + "learning_rate": 9.001528273051452e-07, + "loss": 0.0017, + "reward": 1.7431849241256714, + "reward_std": 0.5287438631057739, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.0974309928715229, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0011210814118385315, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 1199.40625, + "epoch": 0.20020376974019358, + "grad_norm": 8.037383660003005, + "kl": 0.0426025390625, + "learning_rate": 8.998981151299032e-07, + "loss": 0.0017, + "reward": 1.205706238746643, + "reward_std": 0.5482289791107178, + "rewards/accuracy_reward": 0.2968749925494194, + "rewards/cosine_rewards": -0.09018014371395111, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0009886454208754003, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 1215.25, + "epoch": 0.20071319409067753, + "grad_norm": 2.7015176132022205, + "kl": 0.04150390625, + "learning_rate": 8.996434029546612e-07, + "loss": 0.0017, + "reward": 1.3461086750030518, + "reward_std": 0.36276355385780334, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": -0.033333455212414265, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0018078879220411181, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 1148.140625, + "epoch": 0.2012226184411615, + "grad_norm": 2.4525739585224064, + "kl": 0.0447998046875, + "learning_rate": 8.993886907794193e-07, + "loss": 0.0018, + "reward": 1.6304560899734497, + "reward_std": 0.6783818304538727, + "rewards/accuracy_reward": 0.5781249850988388, + "rewards/cosine_rewards": 0.0690329410135746, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0010768624488264322, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 1234.03125, + "epoch": 0.20173204279164544, + "grad_norm": 2.620518407503657, + "kl": 0.0426025390625, + "learning_rate": 8.991339786041773e-07, + "loss": 0.0017, + "reward": 1.0580366849899292, + "reward_std": 0.45367684960365295, + "rewards/accuracy_reward": 0.18437499552965164, + "rewards/cosine_rewards": -0.09430436789989471, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0007839706668164581, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 1255.140625, + "epoch": 0.2022414671421294, + "grad_norm": 2.848324792333859, + "kl": 0.0416259765625, + "learning_rate": 8.988792664289353e-07, + "loss": 0.0017, + "reward": 1.396336853504181, + "reward_std": 0.6851004362106323, + "rewards/accuracy_reward": 0.40937498584389687, + "rewards/cosine_rewards": 0.003251887857913971, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00066499671083875, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 1243.234375, + "epoch": 0.20275089149261336, + "grad_norm": 2.5122988909394457, + "kl": 0.04150390625, + "learning_rate": 8.986245542536933e-07, + "loss": 0.0017, + "reward": 2.053937077522278, + "reward_std": 0.5187530070543289, + "rewards/accuracy_reward": 0.8312499821186066, + "rewards/cosine_rewards": 0.22372649610042572, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0010393889679107815, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 1395.28125, + "epoch": 0.2032603158430973, + "grad_norm": 8.131421667160394, + "kl": 0.039306640625, + "learning_rate": 8.983698420784514e-07, + "loss": 0.0016, + "reward": 1.9118317365646362, + "reward_std": 0.3381110727787018, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/cosine_rewards": 0.19487697072327137, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.001795282296370715, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 1477.328125, + "epoch": 0.20376974019358127, + "grad_norm": 2.5663546513961992, + "kl": 0.0489501953125, + "learning_rate": 8.981151299032094e-07, + "loss": 0.002, + "reward": 0.616385743021965, + "reward_std": 0.5365406274795532, + "rewards/accuracy_reward": -0.012500017881393433, + "rewards/cosine_rewards": -0.27611421793699265, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0012499869335442781, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 1641.53125, + "epoch": 0.2042791645440652, + "grad_norm": 2.3476982545455947, + "kl": 0.0382080078125, + "learning_rate": 8.978604177279674e-07, + "loss": 0.0015, + "reward": 0.38990160822868347, + "reward_std": 1.22097048163414, + "rewards/accuracy_reward": -0.06875001266598701, + "rewards/cosine_rewards": -0.352715402841568, + "rewards/format_reward": 0.8125, + "rewards/repetition_rewards": -0.0011329837725497782, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 1740.96875, + "epoch": 0.20478858889454915, + "grad_norm": 1.6789909982175664, + "kl": 0.036376953125, + "learning_rate": 8.976057055527255e-07, + "loss": 0.0015, + "reward": 0.7690124660730362, + "reward_std": 1.7883394956588745, + "rewards/accuracy_reward": 0.24062499403953552, + "rewards/cosine_rewards": -0.15724666975438595, + "rewards/format_reward": 0.6875, + "rewards/repetition_rewards": -0.001865879981778562, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 1715.625, + "epoch": 0.2052980132450331, + "grad_norm": 1.732486740072958, + "kl": 0.035400390625, + "learning_rate": 8.973509933774834e-07, + "loss": 0.0014, + "reward": 0.6791011095046997, + "reward_std": 1.0334843397140503, + "rewards/accuracy_reward": 0.1249999925494194, + "rewards/cosine_rewards": -0.21049801260232925, + "rewards/format_reward": 0.765625, + "rewards/repetition_rewards": -0.0010258048423565924, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 1585.046875, + "epoch": 0.20580743759551706, + "grad_norm": 1.6162362158227377, + "kl": 0.037109375, + "learning_rate": 8.970962812022414e-07, + "loss": 0.0015, + "reward": 0.9881232976913452, + "reward_std": 1.0253838300704956, + "rewards/accuracy_reward": 0.24062499403953552, + "rewards/cosine_rewards": -0.12615075334906578, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.001350913429632783, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.734375, + "epoch": 0.20631686194600102, + "grad_norm": 5.362930427796704, + "kl": 0.039306640625, + "learning_rate": 8.968415690269994e-07, + "loss": 0.0016, + "reward": 1.5187935531139374, + "reward_std": 0.5071015954017639, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.07592727243900299, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0008836896740831435, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 1471.890625, + "epoch": 0.20682628629648497, + "grad_norm": 2.5474971754837896, + "kl": 0.0374755859375, + "learning_rate": 8.965868568517575e-07, + "loss": 0.0015, + "reward": 1.7093470096588135, + "reward_std": 0.26929083466529846, + "rewards/accuracy_reward": 0.6062499955296516, + "rewards/cosine_rewards": 0.13566255569458008, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0013155650231055915, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.765625, + "epoch": 0.20733571064696893, + "grad_norm": 2.3954440093211695, + "kl": 0.0372314453125, + "learning_rate": 8.963321446765155e-07, + "loss": 0.0015, + "reward": 1.6693125367164612, + "reward_std": 0.8508188724517822, + "rewards/accuracy_reward": 0.5781250149011612, + "rewards/cosine_rewards": 0.12336396798491478, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.000926460576010868, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 1427.40625, + "epoch": 0.20784513499745289, + "grad_norm": 4.487502302070771, + "kl": 0.037109375, + "learning_rate": 8.960774325012735e-07, + "loss": 0.0015, + "reward": 1.6373254656791687, + "reward_std": 0.37433764338493347, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.11931294947862625, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0007375250570476055, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.875, + "epoch": 0.20835455934793684, + "grad_norm": 5.555469475832445, + "kl": 0.0374755859375, + "learning_rate": 8.958227203260316e-07, + "loss": 0.0015, + "reward": 1.398006021976471, + "reward_std": 1.336867332458496, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": 0.02401774376630783, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.001011726533761248, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 1434.859375, + "epoch": 0.2088639836984208, + "grad_norm": 3.6143044040934105, + "kl": 0.0435791015625, + "learning_rate": 8.955680081507896e-07, + "loss": 0.0017, + "reward": 1.618862271308899, + "reward_std": 0.7050271332263947, + "rewards/accuracy_reward": 0.5468750074505806, + "rewards/cosine_rewards": 0.1038745865225792, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.000637321179965511, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.828125, + "epoch": 0.20937340804890472, + "grad_norm": 3.854404990598997, + "kl": 0.0361328125, + "learning_rate": 8.953132959755476e-07, + "loss": 0.0014, + "reward": 1.6651726961135864, + "reward_std": 0.45976050198078156, + "rewards/accuracy_reward": 0.5781250074505806, + "rewards/cosine_rewards": 0.11916181445121765, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0008640679297968745, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 1526.96875, + "epoch": 0.20988283239938868, + "grad_norm": 2.3422021364641736, + "kl": 0.03662109375, + "learning_rate": 8.950585838003057e-07, + "loss": 0.0015, + "reward": 0.6352521181106567, + "reward_std": 1.1320685744285583, + "rewards/accuracy_reward": -0.012500017881393433, + "rewards/cosine_rewards": -0.28875819593667984, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0009897005802486092, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.03125, + "epoch": 0.21039225674987264, + "grad_norm": 2.1794044547587275, + "kl": 0.0567626953125, + "learning_rate": 8.948038716250637e-07, + "loss": 0.0023, + "reward": 1.4266446828842163, + "reward_std": 0.8459653854370117, + "rewards/accuracy_reward": 0.4624999910593033, + "rewards/cosine_rewards": 0.07424483820796013, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0007251804636325687, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.8125, + "epoch": 0.2109016811003566, + "grad_norm": 2.019375037035424, + "kl": 0.042236328125, + "learning_rate": 8.945491594498217e-07, + "loss": 0.0017, + "reward": 1.4379878044128418, + "reward_std": 0.6174334287643433, + "rewards/accuracy_reward": 0.4374999888241291, + "rewards/cosine_rewards": 0.04812653362751007, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0007637535745743662, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 1476.9375, + "epoch": 0.21141110545084055, + "grad_norm": 2.647595808512136, + "kl": 0.041259765625, + "learning_rate": 8.942944472745797e-07, + "loss": 0.0016, + "reward": 0.9988905191421509, + "reward_std": 0.6921209692955017, + "rewards/accuracy_reward": 0.20937499403953552, + "rewards/cosine_rewards": -0.1462814100086689, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0017031602037604898, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.578125, + "epoch": 0.2119205298013245, + "grad_norm": 2.6566052420282933, + "kl": 0.03466796875, + "learning_rate": 8.940397350993378e-07, + "loss": 0.0014, + "reward": 1.211571991443634, + "reward_std": 1.0560529828071594, + "rewards/accuracy_reward": 0.32499998807907104, + "rewards/cosine_rewards": -0.0497976616024971, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.00113033052184619, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.40625, + "epoch": 0.21242995415180846, + "grad_norm": 2.084079343038072, + "kl": 0.0411376953125, + "learning_rate": 8.937850229240957e-07, + "loss": 0.0016, + "reward": 0.5206416845321655, + "reward_std": 0.49498558044433594, + "rewards/accuracy_reward": -0.09687501192092896, + "rewards/cosine_rewards": -0.36537329852581024, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0014850463485345244, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.921875, + "epoch": 0.21293937850229241, + "grad_norm": 1.7101934155068432, + "kl": 0.036865234375, + "learning_rate": 8.935303107488537e-07, + "loss": 0.0015, + "reward": 1.16130793094635, + "reward_std": 0.738935075700283, + "rewards/accuracy_reward": 0.2968749850988388, + "rewards/cosine_rewards": -0.08809526264667511, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0005968308250885457, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 1452.859375, + "epoch": 0.21344880285277637, + "grad_norm": 2.6364264984634236, + "kl": 0.037109375, + "learning_rate": 8.932755985736118e-07, + "loss": 0.0015, + "reward": 1.4882609844207764, + "reward_std": 0.6527669131755829, + "rewards/accuracy_reward": 0.4937499836087227, + "rewards/cosine_rewards": 0.042041175067424774, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0006552368577104062, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 1425.0, + "epoch": 0.21395822720326033, + "grad_norm": 22.24294483419425, + "kl": 0.0374755859375, + "learning_rate": 8.930208863983698e-07, + "loss": 0.0015, + "reward": 1.5828353762626648, + "reward_std": 0.6265529096126556, + "rewards/accuracy_reward": 0.5468749850988388, + "rewards/cosine_rewards": 0.08372939098626375, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008939505496528, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 1396.171875, + "epoch": 0.21446765155374428, + "grad_norm": 2.8168572000468366, + "kl": 0.049560546875, + "learning_rate": 8.927661742231278e-07, + "loss": 0.002, + "reward": 1.6206639409065247, + "reward_std": 0.5450826287269592, + "rewards/accuracy_reward": 0.546875, + "rewards/cosine_rewards": 0.12136101722717285, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0006969515234231949, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 1395.9375, + "epoch": 0.2149770759042282, + "grad_norm": 1.840733711397487, + "kl": 0.0379638671875, + "learning_rate": 8.925114620478858e-07, + "loss": 0.0015, + "reward": 1.8798171877861023, + "reward_std": 0.5979900360107422, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/cosine_rewards": 0.18991604819893837, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0007238158723339438, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.703125, + "epoch": 0.21548650025471217, + "grad_norm": 2.327429653832842, + "kl": 0.0377197265625, + "learning_rate": 8.922567498726439e-07, + "loss": 0.0015, + "reward": 1.1887712478637695, + "reward_std": 0.615043044090271, + "rewards/accuracy_reward": 0.2968749850988388, + "rewards/cosine_rewards": -0.09194361418485641, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0005351053987396881, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 1528.765625, + "epoch": 0.21599592460519612, + "grad_norm": 3.1639646848610017, + "kl": 0.0347900390625, + "learning_rate": 8.920020376974019e-07, + "loss": 0.0014, + "reward": 1.1957539916038513, + "reward_std": 1.2394747734069824, + "rewards/accuracy_reward": 0.3531249985098839, + "rewards/cosine_rewards": -0.03115752711892128, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.001213467272464186, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 1674.765625, + "epoch": 0.21650534895568008, + "grad_norm": 2.5043711144165126, + "kl": 0.0338134765625, + "learning_rate": 8.917473255221599e-07, + "loss": 0.0014, + "reward": 1.1731443107128143, + "reward_std": 0.8068048655986786, + "rewards/accuracy_reward": 0.3218749836087227, + "rewards/cosine_rewards": -0.038288604468107224, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0010670205520000309, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 1664.5, + "epoch": 0.21701477330616403, + "grad_norm": 3.6500533940846327, + "kl": 0.03515625, + "learning_rate": 8.91492613346918e-07, + "loss": 0.0014, + "reward": 0.6680706441402435, + "reward_std": 1.1470927596092224, + "rewards/accuracy_reward": 0.012499995529651642, + "rewards/cosine_rewards": -0.28094063699245453, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.000988698098808527, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 1733.359375, + "epoch": 0.217524197656648, + "grad_norm": 1.7264860203774577, + "kl": 0.033203125, + "learning_rate": 8.91237901171676e-07, + "loss": 0.0013, + "reward": 1.151515543460846, + "reward_std": 1.0065627694129944, + "rewards/accuracy_reward": 0.37812499701976776, + "rewards/cosine_rewards": -0.02256488800048828, + "rewards/format_reward": 0.796875, + "rewards/repetition_rewards": -0.0009195689344778657, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 1738.171875, + "epoch": 0.21803362200713194, + "grad_norm": 1.8028678441679502, + "kl": 0.033447265625, + "learning_rate": 8.90983188996434e-07, + "loss": 0.0013, + "reward": 0.33622707426548004, + "reward_std": 1.554500699043274, + "rewards/accuracy_reward": -0.046875011175870895, + "rewards/cosine_rewards": -0.35029861330986023, + "rewards/format_reward": 0.734375, + "rewards/repetition_rewards": -0.0009743365517351776, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 1710.0, + "epoch": 0.2185430463576159, + "grad_norm": 1.7315216890300187, + "kl": 0.0386962890625, + "learning_rate": 8.90728476821192e-07, + "loss": 0.0015, + "reward": 1.2531213760375977, + "reward_std": 1.7619973421096802, + "rewards/accuracy_reward": 0.4624999910593033, + "rewards/cosine_rewards": 0.010814379900693893, + "rewards/format_reward": 0.78125, + "rewards/repetition_rewards": -0.0014429978909902275, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 1681.5625, + "epoch": 0.21905247070809986, + "grad_norm": 1.448581293364632, + "kl": 0.0350341796875, + "learning_rate": 8.904737646459501e-07, + "loss": 0.0014, + "reward": 0.5936174094676971, + "reward_std": 1.1982838213443756, + "rewards/accuracy_reward": 0.015624940395355225, + "rewards/cosine_rewards": -0.2807646095752716, + "rewards/format_reward": 0.859375, + "rewards/repetition_rewards": -0.0006179730116855353, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 1480.1875, + "epoch": 0.2195618950585838, + "grad_norm": 4.830088485887878, + "kl": 0.0394287109375, + "learning_rate": 8.90219052470708e-07, + "loss": 0.0016, + "reward": 1.1779060363769531, + "reward_std": 1.0625053942203522, + "rewards/accuracy_reward": 0.31562499701976776, + "rewards/cosine_rewards": -0.05869085341691971, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0009031399386003613, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 1440.796875, + "epoch": 0.22007131940906777, + "grad_norm": 2.4405157931321124, + "kl": 0.037109375, + "learning_rate": 8.89964340295466e-07, + "loss": 0.0015, + "reward": 0.9002698361873627, + "reward_std": 0.7880153059959412, + "rewards/accuracy_reward": 0.09999998845160007, + "rewards/cosine_rewards": -0.18298358470201492, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0011215846752747893, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 1348.515625, + "epoch": 0.2205807437595517, + "grad_norm": 2.1326390000811806, + "kl": 0.0418701171875, + "learning_rate": 8.897096281202241e-07, + "loss": 0.0017, + "reward": 0.7409723997116089, + "reward_std": 0.7918355762958527, + "rewards/accuracy_reward": 0.015624990686774254, + "rewards/cosine_rewards": -0.21155225485563278, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.000600404484430328, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 1298.125, + "epoch": 0.22109016811003565, + "grad_norm": 3.815337891096848, + "kl": 0.0418701171875, + "learning_rate": 8.894549159449821e-07, + "loss": 0.0017, + "reward": 1.8587952256202698, + "reward_std": 0.6939655542373657, + "rewards/accuracy_reward": 0.7187499701976776, + "rewards/cosine_rewards": 0.1717987135052681, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0005034840432927012, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 1156.0, + "epoch": 0.2215995924605196, + "grad_norm": 4.479065196602373, + "kl": 0.0440673828125, + "learning_rate": 8.892002037697401e-07, + "loss": 0.0018, + "reward": 1.4347090125083923, + "reward_std": 0.3772214949131012, + "rewards/accuracy_reward": 0.43749997206032276, + "rewards/cosine_rewards": -0.0022302046418190002, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0005607931379927322, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 1158.46875, + "epoch": 0.22210901681100356, + "grad_norm": 4.149290240827553, + "kl": 0.0455322265625, + "learning_rate": 8.889454915944982e-07, + "loss": 0.0018, + "reward": 1.0905642956495285, + "reward_std": 0.5234603583812714, + "rewards/accuracy_reward": 0.2124999761581421, + "rewards/cosine_rewards": -0.10600101202726364, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.00030972264357842505, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 1058.734375, + "epoch": 0.22261844116148752, + "grad_norm": 6.6780990750455205, + "kl": 0.046630859375, + "learning_rate": 8.886907794192562e-07, + "loss": 0.0019, + "reward": 0.9731817841529846, + "reward_std": 0.8772869110107422, + "rewards/accuracy_reward": 0.09687498956918716, + "rewards/cosine_rewards": -0.09189720638096333, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0005459659732878208, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 1056.5, + "epoch": 0.22312786551197147, + "grad_norm": 2.737447142182153, + "kl": 0.044189453125, + "learning_rate": 8.884360672440142e-07, + "loss": 0.0018, + "reward": 1.1522070169448853, + "reward_std": 0.6963326930999756, + "rewards/accuracy_reward": 0.24062499403953552, + "rewards/cosine_rewards": -0.07206200063228607, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0007310137443710119, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 1072.171875, + "epoch": 0.22363728986245543, + "grad_norm": 2.6298041678582953, + "kl": 0.046875, + "learning_rate": 8.881813550687722e-07, + "loss": 0.0019, + "reward": 1.4510762691497803, + "reward_std": 0.5045955777168274, + "rewards/accuracy_reward": 0.49375002086162567, + "rewards/cosine_rewards": 0.020084097981452942, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0002578186395112425, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 1094.546875, + "epoch": 0.22414671421293939, + "grad_norm": 2.047145097208152, + "kl": 0.0438232421875, + "learning_rate": 8.879266428935303e-07, + "loss": 0.0018, + "reward": 1.5189008712768555, + "reward_std": 0.34239334613084793, + "rewards/accuracy_reward": 0.4906250089406967, + "rewards/cosine_rewards": 0.0758383758366108, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0006874670943943784, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 1083.09375, + "epoch": 0.22465613856342334, + "grad_norm": 2.966919929118076, + "kl": 0.0457763671875, + "learning_rate": 8.876719307182883e-07, + "loss": 0.0018, + "reward": 1.2056291699409485, + "reward_std": 0.828714907169342, + "rewards/accuracy_reward": 0.29687498696148396, + "rewards/cosine_rewards": -0.043857116252183914, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0005137350672157481, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 1103.375, + "epoch": 0.2251655629139073, + "grad_norm": 3.389083005059361, + "kl": 0.042724609375, + "learning_rate": 8.874172185430463e-07, + "loss": 0.0017, + "reward": 1.483572542667389, + "reward_std": 0.5207121074199677, + "rewards/accuracy_reward": 0.4624999910593033, + "rewards/cosine_rewards": 0.053015733137726784, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0006932187097845599, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 1217.421875, + "epoch": 0.22567498726439122, + "grad_norm": 1.8084576051013326, + "kl": 0.0426025390625, + "learning_rate": 8.871625063678044e-07, + "loss": 0.0017, + "reward": 1.604416847229004, + "reward_std": 0.68864506483078, + "rewards/accuracy_reward": 0.578125, + "rewards/cosine_rewards": 0.10491618514060974, + "rewards/format_reward": 0.921875, + "rewards/repetition_rewards": -0.0004992800822947174, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 1285.484375, + "epoch": 0.22618441161487518, + "grad_norm": 20.129612374292474, + "kl": 0.042236328125, + "learning_rate": 8.869077941925624e-07, + "loss": 0.0017, + "reward": 1.7994786500930786, + "reward_std": 0.3120774105191231, + "rewards/accuracy_reward": 0.6624999940395355, + "rewards/cosine_rewards": 0.15342308580875397, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0008193884277716279, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 1370.578125, + "epoch": 0.22669383596535914, + "grad_norm": 3.234143174544009, + "kl": 0.0433349609375, + "learning_rate": 8.866530820173203e-07, + "loss": 0.0017, + "reward": 1.3398171067237854, + "reward_std": 0.7532171607017517, + "rewards/accuracy_reward": 0.3812500014901161, + "rewards/cosine_rewards": -0.0252380333840847, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0005697726446669549, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 1474.875, + "epoch": 0.2272032603158431, + "grad_norm": 1.7926893560129409, + "kl": 0.040283203125, + "learning_rate": 8.863983698420783e-07, + "loss": 0.0016, + "reward": 1.4312800765037537, + "reward_std": 0.6859093904495239, + "rewards/accuracy_reward": 0.43437500298023224, + "rewards/cosine_rewards": 0.02916320227086544, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0010081499349325895, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 1589.5625, + "epoch": 0.22771268466632705, + "grad_norm": 1.796069908482133, + "kl": 0.036865234375, + "learning_rate": 8.861436576668364e-07, + "loss": 0.0015, + "reward": 1.443231225013733, + "reward_std": 0.6114392578601837, + "rewards/accuracy_reward": 0.4375000074505806, + "rewards/cosine_rewards": 0.038026634603738785, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.001045387762133032, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 1672.09375, + "epoch": 0.228222109016811, + "grad_norm": 2.1300405555940802, + "kl": 0.0377197265625, + "learning_rate": 8.858889454915944e-07, + "loss": 0.0015, + "reward": 1.5443891882896423, + "reward_std": 0.5743480771780014, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.10473084449768066, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0009665640536695719, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 1766.0, + "epoch": 0.22873153336729496, + "grad_norm": 4.287974732646472, + "kl": 0.037841796875, + "learning_rate": 8.856342333163524e-07, + "loss": 0.0015, + "reward": 1.3031042218208313, + "reward_std": 1.7085354328155518, + "rewards/accuracy_reward": 0.4593749940395355, + "rewards/cosine_rewards": 0.06326716393232346, + "rewards/format_reward": 0.78125, + "rewards/repetition_rewards": -0.0007880023040343076, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 1844.640625, + "epoch": 0.22924095771777891, + "grad_norm": 1.6528382190432698, + "kl": 0.0341796875, + "learning_rate": 8.853795211411105e-07, + "loss": 0.0014, + "reward": 0.5885469168424606, + "reward_std": 1.7182486653327942, + "rewards/accuracy_reward": 0.17812500149011612, + "rewards/cosine_rewards": -0.1980201005935669, + "rewards/format_reward": 0.609375, + "rewards/repetition_rewards": -0.0009329892345704138, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 1887.9375, + "epoch": 0.22975038206826287, + "grad_norm": 1.8787523511209072, + "kl": 0.0335693359375, + "learning_rate": 8.851248089658685e-07, + "loss": 0.0013, + "reward": 0.7307622581720352, + "reward_std": 1.600571632385254, + "rewards/accuracy_reward": 0.24062500894069672, + "rewards/cosine_rewards": -0.13401341438293457, + "rewards/format_reward": 0.625, + "rewards/repetition_rewards": -0.0008493586792610586, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 1833.78125, + "epoch": 0.23025980641874683, + "grad_norm": 6.893897138558536, + "kl": 0.0357666015625, + "learning_rate": 8.848700967906265e-07, + "loss": 0.0014, + "reward": 1.0294001996517181, + "reward_std": 1.7062013149261475, + "rewards/accuracy_reward": 0.40312499552965164, + "rewards/cosine_rewards": 0.0025026053190231323, + "rewards/format_reward": 0.625, + "rewards/repetition_rewards": -0.0012274246546439826, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 1943.453125, + "epoch": 0.23076923076923078, + "grad_norm": 2.3588444541934273, + "kl": 0.0322265625, + "learning_rate": 8.846153846153846e-07, + "loss": 0.0013, + "reward": 0.19596866890788078, + "reward_std": 1.8146210312843323, + "rewards/accuracy_reward": -0.02500000223517418, + "rewards/cosine_rewards": -0.3406580686569214, + "rewards/format_reward": 0.5625, + "rewards/repetition_rewards": -0.0008732638962101191, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 1863.109375, + "epoch": 0.2312786551197147, + "grad_norm": 1.5475872027740656, + "kl": 0.0400390625, + "learning_rate": 8.843606724401426e-07, + "loss": 0.0016, + "reward": 0.2073364406824112, + "reward_std": 1.7386137247085571, + "rewards/accuracy_reward": -0.043750010430812836, + "rewards/cosine_rewards": -0.3414689302444458, + "rewards/format_reward": 0.59375, + "rewards/repetition_rewards": -0.0011946168669965118, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 1659.65625, + "epoch": 0.23178807947019867, + "grad_norm": 3.381623642320965, + "kl": 0.0540771484375, + "learning_rate": 8.841059602649006e-07, + "loss": 0.0022, + "reward": 1.5414963960647583, + "reward_std": 1.3864411413669586, + "rewards/accuracy_reward": 0.6218750178813934, + "rewards/cosine_rewards": 0.20184022560715675, + "rewards/format_reward": 0.71875, + "rewards/repetition_rewards": -0.000968798267422244, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 1617.046875, + "epoch": 0.23229750382068262, + "grad_norm": 7.137141646753771, + "kl": 0.0372314453125, + "learning_rate": 8.838512480896586e-07, + "loss": 0.0015, + "reward": 1.1092736423015594, + "reward_std": 0.9871836006641388, + "rewards/accuracy_reward": 0.2687499839812517, + "rewards/cosine_rewards": -0.09614543057978153, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0008309493132401258, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.703125, + "epoch": 0.23280692817116658, + "grad_norm": 2.858407735203425, + "kl": 0.0447998046875, + "learning_rate": 8.835965359144167e-07, + "loss": 0.0018, + "reward": 1.4474474489688873, + "reward_std": 0.8567388504743576, + "rewards/accuracy_reward": 0.4937500078231096, + "rewards/cosine_rewards": 0.06434839963912964, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0012760092504322529, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 1550.828125, + "epoch": 0.23331635252165053, + "grad_norm": 2.432372091234863, + "kl": 0.0408935546875, + "learning_rate": 8.833418237391747e-07, + "loss": 0.0016, + "reward": 1.0046057403087616, + "reward_std": 1.0828097462654114, + "rewards/accuracy_reward": 0.20937498658895493, + "rewards/cosine_rewards": -0.14123845472931862, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0010308316559530795, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.9375, + "epoch": 0.2338257768721345, + "grad_norm": 2.65385943980829, + "kl": 0.0380859375, + "learning_rate": 8.830871115639326e-07, + "loss": 0.0015, + "reward": 1.5737290382385254, + "reward_std": 0.676769882440567, + "rewards/accuracy_reward": 0.5187499821186066, + "rewards/cosine_rewards": 0.10273971408605576, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008856799395289272, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 1412.171875, + "epoch": 0.23433520122261844, + "grad_norm": 7.668944991953695, + "kl": 0.03955078125, + "learning_rate": 8.828323993886907e-07, + "loss": 0.0016, + "reward": 1.2575648427009583, + "reward_std": 0.8219007402658463, + "rewards/accuracy_reward": 0.3499999940395355, + "rewards/cosine_rewards": -0.029430712573230267, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0005044575809733942, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 1428.671875, + "epoch": 0.2348446255731024, + "grad_norm": 3.3174885087942747, + "kl": 0.041259765625, + "learning_rate": 8.825776872134487e-07, + "loss": 0.0017, + "reward": 0.5390121340751648, + "reward_std": 0.6499587297439575, + "rewards/accuracy_reward": -0.09687501192092896, + "rewards/cosine_rewards": -0.3163621127605438, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008757157484069467, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 1389.328125, + "epoch": 0.23535404992358636, + "grad_norm": 1.976395582002063, + "kl": 0.040771484375, + "learning_rate": 8.823229750382067e-07, + "loss": 0.0016, + "reward": 1.6086109280586243, + "reward_std": 0.5066869556903839, + "rewards/accuracy_reward": 0.5218749716877937, + "rewards/cosine_rewards": 0.08817524462938309, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0014392710290849209, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 1436.625, + "epoch": 0.2358634742740703, + "grad_norm": 2.4713376444103146, + "kl": 0.039794921875, + "learning_rate": 8.820682628629647e-07, + "loss": 0.0016, + "reward": 1.111421525478363, + "reward_std": 0.9693822264671326, + "rewards/accuracy_reward": 0.24062500149011612, + "rewards/cosine_rewards": -0.11270357295870781, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0008748299151193351, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 1397.96875, + "epoch": 0.23637289862455427, + "grad_norm": 2.688736588573913, + "kl": 0.0450439453125, + "learning_rate": 8.818135506877228e-07, + "loss": 0.0018, + "reward": 1.0273907780647278, + "reward_std": 0.6092932820320129, + "rewards/accuracy_reward": 0.20937500149011612, + "rewards/cosine_rewards": -0.11841067671775818, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0010735246760305017, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.5, + "epoch": 0.2368823229750382, + "grad_norm": 2.233484328017706, + "kl": 0.03955078125, + "learning_rate": 8.815588385124808e-07, + "loss": 0.0016, + "reward": 2.0390628576278687, + "reward_std": 0.4271709471940994, + "rewards/accuracy_reward": 0.7750000059604645, + "rewards/cosine_rewards": 0.26496873423457146, + "rewards/format_reward": 1.0, + "rewards/repetition_rewards": -0.0009058607101906091, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 1610.734375, + "epoch": 0.23739174732552215, + "grad_norm": 3.1291938497316867, + "kl": 0.0394287109375, + "learning_rate": 8.813041263372388e-07, + "loss": 0.0016, + "reward": 1.7407687306404114, + "reward_std": 0.8337388634681702, + "rewards/accuracy_reward": 0.659375011920929, + "rewards/cosine_rewards": 0.17613628506660461, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0009925005142576993, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 1568.890625, + "epoch": 0.2379011716760061, + "grad_norm": 2.121477514968572, + "kl": 0.0380859375, + "learning_rate": 8.810494141619969e-07, + "loss": 0.0015, + "reward": 1.3490102887153625, + "reward_std": 0.7418502867221832, + "rewards/accuracy_reward": 0.37812500447034836, + "rewards/cosine_rewards": 0.003062829375267029, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0009275085176341236, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 1698.8125, + "epoch": 0.23841059602649006, + "grad_norm": 2.4246076523727025, + "kl": 0.03662109375, + "learning_rate": 8.807947019867549e-07, + "loss": 0.0015, + "reward": 1.3666119575500488, + "reward_std": 1.175959825515747, + "rewards/accuracy_reward": 0.46562501788139343, + "rewards/cosine_rewards": 0.02737235650420189, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0013854140415787697, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 1691.90625, + "epoch": 0.23892002037697402, + "grad_norm": 1.3535821296606787, + "kl": 0.039306640625, + "learning_rate": 8.805399898115129e-07, + "loss": 0.0016, + "reward": 1.2414605617523193, + "reward_std": 1.0560136437416077, + "rewards/accuracy_reward": 0.3812499940395355, + "rewards/cosine_rewards": -0.02948123589158058, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0009331759065389633, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 1686.46875, + "epoch": 0.23942944472745797, + "grad_norm": 1.6084637763929415, + "kl": 0.0467529296875, + "learning_rate": 8.802852776362711e-07, + "loss": 0.0019, + "reward": 2.0340508222579956, + "reward_std": 1.1313848793506622, + "rewards/accuracy_reward": 0.8312499821186066, + "rewards/cosine_rewards": 0.3132530748844147, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.001077289809472859, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 1638.4375, + "epoch": 0.23993886907794193, + "grad_norm": 3.9206215218528553, + "kl": 0.0384521484375, + "learning_rate": 8.800305654610291e-07, + "loss": 0.0015, + "reward": 1.417995810508728, + "reward_std": 0.7844535112380981, + "rewards/accuracy_reward": 0.4375, + "rewards/cosine_rewards": 0.04390082508325577, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0009050128574017435, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 1583.171875, + "epoch": 0.24044829342842589, + "grad_norm": 1.862615408125007, + "kl": 0.0404052734375, + "learning_rate": 8.797758532857871e-07, + "loss": 0.0016, + "reward": 1.3552428185939789, + "reward_std": 0.831163614988327, + "rewards/accuracy_reward": 0.40937500074505806, + "rewards/cosine_rewards": -0.005982518196105957, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.001274671230930835, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 1606.359375, + "epoch": 0.24095771777890984, + "grad_norm": 6.217207017794225, + "kl": 0.039794921875, + "learning_rate": 8.795211411105451e-07, + "loss": 0.0016, + "reward": 1.677711844444275, + "reward_std": 0.8612502366304398, + "rewards/accuracy_reward": 0.5781249701976776, + "rewards/cosine_rewards": 0.11638512089848518, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0011732576531358063, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 1521.84375, + "epoch": 0.2414671421293938, + "grad_norm": 2.9743621746841677, + "kl": 0.0421142578125, + "learning_rate": 8.792664289353031e-07, + "loss": 0.0017, + "reward": 1.578629732131958, + "reward_std": 0.6186130940914154, + "rewards/accuracy_reward": 0.5218749940395355, + "rewards/cosine_rewards": 0.08938230201601982, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0013775942497886717, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 1560.25, + "epoch": 0.24197656647987772, + "grad_norm": 3.931202850710427, + "kl": 0.0396728515625, + "learning_rate": 8.790117167600611e-07, + "loss": 0.0016, + "reward": 1.8553311824798584, + "reward_std": 0.5484062433242798, + "rewards/accuracy_reward": 0.6906249970197678, + "rewards/cosine_rewards": 0.1969544254243374, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0009982050396502018, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 1487.171875, + "epoch": 0.24248599083036168, + "grad_norm": 1.8115703582475124, + "kl": 0.0428466796875, + "learning_rate": 8.787570045848191e-07, + "loss": 0.0017, + "reward": 1.0939862728118896, + "reward_std": 0.652959406375885, + "rewards/accuracy_reward": 0.24062499776482582, + "rewards/cosine_rewards": -0.0988575927913189, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0009061352466233075, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 1421.203125, + "epoch": 0.24299541518084564, + "grad_norm": 50.509798046645564, + "kl": 0.0455322265625, + "learning_rate": 8.785022924095772e-07, + "loss": 0.0018, + "reward": 1.1556105613708496, + "reward_std": 0.8080581426620483, + "rewards/accuracy_reward": 0.26874998956918716, + "rewards/cosine_rewards": -0.06538418680429459, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008802659867797047, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 1423.25, + "epoch": 0.2435048395313296, + "grad_norm": 2.143196092673566, + "kl": 0.042724609375, + "learning_rate": 8.782475802343352e-07, + "loss": 0.0017, + "reward": 1.4646123051643372, + "reward_std": 0.4019291028380394, + "rewards/accuracy_reward": 0.4374999925494194, + "rewards/cosine_rewards": 0.04411640763282776, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0013792455429211259, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.359375, + "epoch": 0.24401426388181355, + "grad_norm": 1.4650345234014313, + "kl": 0.043701171875, + "learning_rate": 8.779928680590932e-07, + "loss": 0.0018, + "reward": 1.7237411737442017, + "reward_std": 0.6819100677967072, + "rewards/accuracy_reward": 0.6031249761581421, + "rewards/cosine_rewards": 0.13805609196424484, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0018149468814954162, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.6875, + "epoch": 0.2445236882322975, + "grad_norm": 2.4194184684625166, + "kl": 0.0440673828125, + "learning_rate": 8.777381558838512e-07, + "loss": 0.0018, + "reward": 1.4634617269039154, + "reward_std": 0.4558331221342087, + "rewards/accuracy_reward": 0.46562497317790985, + "rewards/cosine_rewards": 0.03047458827495575, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0013878352474421263, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 1559.859375, + "epoch": 0.24503311258278146, + "grad_norm": 5.778258363606284, + "kl": 0.041015625, + "learning_rate": 8.774834437086093e-07, + "loss": 0.0016, + "reward": 1.1754435896873474, + "reward_std": 0.629539430141449, + "rewards/accuracy_reward": 0.2968749850988388, + "rewards/cosine_rewards": -0.058087632060050964, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.000843802816234529, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 1613.171875, + "epoch": 0.24554253693326542, + "grad_norm": 1.6143411312623293, + "kl": 0.0389404296875, + "learning_rate": 8.772287315333673e-07, + "loss": 0.0016, + "reward": 0.8586589694023132, + "reward_std": 0.45200832188129425, + "rewards/accuracy_reward": 0.09999999403953552, + "rewards/cosine_rewards": -0.22472049295902252, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0009954352863132954, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 1670.359375, + "epoch": 0.24605196128374937, + "grad_norm": 2.239924603374423, + "kl": 0.0592041015625, + "learning_rate": 8.769740193581253e-07, + "loss": 0.0024, + "reward": 1.522126853466034, + "reward_std": 0.8742709904909134, + "rewards/accuracy_reward": 0.4937499910593033, + "rewards/cosine_rewards": 0.060823358595371246, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0011965514277108014, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 1628.625, + "epoch": 0.24656138563423333, + "grad_norm": 8.106075584628705, + "kl": 0.0419921875, + "learning_rate": 8.767193071828834e-07, + "loss": 0.0017, + "reward": 0.9253878593444824, + "reward_std": 1.307717740535736, + "rewards/accuracy_reward": 0.18437500298023224, + "rewards/cosine_rewards": -0.14811599627137184, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.0014961253036744893, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 1648.875, + "epoch": 0.24707080998471728, + "grad_norm": 1.6847159437413002, + "kl": 0.0389404296875, + "learning_rate": 8.764645950076414e-07, + "loss": 0.0016, + "reward": 1.4973651766777039, + "reward_std": 0.9533334523439407, + "rewards/accuracy_reward": 0.5218749791383743, + "rewards/cosine_rewards": 0.0704129058867693, + "rewards/format_reward": 0.90625, + "rewards/repetition_rewards": -0.0011727037781383842, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.140625, + "epoch": 0.2475802343352012, + "grad_norm": 1.736665525794252, + "kl": 0.0399169921875, + "learning_rate": 8.762098828323994e-07, + "loss": 0.0016, + "reward": 0.7075473368167877, + "reward_std": 0.8960316479206085, + "rewards/accuracy_reward": 0.015624992549419403, + "rewards/cosine_rewards": -0.2443552017211914, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.001222497143317014, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 1567.671875, + "epoch": 0.24808965868568517, + "grad_norm": 3.031931829741236, + "kl": 0.0386962890625, + "learning_rate": 8.759551706571575e-07, + "loss": 0.0015, + "reward": 1.3812061548233032, + "reward_std": 0.8888083398342133, + "rewards/accuracy_reward": 0.4093749672174454, + "rewards/cosine_rewards": 0.004479339346289635, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0013981764786876738, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 1457.359375, + "epoch": 0.24859908303616912, + "grad_norm": 5.524727047487839, + "kl": 0.0506591796875, + "learning_rate": 8.757004584819154e-07, + "loss": 0.002, + "reward": 1.8092041611671448, + "reward_std": 0.5159921646118164, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/cosine_rewards": 0.19165128469467163, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0011971485218964517, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 1379.078125, + "epoch": 0.24910850738665308, + "grad_norm": 7.278746642143023, + "kl": 0.055419921875, + "learning_rate": 8.754457463066734e-07, + "loss": 0.0022, + "reward": 1.194389447569847, + "reward_std": 0.5000828057527542, + "rewards/accuracy_reward": 0.26874998211860657, + "rewards/cosine_rewards": -0.04208715260028839, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0010234276414848864, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 1397.78125, + "epoch": 0.24961793173713703, + "grad_norm": 2.9100594613125352, + "kl": 0.0543212890625, + "learning_rate": 8.751910341314314e-07, + "loss": 0.0022, + "reward": 1.6665399670600891, + "reward_std": 0.6835527420043945, + "rewards/accuracy_reward": 0.6625000089406967, + "rewards/cosine_rewards": 0.11463410407304764, + "rewards/format_reward": 0.890625, + "rewards/repetition_rewards": -0.00121912601753138, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 1359.703125, + "epoch": 0.250127356087621, + "grad_norm": 12.325171247664876, + "kl": 0.0457763671875, + "learning_rate": 8.749363219561895e-07, + "loss": 0.0018, + "reward": 1.8410940766334534, + "reward_std": 0.4001428484916687, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/cosine_rewards": 0.18309018202126026, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0013711884384974837, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 1470.75, + "epoch": 0.25063678043810494, + "grad_norm": 15.076912789505334, + "kl": 0.041015625, + "learning_rate": 8.746816097809475e-07, + "loss": 0.0016, + "reward": 1.2887136340141296, + "reward_std": 0.8450455367565155, + "rewards/accuracy_reward": 0.3531249761581421, + "rewards/cosine_rewards": -0.03244372457265854, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0007176562794484198, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 1348.140625, + "epoch": 0.2511462047885889, + "grad_norm": 10.32545410862146, + "kl": 0.05810546875, + "learning_rate": 8.744268976057055e-07, + "loss": 0.0023, + "reward": 1.227342277765274, + "reward_std": 1.0202240645885468, + "rewards/accuracy_reward": 0.3500000238418579, + "rewards/cosine_rewards": 0.002939566969871521, + "rewards/format_reward": 0.875, + "rewards/repetition_rewards": -0.0005973072838969529, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 1301.828125, + "epoch": 0.25165562913907286, + "grad_norm": 3.667270735290933, + "kl": 0.0645751953125, + "learning_rate": 8.741721854304636e-07, + "loss": 0.0026, + "reward": 1.2533040046691895, + "reward_std": 0.7434202134609222, + "rewards/accuracy_reward": 0.32500000298023224, + "rewards/cosine_rewards": -0.03934769332408905, + "rewards/format_reward": 0.96875, + "rewards/repetition_rewards": -0.0010982811218127608, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 1290.1875, + "epoch": 0.2521650534895568, + "grad_norm": 4.3465734782527345, + "kl": 0.0535888671875, + "learning_rate": 8.739174732552216e-07, + "loss": 0.0021, + "reward": 0.6353173404932022, + "reward_std": 0.6600025594234467, + "rewards/accuracy_reward": -0.040625013411045074, + "rewards/cosine_rewards": -0.2607284113764763, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0008293068385683, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 1355.375, + "epoch": 0.25267447784004077, + "grad_norm": 4.901543791199254, + "kl": 0.060546875, + "learning_rate": 8.736627610799796e-07, + "loss": 0.0024, + "reward": 1.1404387950897217, + "reward_std": 0.670623242855072, + "rewards/accuracy_reward": 0.26874999701976776, + "rewards/cosine_rewards": -0.11177334189414978, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0009129364043474197, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 1328.6875, + "epoch": 0.2531839021905247, + "grad_norm": 3.9728562721335745, + "kl": 0.0489501953125, + "learning_rate": 8.734080489047376e-07, + "loss": 0.002, + "reward": 1.1998997032642365, + "reward_std": 0.630705714225769, + "rewards/accuracy_reward": 0.32500001788139343, + "rewards/cosine_rewards": -0.06150183826684952, + "rewards/format_reward": 0.9375, + "rewards/repetition_rewards": -0.0010984738764818758, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 1373.671875, + "epoch": 0.2536933265410087, + "grad_norm": 3.325255325148765, + "kl": 0.04833984375, + "learning_rate": 8.731533367294957e-07, + "loss": 0.0019, + "reward": 1.2019822597503662, + "reward_std": 0.38447779417037964, + "rewards/accuracy_reward": 0.296875, + "rewards/cosine_rewards": -0.04712319001555443, + "rewards/format_reward": 0.953125, + "rewards/repetition_rewards": -0.0008945107110776007, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 1309.28125, + "epoch": 0.25420275089149263, + "grad_norm": 5.499632802088616, + "kl": 0.072265625, + "learning_rate": 8.728986245542537e-07, + "loss": 0.0029, + "reward": 1.6111189126968384, + "reward_std": 0.1892632469534874, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/cosine_rewards": 0.07773812115192413, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0009941596072167158, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 1363.625, + "epoch": 0.2547121752419766, + "grad_norm": 7.195546109062687, + "kl": 0.0482177734375, + "learning_rate": 8.726439123790117e-07, + "loss": 0.0019, + "reward": 1.9320534467697144, + "reward_std": 0.4148600548505783, + "rewards/accuracy_reward": 0.7468750178813934, + "rewards/cosine_rewards": 0.20207761228084564, + "rewards/format_reward": 0.984375, + "rewards/repetition_rewards": -0.0012741541431751102, + "step": 500 + } + ], + "logging_steps": 1.0, + "max_steps": 3926, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}