| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 9.5625, |
| "eval_steps": 500, |
| "global_step": 450, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.36198425292969, |
| "epoch": 0.020833333333333332, |
| "grad_norm": 1.639632839083605, |
| "kl": 0.0, |
| "learning_rate": 9.979166666666667e-07, |
| "loss": 0.0, |
| "reward": 1.7404149770736694, |
| "reward_std": 0.0779535174369812, |
| "rewards/accuracy_reward": 0.7482273578643799, |
| "rewards/format_reward": 0.9921875, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.453125, |
| "epoch": 0.041666666666666664, |
| "grad_norm": 1.6143054455733772, |
| "kl": 0.000759124755859375, |
| "learning_rate": 9.958333333333333e-07, |
| "loss": 0.0001, |
| "reward": 1.7971906661987305, |
| "reward_std": 0.07512722909450531, |
| "rewards/accuracy_reward": 0.8076074123382568, |
| "rewards/format_reward": 0.9895833730697632, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.90885925292969, |
| "epoch": 0.0625, |
| "grad_norm": 2.4515981685409955, |
| "kl": 0.000972747802734375, |
| "learning_rate": 9.9375e-07, |
| "loss": 0.0001, |
| "reward": 1.7853158712387085, |
| "reward_std": 0.056390173733234406, |
| "rewards/accuracy_reward": 0.7866179347038269, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.05989837646484, |
| "epoch": 0.08333333333333333, |
| "grad_norm": 1.6201269048159341, |
| "kl": 0.00115203857421875, |
| "learning_rate": 9.916666666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.7735958099365234, |
| "reward_std": 0.0706130862236023, |
| "rewards/accuracy_reward": 0.7761998176574707, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.74089050292969, |
| "epoch": 0.10416666666666667, |
| "grad_norm": 1.5772248618980282, |
| "kl": 0.0014801025390625, |
| "learning_rate": 9.895833333333333e-07, |
| "loss": 0.0001, |
| "reward": 1.775779128074646, |
| "reward_std": 0.0632912740111351, |
| "rewards/accuracy_reward": 0.7783832550048828, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.27214050292969, |
| "epoch": 0.125, |
| "grad_norm": 1.6430473395229306, |
| "kl": 0.0023345947265625, |
| "learning_rate": 9.875e-07, |
| "loss": 0.0001, |
| "reward": 1.7499568462371826, |
| "reward_std": 0.06891956180334091, |
| "rewards/accuracy_reward": 0.7525607943534851, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.0390625, |
| "epoch": 0.14583333333333334, |
| "grad_norm": 2.1082448590555587, |
| "kl": 0.0029754638671875, |
| "learning_rate": 9.854166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.7655704021453857, |
| "reward_std": 0.06707193702459335, |
| "rewards/accuracy_reward": 0.7707786560058594, |
| "rewards/format_reward": 0.9947916865348816, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.79948425292969, |
| "epoch": 0.16666666666666666, |
| "grad_norm": 1.2774316678996838, |
| "kl": 0.003570556640625, |
| "learning_rate": 9.833333333333332e-07, |
| "loss": 0.0002, |
| "reward": 1.7789148092269897, |
| "reward_std": 0.06289087980985641, |
| "rewards/accuracy_reward": 0.7828210592269897, |
| "rewards/format_reward": 0.99609375, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.25390625, |
| "epoch": 0.1875, |
| "grad_norm": 2.9892643389862297, |
| "kl": 0.00396728515625, |
| "learning_rate": 9.8125e-07, |
| "loss": 0.0002, |
| "reward": 1.7818154096603394, |
| "reward_std": 0.06365714222192764, |
| "rewards/accuracy_reward": 0.7844195365905762, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.72005462646484, |
| "epoch": 0.20833333333333334, |
| "grad_norm": 2.067866094157864, |
| "kl": 0.00555419921875, |
| "learning_rate": 9.791666666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.768758773803711, |
| "reward_std": 0.061586372554302216, |
| "rewards/accuracy_reward": 0.7713630199432373, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.7109375, |
| "epoch": 0.22916666666666666, |
| "grad_norm": 1.9143306648652931, |
| "kl": 0.007568359375, |
| "learning_rate": 9.770833333333332e-07, |
| "loss": 0.0004, |
| "reward": 1.7840107679367065, |
| "reward_std": 0.06136108189821243, |
| "rewards/accuracy_reward": 0.7866148948669434, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.51171875, |
| "epoch": 0.25, |
| "grad_norm": 1.3340893635394109, |
| "kl": 0.0089111328125, |
| "learning_rate": 9.75e-07, |
| "loss": 0.0005, |
| "reward": 1.7980231046676636, |
| "reward_std": 0.05285460874438286, |
| "rewards/accuracy_reward": 0.799325168132782, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.94921875, |
| "epoch": 0.2708333333333333, |
| "grad_norm": 1.5039333862682631, |
| "kl": 0.01055908203125, |
| "learning_rate": 9.729166666666665e-07, |
| "loss": 0.0005, |
| "reward": 1.781097412109375, |
| "reward_std": 0.054250504821538925, |
| "rewards/accuracy_reward": 0.7810973525047302, |
| "rewards/format_reward": 1.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.20442962646484, |
| "epoch": 0.2916666666666667, |
| "grad_norm": 1.320078780832244, |
| "kl": 0.01153564453125, |
| "learning_rate": 9.708333333333333e-07, |
| "loss": 0.0006, |
| "reward": 1.7951558828353882, |
| "reward_std": 0.05504516512155533, |
| "rewards/accuracy_reward": 0.7964579463005066, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.59635925292969, |
| "epoch": 0.3125, |
| "grad_norm": 1.2979058457846995, |
| "kl": 0.0120849609375, |
| "learning_rate": 9.6875e-07, |
| "loss": 0.0006, |
| "reward": 1.7878097295761108, |
| "reward_std": 0.05526263639330864, |
| "rewards/accuracy_reward": 0.7891117930412292, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.14974212646484, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 1.326162224402637, |
| "kl": 0.01275634765625, |
| "learning_rate": 9.666666666666666e-07, |
| "loss": 0.0006, |
| "reward": 1.7894906997680664, |
| "reward_std": 0.04853988438844681, |
| "rewards/accuracy_reward": 0.7907928824424744, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.43489837646484, |
| "epoch": 0.3541666666666667, |
| "grad_norm": 2.7897636177227594, |
| "kl": 0.01263427734375, |
| "learning_rate": 9.645833333333333e-07, |
| "loss": 0.0005, |
| "reward": 1.8176600933074951, |
| "reward_std": 0.04950461909174919, |
| "rewards/accuracy_reward": 0.8176599740982056, |
| "rewards/format_reward": 1.0, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.42708587646484, |
| "epoch": 0.375, |
| "grad_norm": 1.1152706578469112, |
| "kl": 0.01409912109375, |
| "learning_rate": 9.624999999999999e-07, |
| "loss": 0.0006, |
| "reward": 1.7780466079711914, |
| "reward_std": 0.0461871400475502, |
| "rewards/accuracy_reward": 0.7780466079711914, |
| "rewards/format_reward": 1.0, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.89583587646484, |
| "epoch": 0.3958333333333333, |
| "grad_norm": 5.136433449809359, |
| "kl": 0.01300048828125, |
| "learning_rate": 9.604166666666666e-07, |
| "loss": 0.0006, |
| "reward": 1.7988059520721436, |
| "reward_std": 0.04341081529855728, |
| "rewards/accuracy_reward": 0.7988060712814331, |
| "rewards/format_reward": 1.0, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.54296875, |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.285848750333943, |
| "kl": 0.01422119140625, |
| "learning_rate": 9.583333333333334e-07, |
| "loss": 0.0007, |
| "reward": 1.795292854309082, |
| "reward_std": 0.045156918466091156, |
| "rewards/accuracy_reward": 0.7952930331230164, |
| "rewards/format_reward": 1.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.421875, |
| "epoch": 0.4375, |
| "grad_norm": 1.309348147777355, |
| "kl": 0.0162353515625, |
| "learning_rate": 9.5625e-07, |
| "loss": 0.0007, |
| "reward": 1.793769121170044, |
| "reward_std": 0.050124406814575195, |
| "rewards/accuracy_reward": 0.7963732481002808, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.89192962646484, |
| "epoch": 0.4583333333333333, |
| "grad_norm": 1.2889966905867243, |
| "kl": 0.017578125, |
| "learning_rate": 9.541666666666667e-07, |
| "loss": 0.0007, |
| "reward": 1.8341398239135742, |
| "reward_std": 0.04439392685890198, |
| "rewards/accuracy_reward": 0.8341398239135742, |
| "rewards/format_reward": 1.0, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.41796875, |
| "epoch": 0.4791666666666667, |
| "grad_norm": 1.4675439511923518, |
| "kl": 0.0167236328125, |
| "learning_rate": 9.520833333333333e-07, |
| "loss": 0.0007, |
| "reward": 1.836176872253418, |
| "reward_std": 0.046172261238098145, |
| "rewards/accuracy_reward": 0.836176872253418, |
| "rewards/format_reward": 1.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.67448425292969, |
| "epoch": 0.5, |
| "grad_norm": 2.1035799373033917, |
| "kl": 0.0203857421875, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0008, |
| "reward": 1.8129713535308838, |
| "reward_std": 0.04518614709377289, |
| "rewards/accuracy_reward": 0.812971293926239, |
| "rewards/format_reward": 1.0, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.63151550292969, |
| "epoch": 0.5208333333333334, |
| "grad_norm": 1.4208098502006166, |
| "kl": 0.0186767578125, |
| "learning_rate": 9.479166666666666e-07, |
| "loss": 0.0008, |
| "reward": 1.8406684398651123, |
| "reward_std": 0.041067786514759064, |
| "rewards/accuracy_reward": 0.8406683802604675, |
| "rewards/format_reward": 1.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.16927337646484, |
| "epoch": 0.5416666666666666, |
| "grad_norm": 1.7838214183915748, |
| "kl": 0.0203857421875, |
| "learning_rate": 9.458333333333333e-07, |
| "loss": 0.0008, |
| "reward": 1.803572177886963, |
| "reward_std": 0.04902785271406174, |
| "rewards/accuracy_reward": 0.8035721778869629, |
| "rewards/format_reward": 1.0, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.23046875, |
| "epoch": 0.5625, |
| "grad_norm": 1.2204814881350547, |
| "kl": 0.0211181640625, |
| "learning_rate": 9.4375e-07, |
| "loss": 0.0009, |
| "reward": 1.8056421279907227, |
| "reward_std": 0.04466244578361511, |
| "rewards/accuracy_reward": 0.8056421279907227, |
| "rewards/format_reward": 1.0, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.625, |
| "epoch": 0.5833333333333334, |
| "grad_norm": 1.802147920265982, |
| "kl": 0.0218505859375, |
| "learning_rate": 9.416666666666666e-07, |
| "loss": 0.0009, |
| "reward": 1.828833818435669, |
| "reward_std": 0.042848870158195496, |
| "rewards/accuracy_reward": 0.8288335800170898, |
| "rewards/format_reward": 1.0, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.18099212646484, |
| "epoch": 0.6041666666666666, |
| "grad_norm": 2.372355341881645, |
| "kl": 0.021240234375, |
| "learning_rate": 9.395833333333333e-07, |
| "loss": 0.0009, |
| "reward": 1.8150265216827393, |
| "reward_std": 0.04657554626464844, |
| "rewards/accuracy_reward": 0.8150264620780945, |
| "rewards/format_reward": 1.0, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.60807800292969, |
| "epoch": 0.625, |
| "grad_norm": 1.6452523523321965, |
| "kl": 0.022216796875, |
| "learning_rate": 9.374999999999999e-07, |
| "loss": 0.0009, |
| "reward": 1.8578990697860718, |
| "reward_std": 0.039906859397888184, |
| "rewards/accuracy_reward": 0.8578989505767822, |
| "rewards/format_reward": 1.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.49349212646484, |
| "epoch": 0.6458333333333334, |
| "grad_norm": 1.9283111710202978, |
| "kl": 0.022705078125, |
| "learning_rate": 9.354166666666667e-07, |
| "loss": 0.001, |
| "reward": 1.7941240072250366, |
| "reward_std": 0.046397458761930466, |
| "rewards/accuracy_reward": 0.7941240072250366, |
| "rewards/format_reward": 1.0, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.61589050292969, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 1.8161381749377306, |
| "kl": 0.02490234375, |
| "learning_rate": 9.333333333333333e-07, |
| "loss": 0.001, |
| "reward": 1.807027816772461, |
| "reward_std": 0.047066252678632736, |
| "rewards/accuracy_reward": 0.8070278167724609, |
| "rewards/format_reward": 1.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.55599212646484, |
| "epoch": 0.6875, |
| "grad_norm": 1.4095225702209093, |
| "kl": 0.028076171875, |
| "learning_rate": 9.3125e-07, |
| "loss": 0.0011, |
| "reward": 1.816405177116394, |
| "reward_std": 0.0405634380877018, |
| "rewards/accuracy_reward": 0.816405177116394, |
| "rewards/format_reward": 1.0, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.86589050292969, |
| "epoch": 0.7083333333333334, |
| "grad_norm": 1.9475471383587142, |
| "kl": 0.0264892578125, |
| "learning_rate": 9.291666666666666e-07, |
| "loss": 0.0011, |
| "reward": 1.8178232908248901, |
| "reward_std": 0.048409104347229004, |
| "rewards/accuracy_reward": 0.8191253542900085, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.81771087646484, |
| "epoch": 0.7291666666666666, |
| "grad_norm": 1.816929377906017, |
| "kl": 0.0250244140625, |
| "learning_rate": 9.270833333333333e-07, |
| "loss": 0.001, |
| "reward": 1.8112201690673828, |
| "reward_std": 0.04231969267129898, |
| "rewards/accuracy_reward": 0.811220109462738, |
| "rewards/format_reward": 1.0, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.69271087646484, |
| "epoch": 0.75, |
| "grad_norm": 1.9245265212018168, |
| "kl": 0.0260009765625, |
| "learning_rate": 9.25e-07, |
| "loss": 0.0011, |
| "reward": 1.800992727279663, |
| "reward_std": 0.04579651355743408, |
| "rewards/accuracy_reward": 0.8022947311401367, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.50911712646484, |
| "epoch": 0.7708333333333334, |
| "grad_norm": 2.3374928077264565, |
| "kl": 0.0274658203125, |
| "learning_rate": 9.229166666666667e-07, |
| "loss": 0.0011, |
| "reward": 1.8183115720748901, |
| "reward_std": 0.047783225774765015, |
| "rewards/accuracy_reward": 0.820915699005127, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.3515625, |
| "epoch": 0.7916666666666666, |
| "grad_norm": 1.959104932697817, |
| "kl": 0.0269775390625, |
| "learning_rate": 9.208333333333332e-07, |
| "loss": 0.0011, |
| "reward": 1.7997591495513916, |
| "reward_std": 0.04710128903388977, |
| "rewards/accuracy_reward": 0.8010611534118652, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.93489837646484, |
| "epoch": 0.8125, |
| "grad_norm": 5.405686552072785, |
| "kl": 0.023193359375, |
| "learning_rate": 9.187499999999999e-07, |
| "loss": 0.0009, |
| "reward": 1.8103388547897339, |
| "reward_std": 0.048775218427181244, |
| "rewards/accuracy_reward": 0.8129429817199707, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.42578125, |
| "epoch": 0.8333333333333334, |
| "grad_norm": 2.5109857220624874, |
| "kl": 0.02587890625, |
| "learning_rate": 9.166666666666665e-07, |
| "loss": 0.001, |
| "reward": 1.8131688833236694, |
| "reward_std": 0.045695092529058456, |
| "rewards/accuracy_reward": 0.8157729506492615, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.1015625, |
| "epoch": 0.8541666666666666, |
| "grad_norm": 1.3500957332806702, |
| "kl": 0.0267333984375, |
| "learning_rate": 9.145833333333333e-07, |
| "loss": 0.0011, |
| "reward": 1.8379626274108887, |
| "reward_std": 0.03906077891588211, |
| "rewards/accuracy_reward": 0.8392646312713623, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.87109375, |
| "epoch": 0.875, |
| "grad_norm": 1.9553905487315055, |
| "kl": 0.02734375, |
| "learning_rate": 9.124999999999999e-07, |
| "loss": 0.0011, |
| "reward": 1.8075612783432007, |
| "reward_std": 0.046475451439619064, |
| "rewards/accuracy_reward": 0.8075612783432007, |
| "rewards/format_reward": 1.0, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.93359375, |
| "epoch": 0.8958333333333334, |
| "grad_norm": 2.687075325377198, |
| "kl": 0.0262451171875, |
| "learning_rate": 9.104166666666666e-07, |
| "loss": 0.0011, |
| "reward": 1.8184566497802734, |
| "reward_std": 0.04643276333808899, |
| "rewards/accuracy_reward": 0.8197587728500366, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.43489837646484, |
| "epoch": 0.9166666666666666, |
| "grad_norm": 1.2273175474621774, |
| "kl": 0.026611328125, |
| "learning_rate": 9.083333333333332e-07, |
| "loss": 0.0011, |
| "reward": 1.7925729751586914, |
| "reward_std": 0.04880434274673462, |
| "rewards/accuracy_reward": 0.7938751578330994, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.70052337646484, |
| "epoch": 0.9375, |
| "grad_norm": 3.4979750979513184, |
| "kl": 0.0242919921875, |
| "learning_rate": 9.0625e-07, |
| "loss": 0.001, |
| "reward": 1.8129024505615234, |
| "reward_std": 0.04281633347272873, |
| "rewards/accuracy_reward": 0.8142046332359314, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.12239837646484, |
| "epoch": 0.9583333333333334, |
| "grad_norm": 1.581390275171433, |
| "kl": 0.024658203125, |
| "learning_rate": 9.041666666666667e-07, |
| "loss": 0.001, |
| "reward": 1.8339308500289917, |
| "reward_std": 0.0428236648440361, |
| "rewards/accuracy_reward": 0.8339308500289917, |
| "rewards/format_reward": 1.0, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.57410430908203, |
| "epoch": 0.9791666666666666, |
| "grad_norm": 1.2859639310341215, |
| "kl": 0.031982421875, |
| "learning_rate": 9.020833333333333e-07, |
| "loss": 0.0013, |
| "reward": 1.8121892213821411, |
| "reward_std": 0.043297141790390015, |
| "rewards/accuracy_reward": 0.8135243654251099, |
| "rewards/format_reward": 0.998664915561676, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.44271087646484, |
| "epoch": 1.0208333333333333, |
| "grad_norm": 2.3284880809656916, |
| "kl": 0.0240478515625, |
| "learning_rate": 9e-07, |
| "loss": 0.001, |
| "reward": 1.842546820640564, |
| "reward_std": 0.03681856021285057, |
| "rewards/accuracy_reward": 0.842546820640564, |
| "rewards/format_reward": 1.0, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.63932800292969, |
| "epoch": 1.0416666666666667, |
| "grad_norm": 11.066834298993447, |
| "kl": 0.0245361328125, |
| "learning_rate": 8.979166666666666e-07, |
| "loss": 0.001, |
| "reward": 1.810459852218628, |
| "reward_std": 0.04241730272769928, |
| "rewards/accuracy_reward": 0.8117618560791016, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.625, |
| "epoch": 1.0625, |
| "grad_norm": 3.9999069520932107, |
| "kl": 0.0284423828125, |
| "learning_rate": 8.958333333333334e-07, |
| "loss": 0.0011, |
| "reward": 1.8403337001800537, |
| "reward_std": 0.035806819796562195, |
| "rewards/accuracy_reward": 0.8403337001800537, |
| "rewards/format_reward": 1.0, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.18620300292969, |
| "epoch": 1.0833333333333333, |
| "grad_norm": 1.9543615318552672, |
| "kl": 0.0263671875, |
| "learning_rate": 8.9375e-07, |
| "loss": 0.0011, |
| "reward": 1.8314783573150635, |
| "reward_std": 0.03615511581301689, |
| "rewards/accuracy_reward": 0.8314781188964844, |
| "rewards/format_reward": 1.0, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.74349212646484, |
| "epoch": 1.1041666666666667, |
| "grad_norm": 1.7271064117720305, |
| "kl": 0.0284423828125, |
| "learning_rate": 8.916666666666667e-07, |
| "loss": 0.0012, |
| "reward": 1.8100008964538574, |
| "reward_std": 0.04172190651297569, |
| "rewards/accuracy_reward": 0.810001015663147, |
| "rewards/format_reward": 1.0, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.9609375, |
| "epoch": 1.125, |
| "grad_norm": 3.0099066254538225, |
| "kl": 0.0263671875, |
| "learning_rate": 8.895833333333332e-07, |
| "loss": 0.0011, |
| "reward": 1.8225668668746948, |
| "reward_std": 0.03787129372358322, |
| "rewards/accuracy_reward": 0.8225669860839844, |
| "rewards/format_reward": 1.0, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.16667175292969, |
| "epoch": 1.1458333333333333, |
| "grad_norm": 2.1966545517434746, |
| "kl": 0.02783203125, |
| "learning_rate": 8.874999999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.825528860092163, |
| "reward_std": 0.0393807552754879, |
| "rewards/accuracy_reward": 0.8255288004875183, |
| "rewards/format_reward": 1.0, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.93880462646484, |
| "epoch": 1.1666666666666667, |
| "grad_norm": 2.2377689992101195, |
| "kl": 0.0289306640625, |
| "learning_rate": 8.854166666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.84073007106781, |
| "reward_std": 0.036864347755908966, |
| "rewards/accuracy_reward": 0.8407299518585205, |
| "rewards/format_reward": 1.0, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.11198425292969, |
| "epoch": 1.1875, |
| "grad_norm": 1.4566342977805207, |
| "kl": 0.0250244140625, |
| "learning_rate": 8.833333333333333e-07, |
| "loss": 0.001, |
| "reward": 1.8056623935699463, |
| "reward_std": 0.04149676859378815, |
| "rewards/accuracy_reward": 0.8056623339653015, |
| "rewards/format_reward": 1.0, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.09765625, |
| "epoch": 1.2083333333333333, |
| "grad_norm": 6.235839920646965, |
| "kl": 0.0255126953125, |
| "learning_rate": 8.812499999999999e-07, |
| "loss": 0.0011, |
| "reward": 1.8528995513916016, |
| "reward_std": 0.033966001123189926, |
| "rewards/accuracy_reward": 0.8528995513916016, |
| "rewards/format_reward": 1.0, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.87630462646484, |
| "epoch": 1.2291666666666667, |
| "grad_norm": 1.5034574110083547, |
| "kl": 0.02978515625, |
| "learning_rate": 8.791666666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.835959792137146, |
| "reward_std": 0.02824896201491356, |
| "rewards/accuracy_reward": 0.8359596729278564, |
| "rewards/format_reward": 1.0, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.05599212646484, |
| "epoch": 1.25, |
| "grad_norm": 1.6494502522344965, |
| "kl": 0.0301513671875, |
| "learning_rate": 8.770833333333333e-07, |
| "loss": 0.0013, |
| "reward": 1.8419498205184937, |
| "reward_std": 0.03271109610795975, |
| "rewards/accuracy_reward": 0.8419498205184937, |
| "rewards/format_reward": 1.0, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.515625, |
| "epoch": 1.2708333333333333, |
| "grad_norm": 2.5168432356014954, |
| "kl": 0.02587890625, |
| "learning_rate": 8.75e-07, |
| "loss": 0.0011, |
| "reward": 1.8278319835662842, |
| "reward_std": 0.04185899719595909, |
| "rewards/accuracy_reward": 0.8278318643569946, |
| "rewards/format_reward": 1.0, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.421875, |
| "epoch": 1.2916666666666667, |
| "grad_norm": 1.3151086988925837, |
| "kl": 0.0242919921875, |
| "learning_rate": 8.729166666666666e-07, |
| "loss": 0.001, |
| "reward": 1.8187057971954346, |
| "reward_std": 0.03452453017234802, |
| "rewards/accuracy_reward": 0.8187057375907898, |
| "rewards/format_reward": 1.0, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.04948425292969, |
| "epoch": 1.3125, |
| "grad_norm": 2.810138538550519, |
| "kl": 0.02783203125, |
| "learning_rate": 8.708333333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.846640944480896, |
| "reward_std": 0.04408061131834984, |
| "rewards/accuracy_reward": 0.8492451906204224, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.23698425292969, |
| "epoch": 1.3333333333333333, |
| "grad_norm": 1.563896261562582, |
| "kl": 0.03076171875, |
| "learning_rate": 8.687499999999999e-07, |
| "loss": 0.0013, |
| "reward": 1.8380591869354248, |
| "reward_std": 0.03534460812807083, |
| "rewards/accuracy_reward": 0.8380589485168457, |
| "rewards/format_reward": 1.0, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.85286712646484, |
| "epoch": 1.3541666666666667, |
| "grad_norm": 3.020754504226419, |
| "kl": 0.0255126953125, |
| "learning_rate": 8.666666666666667e-07, |
| "loss": 0.0011, |
| "reward": 1.8550291061401367, |
| "reward_std": 0.037260740995407104, |
| "rewards/accuracy_reward": 0.8563313484191895, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.10026550292969, |
| "epoch": 1.375, |
| "grad_norm": 3.306677048985413, |
| "kl": 0.026123046875, |
| "learning_rate": 8.645833333333333e-07, |
| "loss": 0.0011, |
| "reward": 1.8508269786834717, |
| "reward_std": 0.0380985364317894, |
| "rewards/accuracy_reward": 0.8508269190788269, |
| "rewards/format_reward": 1.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.24870300292969, |
| "epoch": 1.3958333333333333, |
| "grad_norm": 1.6070004388732007, |
| "kl": 0.0263671875, |
| "learning_rate": 8.625e-07, |
| "loss": 0.0011, |
| "reward": 1.857025146484375, |
| "reward_std": 0.03701246529817581, |
| "rewards/accuracy_reward": 0.857025146484375, |
| "rewards/format_reward": 1.0, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.89323425292969, |
| "epoch": 1.4166666666666667, |
| "grad_norm": 2.3548695186042186, |
| "kl": 0.02783203125, |
| "learning_rate": 8.604166666666667e-07, |
| "loss": 0.0011, |
| "reward": 1.8029731512069702, |
| "reward_std": 0.042077165096998215, |
| "rewards/accuracy_reward": 0.8055772185325623, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.38671875, |
| "epoch": 1.4375, |
| "grad_norm": 1.278138762095491, |
| "kl": 0.0228271484375, |
| "learning_rate": 8.583333333333332e-07, |
| "loss": 0.0009, |
| "reward": 1.8338496685028076, |
| "reward_std": 0.039083532989025116, |
| "rewards/accuracy_reward": 0.8338495492935181, |
| "rewards/format_reward": 1.0, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.65495300292969, |
| "epoch": 1.4583333333333333, |
| "grad_norm": 2.070944061321321, |
| "kl": 0.0284423828125, |
| "learning_rate": 8.5625e-07, |
| "loss": 0.0012, |
| "reward": 1.8258914947509766, |
| "reward_std": 0.03814253211021423, |
| "rewards/accuracy_reward": 0.8271937370300293, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.45442962646484, |
| "epoch": 1.4791666666666667, |
| "grad_norm": 1.3471433182606216, |
| "kl": 0.0281982421875, |
| "learning_rate": 8.541666666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.8398675918579102, |
| "reward_std": 0.0354573093354702, |
| "rewards/accuracy_reward": 0.8398677706718445, |
| "rewards/format_reward": 1.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.04427337646484, |
| "epoch": 1.5, |
| "grad_norm": 1.6026685707149524, |
| "kl": 0.02880859375, |
| "learning_rate": 8.520833333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.853353500366211, |
| "reward_std": 0.03885906934738159, |
| "rewards/accuracy_reward": 0.8546554446220398, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.84635925292969, |
| "epoch": 1.5208333333333335, |
| "grad_norm": 2.1398544036186014, |
| "kl": 0.02783203125, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.8261979818344116, |
| "reward_std": 0.03391870856285095, |
| "rewards/accuracy_reward": 0.8261978626251221, |
| "rewards/format_reward": 1.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.69271087646484, |
| "epoch": 1.5416666666666665, |
| "grad_norm": 4.020748588139163, |
| "kl": 0.025634765625, |
| "learning_rate": 8.479166666666667e-07, |
| "loss": 0.0011, |
| "reward": 1.8563817739486694, |
| "reward_std": 0.031248420476913452, |
| "rewards/accuracy_reward": 0.8563817739486694, |
| "rewards/format_reward": 1.0, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.48046875, |
| "epoch": 1.5625, |
| "grad_norm": 1.3740118540705408, |
| "kl": 0.0291748046875, |
| "learning_rate": 8.458333333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.8323755264282227, |
| "reward_std": 0.03359724208712578, |
| "rewards/accuracy_reward": 0.8323755264282227, |
| "rewards/format_reward": 1.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.44792175292969, |
| "epoch": 1.5833333333333335, |
| "grad_norm": 2.2940140494468553, |
| "kl": 0.026611328125, |
| "learning_rate": 8.4375e-07, |
| "loss": 0.0011, |
| "reward": 1.8279385566711426, |
| "reward_std": 0.03183002024888992, |
| "rewards/accuracy_reward": 0.8279385566711426, |
| "rewards/format_reward": 1.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.87890625, |
| "epoch": 1.6041666666666665, |
| "grad_norm": 1.9014935653138212, |
| "kl": 0.0299072265625, |
| "learning_rate": 8.416666666666666e-07, |
| "loss": 0.0013, |
| "reward": 1.8360404968261719, |
| "reward_std": 0.02742108330130577, |
| "rewards/accuracy_reward": 0.8360404372215271, |
| "rewards/format_reward": 1.0, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.87239837646484, |
| "epoch": 1.625, |
| "grad_norm": 1.7025904896492243, |
| "kl": 0.02587890625, |
| "learning_rate": 8.395833333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.7988513708114624, |
| "reward_std": 0.035461340099573135, |
| "rewards/accuracy_reward": 0.7988513708114624, |
| "rewards/format_reward": 1.0, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.25651550292969, |
| "epoch": 1.6458333333333335, |
| "grad_norm": 2.0737766177718564, |
| "kl": 0.0262451171875, |
| "learning_rate": 8.375e-07, |
| "loss": 0.0011, |
| "reward": 1.8587430715560913, |
| "reward_std": 0.031236987560987473, |
| "rewards/accuracy_reward": 0.8600451946258545, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.46875, |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.8776151359500948, |
| "kl": 0.0283203125, |
| "learning_rate": 8.354166666666667e-07, |
| "loss": 0.0012, |
| "reward": 1.8535531759262085, |
| "reward_std": 0.03250068426132202, |
| "rewards/accuracy_reward": 0.853553056716919, |
| "rewards/format_reward": 1.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.76692962646484, |
| "epoch": 1.6875, |
| "grad_norm": 1.4792182225675727, |
| "kl": 0.0274658203125, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.8359715938568115, |
| "reward_std": 0.027499686926603317, |
| "rewards/accuracy_reward": 0.8359713554382324, |
| "rewards/format_reward": 1.0, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.09114837646484, |
| "epoch": 1.7083333333333335, |
| "grad_norm": 1.429067785258255, |
| "kl": 0.034423828125, |
| "learning_rate": 8.3125e-07, |
| "loss": 0.0015, |
| "reward": 1.8376563787460327, |
| "reward_std": 0.03089229390025139, |
| "rewards/accuracy_reward": 0.8389585018157959, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.81901550292969, |
| "epoch": 1.7291666666666665, |
| "grad_norm": 1.7947286530222653, |
| "kl": 0.0269775390625, |
| "learning_rate": 8.291666666666666e-07, |
| "loss": 0.0011, |
| "reward": 1.8310506343841553, |
| "reward_std": 0.0325641892850399, |
| "rewards/accuracy_reward": 0.8323527574539185, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.32292175292969, |
| "epoch": 1.75, |
| "grad_norm": 1.840921164440233, |
| "kl": 0.028076171875, |
| "learning_rate": 8.270833333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.8312219381332397, |
| "reward_std": 0.029772888869047165, |
| "rewards/accuracy_reward": 0.8312219381332397, |
| "rewards/format_reward": 1.0, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.17839050292969, |
| "epoch": 1.7708333333333335, |
| "grad_norm": 1.8438240192624837, |
| "kl": 0.02685546875, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.8248231410980225, |
| "reward_std": 0.03228841722011566, |
| "rewards/accuracy_reward": 0.8248231410980225, |
| "rewards/format_reward": 1.0, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.45703125, |
| "epoch": 1.7916666666666665, |
| "grad_norm": 1.529693340603501, |
| "kl": 0.028076171875, |
| "learning_rate": 8.229166666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.8117046356201172, |
| "reward_std": 0.03222049027681351, |
| "rewards/accuracy_reward": 0.8117045760154724, |
| "rewards/format_reward": 1.0, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.87630462646484, |
| "epoch": 1.8125, |
| "grad_norm": 3.9505026027468504, |
| "kl": 0.031494140625, |
| "learning_rate": 8.208333333333332e-07, |
| "loss": 0.0013, |
| "reward": 1.814018726348877, |
| "reward_std": 0.031958386301994324, |
| "rewards/accuracy_reward": 0.814018726348877, |
| "rewards/format_reward": 1.0, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.60807800292969, |
| "epoch": 1.8333333333333335, |
| "grad_norm": 9.21528924853295, |
| "kl": 0.0274658203125, |
| "learning_rate": 8.187499999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.8301608562469482, |
| "reward_std": 0.02933676168322563, |
| "rewards/accuracy_reward": 0.8301607370376587, |
| "rewards/format_reward": 1.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.74089050292969, |
| "epoch": 1.8541666666666665, |
| "grad_norm": 1.329936418308406, |
| "kl": 0.03173828125, |
| "learning_rate": 8.166666666666666e-07, |
| "loss": 0.0013, |
| "reward": 1.8375142812728882, |
| "reward_std": 0.03457921743392944, |
| "rewards/accuracy_reward": 0.8375141024589539, |
| "rewards/format_reward": 1.0, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.92578125, |
| "epoch": 1.875, |
| "grad_norm": 1.3218491918211248, |
| "kl": 0.029541015625, |
| "learning_rate": 8.145833333333333e-07, |
| "loss": 0.0013, |
| "reward": 1.8522485494613647, |
| "reward_std": 0.028189565986394882, |
| "rewards/accuracy_reward": 0.8522485494613647, |
| "rewards/format_reward": 1.0, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.46354675292969, |
| "epoch": 1.8958333333333335, |
| "grad_norm": 2.094729152422118, |
| "kl": 0.0283203125, |
| "learning_rate": 8.125e-07, |
| "loss": 0.0013, |
| "reward": 1.8424299955368042, |
| "reward_std": 0.034867409616708755, |
| "rewards/accuracy_reward": 0.8424299955368042, |
| "rewards/format_reward": 1.0, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.65364837646484, |
| "epoch": 1.9166666666666665, |
| "grad_norm": 7.368992349088265, |
| "kl": 0.0283203125, |
| "learning_rate": 8.104166666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.855029582977295, |
| "reward_std": 0.027775254100561142, |
| "rewards/accuracy_reward": 0.8550295233726501, |
| "rewards/format_reward": 1.0, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.18880462646484, |
| "epoch": 1.9375, |
| "grad_norm": 2.0045320605738333, |
| "kl": 0.0341796875, |
| "learning_rate": 8.083333333333334e-07, |
| "loss": 0.0015, |
| "reward": 1.8531224727630615, |
| "reward_std": 0.02575305663049221, |
| "rewards/accuracy_reward": 0.853122353553772, |
| "rewards/format_reward": 1.0, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.109375, |
| "epoch": 1.9583333333333335, |
| "grad_norm": 1.9199651103984947, |
| "kl": 0.02587890625, |
| "learning_rate": 8.0625e-07, |
| "loss": 0.0011, |
| "reward": 1.8517922163009644, |
| "reward_std": 0.02801516279578209, |
| "rewards/accuracy_reward": 0.8517922163009644, |
| "rewards/format_reward": 1.0, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.6648941040039, |
| "epoch": 1.9791666666666665, |
| "grad_norm": 1.2125015465439164, |
| "kl": 0.025146484375, |
| "learning_rate": 8.041666666666667e-07, |
| "loss": 0.0011, |
| "reward": 1.8468000888824463, |
| "reward_std": 0.027415748685598373, |
| "rewards/accuracy_reward": 0.8467998504638672, |
| "rewards/format_reward": 1.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.87109375, |
| "epoch": 2.0208333333333335, |
| "grad_norm": 2.893832337696663, |
| "kl": 0.0240478515625, |
| "learning_rate": 8.020833333333333e-07, |
| "loss": 0.0011, |
| "reward": 1.8671659231185913, |
| "reward_std": 0.03437124937772751, |
| "rewards/accuracy_reward": 0.8684679865837097, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.80729675292969, |
| "epoch": 2.0416666666666665, |
| "grad_norm": 2.4656073464269124, |
| "kl": 0.028564453125, |
| "learning_rate": 8e-07, |
| "loss": 0.0013, |
| "reward": 1.8920382261276245, |
| "reward_std": 0.024690520018339157, |
| "rewards/accuracy_reward": 0.8933402895927429, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.49739837646484, |
| "epoch": 2.0625, |
| "grad_norm": 2.8323749391253124, |
| "kl": 0.025390625, |
| "learning_rate": 7.979166666666667e-07, |
| "loss": 0.001, |
| "reward": 1.8716254234313965, |
| "reward_std": 0.028798673301935196, |
| "rewards/accuracy_reward": 0.8716254234313965, |
| "rewards/format_reward": 1.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.86979675292969, |
| "epoch": 2.0833333333333335, |
| "grad_norm": 1.842256938186257, |
| "kl": 0.0303955078125, |
| "learning_rate": 7.958333333333333e-07, |
| "loss": 0.0013, |
| "reward": 1.849453330039978, |
| "reward_std": 0.033137306571006775, |
| "rewards/accuracy_reward": 0.849453330039978, |
| "rewards/format_reward": 1.0, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.98828125, |
| "epoch": 2.1041666666666665, |
| "grad_norm": 1.625206074387942, |
| "kl": 0.02783203125, |
| "learning_rate": 7.937499999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.864532232284546, |
| "reward_std": 0.02849118784070015, |
| "rewards/accuracy_reward": 0.8645319938659668, |
| "rewards/format_reward": 1.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.125, |
| "epoch": 2.125, |
| "grad_norm": 1.8687654433880987, |
| "kl": 0.02880859375, |
| "learning_rate": 7.916666666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.8906430006027222, |
| "reward_std": 0.02808341383934021, |
| "rewards/accuracy_reward": 0.8906428813934326, |
| "rewards/format_reward": 1.0, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.30859375, |
| "epoch": 2.1458333333333335, |
| "grad_norm": 1.812765837401351, |
| "kl": 0.0341796875, |
| "learning_rate": 7.895833333333332e-07, |
| "loss": 0.0015, |
| "reward": 1.84462308883667, |
| "reward_std": 0.02905876934528351, |
| "rewards/accuracy_reward": 0.8446230292320251, |
| "rewards/format_reward": 1.0, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.15625, |
| "epoch": 2.1666666666666665, |
| "grad_norm": 1.3199461131282353, |
| "kl": 0.04736328125, |
| "learning_rate": 7.875e-07, |
| "loss": 0.002, |
| "reward": 1.881453275680542, |
| "reward_std": 0.025718865916132927, |
| "rewards/accuracy_reward": 0.8814532160758972, |
| "rewards/format_reward": 1.0, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.74739837646484, |
| "epoch": 2.1875, |
| "grad_norm": 1.4379737344223804, |
| "kl": 0.0257568359375, |
| "learning_rate": 7.854166666666666e-07, |
| "loss": 0.0011, |
| "reward": 1.852853775024414, |
| "reward_std": 0.03441212326288223, |
| "rewards/accuracy_reward": 0.8541558980941772, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.17839050292969, |
| "epoch": 2.2083333333333335, |
| "grad_norm": 1.4747961451326568, |
| "kl": 0.02587890625, |
| "learning_rate": 7.833333333333333e-07, |
| "loss": 0.0011, |
| "reward": 1.8564555644989014, |
| "reward_std": 0.03334889933466911, |
| "rewards/accuracy_reward": 0.8564555048942566, |
| "rewards/format_reward": 1.0, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.84114837646484, |
| "epoch": 2.2291666666666665, |
| "grad_norm": 1.595566840318838, |
| "kl": 0.0277099609375, |
| "learning_rate": 7.812499999999999e-07, |
| "loss": 0.0012, |
| "reward": 1.847129225730896, |
| "reward_std": 0.034906916320323944, |
| "rewards/accuracy_reward": 0.847129225730896, |
| "rewards/format_reward": 1.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.56640625, |
| "epoch": 2.25, |
| "grad_norm": 1.7252491389020819, |
| "kl": 0.029296875, |
| "learning_rate": 7.791666666666667e-07, |
| "loss": 0.0012, |
| "reward": 1.8392118215560913, |
| "reward_std": 0.032303862273693085, |
| "rewards/accuracy_reward": 0.8392118811607361, |
| "rewards/format_reward": 1.0, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.73046875, |
| "epoch": 2.2708333333333335, |
| "grad_norm": 1.3233610087560426, |
| "kl": 0.027587890625, |
| "learning_rate": 7.770833333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.8880364894866943, |
| "reward_std": 0.0294729545712471, |
| "rewards/accuracy_reward": 0.8880362510681152, |
| "rewards/format_reward": 1.0, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.72135925292969, |
| "epoch": 2.2916666666666665, |
| "grad_norm": 1.3388400280888855, |
| "kl": 0.0308837890625, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0013, |
| "reward": 1.841947078704834, |
| "reward_std": 0.03206552192568779, |
| "rewards/accuracy_reward": 0.8419471979141235, |
| "rewards/format_reward": 1.0, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.37239837646484, |
| "epoch": 2.3125, |
| "grad_norm": 1.6808004103234182, |
| "kl": 0.0301513671875, |
| "learning_rate": 7.729166666666666e-07, |
| "loss": 0.0012, |
| "reward": 1.872849464416504, |
| "reward_std": 0.028602521866559982, |
| "rewards/accuracy_reward": 0.8728495836257935, |
| "rewards/format_reward": 1.0, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.41536712646484, |
| "epoch": 2.3333333333333335, |
| "grad_norm": 1.1449720839601671, |
| "kl": 0.0284423828125, |
| "learning_rate": 7.708333333333333e-07, |
| "loss": 0.0012, |
| "reward": 1.8464049100875854, |
| "reward_std": 0.027897782623767853, |
| "rewards/accuracy_reward": 0.8464047908782959, |
| "rewards/format_reward": 1.0, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.18620300292969, |
| "epoch": 2.3541666666666665, |
| "grad_norm": 1.651982521267855, |
| "kl": 0.0279541015625, |
| "learning_rate": 7.6875e-07, |
| "loss": 0.0012, |
| "reward": 1.859885573387146, |
| "reward_std": 0.03053418919444084, |
| "rewards/accuracy_reward": 0.8611876368522644, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.31901550292969, |
| "epoch": 2.375, |
| "grad_norm": 1.273847905587726, |
| "kl": 0.03125, |
| "learning_rate": 7.666666666666667e-07, |
| "loss": 0.0014, |
| "reward": 1.8361539840698242, |
| "reward_std": 0.03006243333220482, |
| "rewards/accuracy_reward": 0.8374561667442322, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.92578125, |
| "epoch": 2.3958333333333335, |
| "grad_norm": 1.1387571840491482, |
| "kl": 0.0301513671875, |
| "learning_rate": 7.645833333333332e-07, |
| "loss": 0.0013, |
| "reward": 1.866465449333191, |
| "reward_std": 0.023312915116548538, |
| "rewards/accuracy_reward": 0.8664655089378357, |
| "rewards/format_reward": 1.0, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.92708587646484, |
| "epoch": 2.4166666666666665, |
| "grad_norm": 1.3560724890382403, |
| "kl": 0.029541015625, |
| "learning_rate": 7.624999999999999e-07, |
| "loss": 0.0013, |
| "reward": 1.8626346588134766, |
| "reward_std": 0.025532353669404984, |
| "rewards/accuracy_reward": 0.8626348376274109, |
| "rewards/format_reward": 1.0, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.17448425292969, |
| "epoch": 2.4375, |
| "grad_norm": 1.3842406282593351, |
| "kl": 0.03076171875, |
| "learning_rate": 7.604166666666666e-07, |
| "loss": 0.0013, |
| "reward": 1.8882970809936523, |
| "reward_std": 0.022818906232714653, |
| "rewards/accuracy_reward": 0.8882970809936523, |
| "rewards/format_reward": 1.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.09375, |
| "epoch": 2.4583333333333335, |
| "grad_norm": 1.4789503320621176, |
| "kl": 0.03271484375, |
| "learning_rate": 7.583333333333333e-07, |
| "loss": 0.0014, |
| "reward": 1.86592435836792, |
| "reward_std": 0.025339588522911072, |
| "rewards/accuracy_reward": 0.8659243583679199, |
| "rewards/format_reward": 1.0, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.37890625, |
| "epoch": 2.4791666666666665, |
| "grad_norm": 2.1260312341423484, |
| "kl": 0.0281982421875, |
| "learning_rate": 7.5625e-07, |
| "loss": 0.0012, |
| "reward": 1.8531742095947266, |
| "reward_std": 0.025770537555217743, |
| "rewards/accuracy_reward": 0.8531742095947266, |
| "rewards/format_reward": 1.0, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.8828125, |
| "epoch": 2.5, |
| "grad_norm": 8.068389238759387, |
| "kl": 0.0341796875, |
| "learning_rate": 7.541666666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.8539113998413086, |
| "reward_std": 0.026992302387952805, |
| "rewards/accuracy_reward": 0.8552135825157166, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.63932800292969, |
| "epoch": 2.5208333333333335, |
| "grad_norm": 4.211053430432126, |
| "kl": 0.03662109375, |
| "learning_rate": 7.520833333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.8711378574371338, |
| "reward_std": 0.033812545239925385, |
| "rewards/accuracy_reward": 0.8724400401115417, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.11979675292969, |
| "epoch": 2.5416666666666665, |
| "grad_norm": 1.9127642747673794, |
| "kl": 0.034423828125, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0014, |
| "reward": 1.8767274618148804, |
| "reward_std": 0.03441750630736351, |
| "rewards/accuracy_reward": 0.8793315887451172, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.91796875, |
| "epoch": 2.5625, |
| "grad_norm": 2.7351925655744354, |
| "kl": 0.03515625, |
| "learning_rate": 7.479166666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.848404884338379, |
| "reward_std": 0.02676878124475479, |
| "rewards/accuracy_reward": 0.8484048843383789, |
| "rewards/format_reward": 1.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.83203125, |
| "epoch": 2.5833333333333335, |
| "grad_norm": 1.9431827631537368, |
| "kl": 0.037841796875, |
| "learning_rate": 7.458333333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.8470710515975952, |
| "reward_std": 0.03329627588391304, |
| "rewards/accuracy_reward": 0.8496752977371216, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.77474212646484, |
| "epoch": 2.6041666666666665, |
| "grad_norm": 1.5150916718262497, |
| "kl": 0.04296875, |
| "learning_rate": 7.4375e-07, |
| "loss": 0.0017, |
| "reward": 1.8823201656341553, |
| "reward_std": 0.028970589861273766, |
| "rewards/accuracy_reward": 0.8823199272155762, |
| "rewards/format_reward": 1.0, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.453125, |
| "epoch": 2.625, |
| "grad_norm": 1.349104952050238, |
| "kl": 0.037841796875, |
| "learning_rate": 7.416666666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.8449368476867676, |
| "reward_std": 0.040684543550014496, |
| "rewards/accuracy_reward": 0.8462389707565308, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.18620300292969, |
| "epoch": 2.6458333333333335, |
| "grad_norm": 1.3525153623022323, |
| "kl": 0.041748046875, |
| "learning_rate": 7.395833333333334e-07, |
| "loss": 0.0018, |
| "reward": 1.8602406978607178, |
| "reward_std": 0.028682291507720947, |
| "rewards/accuracy_reward": 0.8628449440002441, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.31771087646484, |
| "epoch": 2.6666666666666665, |
| "grad_norm": 3.3377952313667607, |
| "kl": 0.03515625, |
| "learning_rate": 7.375e-07, |
| "loss": 0.0015, |
| "reward": 1.8633064031600952, |
| "reward_std": 0.029187675565481186, |
| "rewards/accuracy_reward": 0.8633064031600952, |
| "rewards/format_reward": 1.0, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.09765625, |
| "epoch": 2.6875, |
| "grad_norm": 3.295416813891869, |
| "kl": 0.034912109375, |
| "learning_rate": 7.354166666666667e-07, |
| "loss": 0.0014, |
| "reward": 1.8418749570846558, |
| "reward_std": 0.030345208942890167, |
| "rewards/accuracy_reward": 0.8418749570846558, |
| "rewards/format_reward": 1.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.78646087646484, |
| "epoch": 2.7083333333333335, |
| "grad_norm": 1.8029248151985298, |
| "kl": 0.038330078125, |
| "learning_rate": 7.333333333333332e-07, |
| "loss": 0.0016, |
| "reward": 1.878383994102478, |
| "reward_std": 0.030093541368842125, |
| "rewards/accuracy_reward": 0.8796859979629517, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.72526550292969, |
| "epoch": 2.7291666666666665, |
| "grad_norm": 4.026175515300724, |
| "kl": 0.037841796875, |
| "learning_rate": 7.312499999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8830840587615967, |
| "reward_std": 0.03116484545171261, |
| "rewards/accuracy_reward": 0.8830841779708862, |
| "rewards/format_reward": 1.0, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.31510925292969, |
| "epoch": 2.75, |
| "grad_norm": 3.247277440713756, |
| "kl": 0.036865234375, |
| "learning_rate": 7.291666666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.88235604763031, |
| "reward_std": 0.03274049982428551, |
| "rewards/accuracy_reward": 0.8836580514907837, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.67708587646484, |
| "epoch": 2.7708333333333335, |
| "grad_norm": 2.3589645345270296, |
| "kl": 0.036376953125, |
| "learning_rate": 7.270833333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8647716045379639, |
| "reward_std": 0.037114016711711884, |
| "rewards/accuracy_reward": 0.8673758506774902, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.82292175292969, |
| "epoch": 2.7916666666666665, |
| "grad_norm": 1.3075748353830943, |
| "kl": 0.038330078125, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8723247051239014, |
| "reward_std": 0.042349301278591156, |
| "rewards/accuracy_reward": 0.874928891658783, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.65885925292969, |
| "epoch": 2.8125, |
| "grad_norm": 1.7497378920781035, |
| "kl": 0.041259765625, |
| "learning_rate": 7.229166666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.870335578918457, |
| "reward_std": 0.03280794620513916, |
| "rewards/accuracy_reward": 0.8703355193138123, |
| "rewards/format_reward": 1.0, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.25651550292969, |
| "epoch": 2.8333333333333335, |
| "grad_norm": 1.8805437427920557, |
| "kl": 0.037841796875, |
| "learning_rate": 7.208333333333332e-07, |
| "loss": 0.0016, |
| "reward": 1.8854488134384155, |
| "reward_std": 0.030766207724809647, |
| "rewards/accuracy_reward": 0.8867508769035339, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.58333587646484, |
| "epoch": 2.8541666666666665, |
| "grad_norm": 1.9097670028819336, |
| "kl": 0.034423828125, |
| "learning_rate": 7.1875e-07, |
| "loss": 0.0014, |
| "reward": 1.8947999477386475, |
| "reward_std": 0.02784878760576248, |
| "rewards/accuracy_reward": 0.8961019515991211, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.38802337646484, |
| "epoch": 2.875, |
| "grad_norm": 2.148757546537433, |
| "kl": 0.035888671875, |
| "learning_rate": 7.166666666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.867633581161499, |
| "reward_std": 0.027851156890392303, |
| "rewards/accuracy_reward": 0.867633581161499, |
| "rewards/format_reward": 1.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.4921875, |
| "epoch": 2.8958333333333335, |
| "grad_norm": 1.68972106618665, |
| "kl": 0.031982421875, |
| "learning_rate": 7.145833333333333e-07, |
| "loss": 0.0013, |
| "reward": 1.8540122509002686, |
| "reward_std": 0.03191521018743515, |
| "rewards/accuracy_reward": 0.854012131690979, |
| "rewards/format_reward": 1.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.47135925292969, |
| "epoch": 2.9166666666666665, |
| "grad_norm": 4.753459469453496, |
| "kl": 0.037353515625, |
| "learning_rate": 7.125e-07, |
| "loss": 0.0015, |
| "reward": 1.8603503704071045, |
| "reward_std": 0.03011004999279976, |
| "rewards/accuracy_reward": 0.8603503704071045, |
| "rewards/format_reward": 1.0, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.7109375, |
| "epoch": 2.9375, |
| "grad_norm": 3.309769561279882, |
| "kl": 0.03369140625, |
| "learning_rate": 7.104166666666667e-07, |
| "loss": 0.0014, |
| "reward": 1.8809528350830078, |
| "reward_std": 0.029512833803892136, |
| "rewards/accuracy_reward": 0.8809528350830078, |
| "rewards/format_reward": 1.0, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.109375, |
| "epoch": 2.9583333333333335, |
| "grad_norm": 1.5596309697590072, |
| "kl": 0.034912109375, |
| "learning_rate": 7.083333333333334e-07, |
| "loss": 0.0014, |
| "reward": 1.8926336765289307, |
| "reward_std": 0.029566586017608643, |
| "rewards/accuracy_reward": 0.8926336169242859, |
| "rewards/format_reward": 1.0, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.12683868408203, |
| "epoch": 2.9791666666666665, |
| "grad_norm": 2.3377448744574503, |
| "kl": 0.035400390625, |
| "learning_rate": 7.0625e-07, |
| "loss": 0.0014, |
| "reward": 1.870600938796997, |
| "reward_std": 0.028039831668138504, |
| "rewards/accuracy_reward": 0.8706008791923523, |
| "rewards/format_reward": 1.0, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.96745300292969, |
| "epoch": 3.0208333333333335, |
| "grad_norm": 1.2246745731883282, |
| "kl": 0.031494140625, |
| "learning_rate": 7.041666666666667e-07, |
| "loss": 0.0013, |
| "reward": 1.9021186828613281, |
| "reward_std": 0.023153727874159813, |
| "rewards/accuracy_reward": 0.9021186828613281, |
| "rewards/format_reward": 1.0, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.39974212646484, |
| "epoch": 3.0416666666666665, |
| "grad_norm": 2.052509803568965, |
| "kl": 0.03564453125, |
| "learning_rate": 7.020833333333332e-07, |
| "loss": 0.0015, |
| "reward": 1.875624656677246, |
| "reward_std": 0.026206960901618004, |
| "rewards/accuracy_reward": 0.876926839351654, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.80208587646484, |
| "epoch": 3.0625, |
| "grad_norm": 2.354461875696106, |
| "kl": 0.033203125, |
| "learning_rate": 7e-07, |
| "loss": 0.0014, |
| "reward": 1.8924367427825928, |
| "reward_std": 0.027437550947070122, |
| "rewards/accuracy_reward": 0.8924366235733032, |
| "rewards/format_reward": 1.0, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.94792175292969, |
| "epoch": 3.0833333333333335, |
| "grad_norm": 2.0470904135861314, |
| "kl": 0.03759765625, |
| "learning_rate": 6.979166666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.8674253225326538, |
| "reward_std": 0.03383837640285492, |
| "rewards/accuracy_reward": 0.8687273263931274, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.73177337646484, |
| "epoch": 3.1041666666666665, |
| "grad_norm": 2.4829007525918714, |
| "kl": 0.035400390625, |
| "learning_rate": 6.958333333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8920822143554688, |
| "reward_std": 0.027063176035881042, |
| "rewards/accuracy_reward": 0.8933842182159424, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.31380462646484, |
| "epoch": 3.125, |
| "grad_norm": 1.8187259240411606, |
| "kl": 0.03564453125, |
| "learning_rate": 6.937499999999999e-07, |
| "loss": 0.0015, |
| "reward": 1.8620394468307495, |
| "reward_std": 0.026920508593320847, |
| "rewards/accuracy_reward": 0.86203932762146, |
| "rewards/format_reward": 1.0, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.4765625, |
| "epoch": 3.1458333333333335, |
| "grad_norm": 2.130554923785352, |
| "kl": 0.038818359375, |
| "learning_rate": 6.916666666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.881149172782898, |
| "reward_std": 0.03235545754432678, |
| "rewards/accuracy_reward": 0.8837532997131348, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.44271087646484, |
| "epoch": 3.1666666666666665, |
| "grad_norm": 1.3415308591991317, |
| "kl": 0.041015625, |
| "learning_rate": 6.895833333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.8772742748260498, |
| "reward_std": 0.03051435947418213, |
| "rewards/accuracy_reward": 0.8785762190818787, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.02604675292969, |
| "epoch": 3.1875, |
| "grad_norm": 3.61409411357304, |
| "kl": 0.035400390625, |
| "learning_rate": 6.875e-07, |
| "loss": 0.0015, |
| "reward": 1.8919541835784912, |
| "reward_std": 0.025183459743857384, |
| "rewards/accuracy_reward": 0.8919543027877808, |
| "rewards/format_reward": 1.0, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.07161712646484, |
| "epoch": 3.2083333333333335, |
| "grad_norm": 1.6998976071316378, |
| "kl": 0.03955078125, |
| "learning_rate": 6.854166666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.8747581243515015, |
| "reward_std": 0.0327390655875206, |
| "rewards/accuracy_reward": 0.8760601878166199, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.97396087646484, |
| "epoch": 3.2291666666666665, |
| "grad_norm": 4.510995755729955, |
| "kl": 0.034912109375, |
| "learning_rate": 6.833333333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8948159217834473, |
| "reward_std": 0.024841880425810814, |
| "rewards/accuracy_reward": 0.8948158621788025, |
| "rewards/format_reward": 1.0, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.51432800292969, |
| "epoch": 3.25, |
| "grad_norm": 1.434499918763638, |
| "kl": 0.03515625, |
| "learning_rate": 6.8125e-07, |
| "loss": 0.0015, |
| "reward": 1.9077692031860352, |
| "reward_std": 0.02048024721443653, |
| "rewards/accuracy_reward": 0.9077692031860352, |
| "rewards/format_reward": 1.0, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.29948425292969, |
| "epoch": 3.2708333333333335, |
| "grad_norm": 2.6480453809741307, |
| "kl": 0.03662109375, |
| "learning_rate": 6.791666666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.8736419677734375, |
| "reward_std": 0.026421895250678062, |
| "rewards/accuracy_reward": 0.8736419677734375, |
| "rewards/format_reward": 1.0, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.30339050292969, |
| "epoch": 3.2916666666666665, |
| "grad_norm": 1.6157614309136974, |
| "kl": 0.034423828125, |
| "learning_rate": 6.770833333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8868639469146729, |
| "reward_std": 0.02565614879131317, |
| "rewards/accuracy_reward": 0.8868638873100281, |
| "rewards/format_reward": 1.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.47917175292969, |
| "epoch": 3.3125, |
| "grad_norm": 3.614042738523906, |
| "kl": 0.03125, |
| "learning_rate": 6.75e-07, |
| "loss": 0.0014, |
| "reward": 1.854689598083496, |
| "reward_std": 0.026655998080968857, |
| "rewards/accuracy_reward": 0.8546894788742065, |
| "rewards/format_reward": 1.0, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.14714050292969, |
| "epoch": 3.3333333333333335, |
| "grad_norm": 3.6282010355717285, |
| "kl": 0.035400390625, |
| "learning_rate": 6.729166666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.8728668689727783, |
| "reward_std": 0.02623789571225643, |
| "rewards/accuracy_reward": 0.8728668093681335, |
| "rewards/format_reward": 1.0, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.16146087646484, |
| "epoch": 3.3541666666666665, |
| "grad_norm": 3.096302314051656, |
| "kl": 0.07373046875, |
| "learning_rate": 6.708333333333333e-07, |
| "loss": 0.003, |
| "reward": 1.8780524730682373, |
| "reward_std": 0.02697448432445526, |
| "rewards/accuracy_reward": 0.8780522346496582, |
| "rewards/format_reward": 1.0, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.66536712646484, |
| "epoch": 3.375, |
| "grad_norm": 1.8815482668618946, |
| "kl": 0.034423828125, |
| "learning_rate": 6.6875e-07, |
| "loss": 0.0015, |
| "reward": 1.869466781616211, |
| "reward_std": 0.025683503597974777, |
| "rewards/accuracy_reward": 0.8694667816162109, |
| "rewards/format_reward": 1.0, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.4375, |
| "epoch": 3.3958333333333335, |
| "grad_norm": 3.6534900968467423, |
| "kl": 0.036376953125, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.8859853744506836, |
| "reward_std": 0.025588493794202805, |
| "rewards/accuracy_reward": 0.8859855532646179, |
| "rewards/format_reward": 1.0, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.21875, |
| "epoch": 3.4166666666666665, |
| "grad_norm": 2.8670572231969214, |
| "kl": 0.0361328125, |
| "learning_rate": 6.645833333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8808234930038452, |
| "reward_std": 0.025275420397520065, |
| "rewards/accuracy_reward": 0.8808236122131348, |
| "rewards/format_reward": 1.0, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.73567962646484, |
| "epoch": 3.4375, |
| "grad_norm": 1.291526238821269, |
| "kl": 0.033203125, |
| "learning_rate": 6.624999999999999e-07, |
| "loss": 0.0015, |
| "reward": 1.8656648397445679, |
| "reward_std": 0.02325437031686306, |
| "rewards/accuracy_reward": 0.8656649589538574, |
| "rewards/format_reward": 1.0, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.76953125, |
| "epoch": 3.4583333333333335, |
| "grad_norm": 3.2481258355141716, |
| "kl": 0.034423828125, |
| "learning_rate": 6.604166666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.8807281255722046, |
| "reward_std": 0.023684537038207054, |
| "rewards/accuracy_reward": 0.8807281255722046, |
| "rewards/format_reward": 1.0, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.98046875, |
| "epoch": 3.4791666666666665, |
| "grad_norm": 1.913896791712438, |
| "kl": 0.034912109375, |
| "learning_rate": 6.583333333333333e-07, |
| "loss": 0.0015, |
| "reward": 1.8801133632659912, |
| "reward_std": 0.020142659544944763, |
| "rewards/accuracy_reward": 0.8801132440567017, |
| "rewards/format_reward": 1.0, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.25651550292969, |
| "epoch": 3.5, |
| "grad_norm": 1.5822819853292256, |
| "kl": 0.037109375, |
| "learning_rate": 6.5625e-07, |
| "loss": 0.0016, |
| "reward": 1.8631795644760132, |
| "reward_std": 0.023415524512529373, |
| "rewards/accuracy_reward": 0.8631795644760132, |
| "rewards/format_reward": 1.0, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.02083587646484, |
| "epoch": 3.5208333333333335, |
| "grad_norm": 1.4328687070061792, |
| "kl": 0.0341796875, |
| "learning_rate": 6.541666666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.8415846824645996, |
| "reward_std": 0.022014908492565155, |
| "rewards/accuracy_reward": 0.8415846824645996, |
| "rewards/format_reward": 1.0, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.24609375, |
| "epoch": 3.5416666666666665, |
| "grad_norm": 2.6519462272626697, |
| "kl": 0.033447265625, |
| "learning_rate": 6.520833333333333e-07, |
| "loss": 0.0014, |
| "reward": 1.8888130187988281, |
| "reward_std": 0.0215926356613636, |
| "rewards/accuracy_reward": 0.8888130187988281, |
| "rewards/format_reward": 1.0, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.84375, |
| "epoch": 3.5625, |
| "grad_norm": 1.7631289618422423, |
| "kl": 0.032958984375, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0015, |
| "reward": 1.8750677108764648, |
| "reward_std": 0.023012561723589897, |
| "rewards/accuracy_reward": 0.8750675916671753, |
| "rewards/format_reward": 1.0, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.60026550292969, |
| "epoch": 3.5833333333333335, |
| "grad_norm": 2.0388719106728863, |
| "kl": 0.0361328125, |
| "learning_rate": 6.479166666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.8989547491073608, |
| "reward_std": 0.024981368333101273, |
| "rewards/accuracy_reward": 0.8989547491073608, |
| "rewards/format_reward": 1.0, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.02864837646484, |
| "epoch": 3.6041666666666665, |
| "grad_norm": 3.0894511291299485, |
| "kl": 0.03369140625, |
| "learning_rate": 6.458333333333333e-07, |
| "loss": 0.0014, |
| "reward": 1.8803495168685913, |
| "reward_std": 0.024355322122573853, |
| "rewards/accuracy_reward": 0.8803495168685913, |
| "rewards/format_reward": 1.0, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.703125, |
| "epoch": 3.625, |
| "grad_norm": 1.6763141295980748, |
| "kl": 0.03662109375, |
| "learning_rate": 6.4375e-07, |
| "loss": 0.0016, |
| "reward": 1.8981932401657104, |
| "reward_std": 0.0216450747102499, |
| "rewards/accuracy_reward": 0.898193359375, |
| "rewards/format_reward": 1.0, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.81901550292969, |
| "epoch": 3.6458333333333335, |
| "grad_norm": 1.5197798990089235, |
| "kl": 0.03369140625, |
| "learning_rate": 6.416666666666667e-07, |
| "loss": 0.0015, |
| "reward": 1.8727521896362305, |
| "reward_std": 0.02708452008664608, |
| "rewards/accuracy_reward": 0.87275230884552, |
| "rewards/format_reward": 1.0, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.34245300292969, |
| "epoch": 3.6666666666666665, |
| "grad_norm": 4.145991173961336, |
| "kl": 0.049072265625, |
| "learning_rate": 6.395833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9041762351989746, |
| "reward_std": 0.024535808712244034, |
| "rewards/accuracy_reward": 0.9041762351989746, |
| "rewards/format_reward": 1.0, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.91667175292969, |
| "epoch": 3.6875, |
| "grad_norm": 3.626404435362734, |
| "kl": 0.038330078125, |
| "learning_rate": 6.374999999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8905611038208008, |
| "reward_std": 0.02579084411263466, |
| "rewards/accuracy_reward": 0.891863226890564, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.83464050292969, |
| "epoch": 3.7083333333333335, |
| "grad_norm": 1.4575692565981726, |
| "kl": 0.037841796875, |
| "learning_rate": 6.354166666666666e-07, |
| "loss": 0.0015, |
| "reward": 1.9227180480957031, |
| "reward_std": 0.02236122451722622, |
| "rewards/accuracy_reward": 0.9227181673049927, |
| "rewards/format_reward": 1.0, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.88542175292969, |
| "epoch": 3.7291666666666665, |
| "grad_norm": 6.573675951265146, |
| "kl": 0.040771484375, |
| "learning_rate": 6.333333333333332e-07, |
| "loss": 0.0018, |
| "reward": 1.8883719444274902, |
| "reward_std": 0.023411914706230164, |
| "rewards/accuracy_reward": 0.8883718848228455, |
| "rewards/format_reward": 1.0, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.8828125, |
| "epoch": 3.75, |
| "grad_norm": 1.3628527356709195, |
| "kl": 0.038818359375, |
| "learning_rate": 6.3125e-07, |
| "loss": 0.0016, |
| "reward": 1.8975229263305664, |
| "reward_std": 0.022585680708289146, |
| "rewards/accuracy_reward": 0.8975229263305664, |
| "rewards/format_reward": 1.0, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.2890625, |
| "epoch": 3.7708333333333335, |
| "grad_norm": 1.7649706449204177, |
| "kl": 0.04052734375, |
| "learning_rate": 6.291666666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.8944144248962402, |
| "reward_std": 0.02505827508866787, |
| "rewards/accuracy_reward": 0.8944144248962402, |
| "rewards/format_reward": 1.0, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.35546875, |
| "epoch": 3.7916666666666665, |
| "grad_norm": 2.198651075477608, |
| "kl": 0.039306640625, |
| "learning_rate": 6.270833333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.891271948814392, |
| "reward_std": 0.028914332389831543, |
| "rewards/accuracy_reward": 0.8938760757446289, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.40885925292969, |
| "epoch": 3.8125, |
| "grad_norm": 10.660708725864556, |
| "kl": 0.041748046875, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.0017, |
| "reward": 1.8875398635864258, |
| "reward_std": 0.024141697213053703, |
| "rewards/accuracy_reward": 0.887539803981781, |
| "rewards/format_reward": 1.0, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.68229675292969, |
| "epoch": 3.8333333333333335, |
| "grad_norm": 2.2321500902799647, |
| "kl": 0.04345703125, |
| "learning_rate": 6.229166666666666e-07, |
| "loss": 0.0018, |
| "reward": 1.8584728240966797, |
| "reward_std": 0.03329972177743912, |
| "rewards/accuracy_reward": 0.8623790740966797, |
| "rewards/format_reward": 0.99609375, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.95442962646484, |
| "epoch": 3.8541666666666665, |
| "grad_norm": 1.414758072615096, |
| "kl": 0.038818359375, |
| "learning_rate": 6.208333333333334e-07, |
| "loss": 0.0016, |
| "reward": 1.8576724529266357, |
| "reward_std": 0.03029349073767662, |
| "rewards/accuracy_reward": 0.8602765798568726, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.26171875, |
| "epoch": 3.875, |
| "grad_norm": 2.767294776217013, |
| "kl": 0.037353515625, |
| "learning_rate": 6.1875e-07, |
| "loss": 0.0015, |
| "reward": 1.8842320442199707, |
| "reward_std": 0.026229776442050934, |
| "rewards/accuracy_reward": 0.8842320442199707, |
| "rewards/format_reward": 1.0, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.26302337646484, |
| "epoch": 3.8958333333333335, |
| "grad_norm": 2.0069160206563423, |
| "kl": 0.04541015625, |
| "learning_rate": 6.166666666666667e-07, |
| "loss": 0.0018, |
| "reward": 1.906886100769043, |
| "reward_std": 0.02242848090827465, |
| "rewards/accuracy_reward": 0.9068862795829773, |
| "rewards/format_reward": 1.0, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.28515625, |
| "epoch": 3.9166666666666665, |
| "grad_norm": 2.9167012710524416, |
| "kl": 0.041015625, |
| "learning_rate": 6.145833333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.8881382942199707, |
| "reward_std": 0.027306437492370605, |
| "rewards/accuracy_reward": 0.8881382942199707, |
| "rewards/format_reward": 1.0, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.50260925292969, |
| "epoch": 3.9375, |
| "grad_norm": 2.5894313089017706, |
| "kl": 0.03955078125, |
| "learning_rate": 6.125000000000001e-07, |
| "loss": 0.0016, |
| "reward": 1.8834333419799805, |
| "reward_std": 0.02489401400089264, |
| "rewards/accuracy_reward": 0.8834332227706909, |
| "rewards/format_reward": 1.0, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.67578125, |
| "epoch": 3.9583333333333335, |
| "grad_norm": 2.8476363151522173, |
| "kl": 0.039794921875, |
| "learning_rate": 6.104166666666667e-07, |
| "loss": 0.0017, |
| "reward": 1.8963178396224976, |
| "reward_std": 0.02300976775586605, |
| "rewards/accuracy_reward": 0.8963178992271423, |
| "rewards/format_reward": 1.0, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.1749038696289, |
| "epoch": 3.9791666666666665, |
| "grad_norm": 102.50025138985052, |
| "kl": 0.9296875, |
| "learning_rate": 6.083333333333333e-07, |
| "loss": 0.0374, |
| "reward": 1.8949506282806396, |
| "reward_std": 0.03087977133691311, |
| "rewards/accuracy_reward": 0.8962857723236084, |
| "rewards/format_reward": 0.998664915561676, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.61458587646484, |
| "epoch": 4.020833333333333, |
| "grad_norm": 2.4242741219191553, |
| "kl": 0.03857421875, |
| "learning_rate": 6.062499999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8952863216400146, |
| "reward_std": 0.025780394673347473, |
| "rewards/accuracy_reward": 0.8952862620353699, |
| "rewards/format_reward": 1.0, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.21745300292969, |
| "epoch": 4.041666666666667, |
| "grad_norm": 1.3513636521416783, |
| "kl": 0.046630859375, |
| "learning_rate": 6.041666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9090301990509033, |
| "reward_std": 0.029132381081581116, |
| "rewards/accuracy_reward": 0.9090301394462585, |
| "rewards/format_reward": 1.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.36328125, |
| "epoch": 4.0625, |
| "grad_norm": 2.0726251594251166, |
| "kl": 0.051513671875, |
| "learning_rate": 6.020833333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.8756507635116577, |
| "reward_std": 0.027445685118436813, |
| "rewards/accuracy_reward": 0.8756507635116577, |
| "rewards/format_reward": 1.0, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.05859375, |
| "epoch": 4.083333333333333, |
| "grad_norm": 1.5875132735828674, |
| "kl": 0.056396484375, |
| "learning_rate": 6e-07, |
| "loss": 0.0023, |
| "reward": 1.8988969326019287, |
| "reward_std": 0.03272823989391327, |
| "rewards/accuracy_reward": 0.9015010595321655, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.86458587646484, |
| "epoch": 4.104166666666667, |
| "grad_norm": 4.44164388575917, |
| "kl": 0.06201171875, |
| "learning_rate": 5.979166666666666e-07, |
| "loss": 0.0026, |
| "reward": 1.8918952941894531, |
| "reward_std": 0.028478458523750305, |
| "rewards/accuracy_reward": 0.8918952941894531, |
| "rewards/format_reward": 1.0, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.47135925292969, |
| "epoch": 4.125, |
| "grad_norm": 1.8931652214160692, |
| "kl": 0.08251953125, |
| "learning_rate": 5.958333333333333e-07, |
| "loss": 0.0034, |
| "reward": 1.877640724182129, |
| "reward_std": 0.03283580765128136, |
| "rewards/accuracy_reward": 0.8789429664611816, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.86458587646484, |
| "epoch": 4.145833333333333, |
| "grad_norm": 1.4398266478518003, |
| "kl": 0.06884765625, |
| "learning_rate": 5.937499999999999e-07, |
| "loss": 0.0028, |
| "reward": 1.893932580947876, |
| "reward_std": 0.023818641901016235, |
| "rewards/accuracy_reward": 0.8939325213432312, |
| "rewards/format_reward": 1.0, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.59765625, |
| "epoch": 4.166666666666667, |
| "grad_norm": 1.9079500637571045, |
| "kl": 0.057861328125, |
| "learning_rate": 5.916666666666667e-07, |
| "loss": 0.0024, |
| "reward": 1.8989866971969604, |
| "reward_std": 0.029820134863257408, |
| "rewards/accuracy_reward": 0.9002887606620789, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.17317962646484, |
| "epoch": 4.1875, |
| "grad_norm": 1.6892092308161555, |
| "kl": 0.05419921875, |
| "learning_rate": 5.895833333333333e-07, |
| "loss": 0.0022, |
| "reward": 1.9026626348495483, |
| "reward_std": 0.023554224520921707, |
| "rewards/accuracy_reward": 0.9026626348495483, |
| "rewards/format_reward": 1.0, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.45182800292969, |
| "epoch": 4.208333333333333, |
| "grad_norm": 2.0094232681652513, |
| "kl": 0.05126953125, |
| "learning_rate": 5.875e-07, |
| "loss": 0.0021, |
| "reward": 1.9009517431259155, |
| "reward_std": 0.026670873165130615, |
| "rewards/accuracy_reward": 0.9009518027305603, |
| "rewards/format_reward": 1.0, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.99609375, |
| "epoch": 4.229166666666667, |
| "grad_norm": 1.4660353955474104, |
| "kl": 0.045654296875, |
| "learning_rate": 5.854166666666666e-07, |
| "loss": 0.0018, |
| "reward": 1.8858369588851929, |
| "reward_std": 0.02353881672024727, |
| "rewards/accuracy_reward": 0.8858367800712585, |
| "rewards/format_reward": 1.0, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.44792175292969, |
| "epoch": 4.25, |
| "grad_norm": 1.9832423612020598, |
| "kl": 0.04833984375, |
| "learning_rate": 5.833333333333334e-07, |
| "loss": 0.002, |
| "reward": 1.9034093618392944, |
| "reward_std": 0.024380242452025414, |
| "rewards/accuracy_reward": 0.9034093618392944, |
| "rewards/format_reward": 1.0, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.12890625, |
| "epoch": 4.270833333333333, |
| "grad_norm": 1.9973956888778517, |
| "kl": 0.04541015625, |
| "learning_rate": 5.8125e-07, |
| "loss": 0.0019, |
| "reward": 1.918021559715271, |
| "reward_std": 0.01975633203983307, |
| "rewards/accuracy_reward": 0.9180216789245605, |
| "rewards/format_reward": 1.0, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.20182800292969, |
| "epoch": 4.291666666666667, |
| "grad_norm": 1.9799525839497707, |
| "kl": 0.046875, |
| "learning_rate": 5.791666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.903671145439148, |
| "reward_std": 0.02328427881002426, |
| "rewards/accuracy_reward": 0.903671145439148, |
| "rewards/format_reward": 1.0, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.953125, |
| "epoch": 4.3125, |
| "grad_norm": 2.1041072216648176, |
| "kl": 0.0439453125, |
| "learning_rate": 5.770833333333332e-07, |
| "loss": 0.0018, |
| "reward": 1.8897308111190796, |
| "reward_std": 0.02862635999917984, |
| "rewards/accuracy_reward": 0.891032874584198, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.08203125, |
| "epoch": 4.333333333333333, |
| "grad_norm": 1.5204073340174282, |
| "kl": 0.039794921875, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8759989738464355, |
| "reward_std": 0.02895565889775753, |
| "rewards/accuracy_reward": 0.8759989738464355, |
| "rewards/format_reward": 1.0, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.44010925292969, |
| "epoch": 4.354166666666667, |
| "grad_norm": 2.001436237063012, |
| "kl": 0.046142578125, |
| "learning_rate": 5.729166666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.8900182247161865, |
| "reward_std": 0.02687385492026806, |
| "rewards/accuracy_reward": 0.8913201689720154, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.47396087646484, |
| "epoch": 4.375, |
| "grad_norm": 3.6969174457282366, |
| "kl": 0.041015625, |
| "learning_rate": 5.708333333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.8979003429412842, |
| "reward_std": 0.0217414703220129, |
| "rewards/accuracy_reward": 0.8979001045227051, |
| "rewards/format_reward": 1.0, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.609375, |
| "epoch": 4.395833333333333, |
| "grad_norm": 2.0011010687642816, |
| "kl": 0.0400390625, |
| "learning_rate": 5.6875e-07, |
| "loss": 0.0017, |
| "reward": 1.890995740890503, |
| "reward_std": 0.025938181206583977, |
| "rewards/accuracy_reward": 0.8909956812858582, |
| "rewards/format_reward": 1.0, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.12890625, |
| "epoch": 4.416666666666667, |
| "grad_norm": 1.2721197172111447, |
| "kl": 0.041259765625, |
| "learning_rate": 5.666666666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.8735730648040771, |
| "reward_std": 0.02578229270875454, |
| "rewards/accuracy_reward": 0.8735730051994324, |
| "rewards/format_reward": 1.0, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.66667175292969, |
| "epoch": 4.4375, |
| "grad_norm": 2.5188213924760667, |
| "kl": 0.07958984375, |
| "learning_rate": 5.645833333333333e-07, |
| "loss": 0.0032, |
| "reward": 1.8987796306610107, |
| "reward_std": 0.025997933000326157, |
| "rewards/accuracy_reward": 0.898779571056366, |
| "rewards/format_reward": 1.0, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.171875, |
| "epoch": 4.458333333333333, |
| "grad_norm": 2.1506836862902876, |
| "kl": 0.03515625, |
| "learning_rate": 5.625e-07, |
| "loss": 0.0015, |
| "reward": 1.891427993774414, |
| "reward_std": 0.02087043598294258, |
| "rewards/accuracy_reward": 0.8914279937744141, |
| "rewards/format_reward": 1.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.43880462646484, |
| "epoch": 4.479166666666667, |
| "grad_norm": 1.589375389211378, |
| "kl": 0.033203125, |
| "learning_rate": 5.604166666666667e-07, |
| "loss": 0.0014, |
| "reward": 1.8716447353363037, |
| "reward_std": 0.023239165544509888, |
| "rewards/accuracy_reward": 0.8716444969177246, |
| "rewards/format_reward": 1.0, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.87890625, |
| "epoch": 4.5, |
| "grad_norm": 2.9907672143516737, |
| "kl": 0.038330078125, |
| "learning_rate": 5.583333333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.8827204704284668, |
| "reward_std": 0.028304576873779297, |
| "rewards/accuracy_reward": 0.8827204704284668, |
| "rewards/format_reward": 1.0, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.86849212646484, |
| "epoch": 4.520833333333333, |
| "grad_norm": 2.551740129842165, |
| "kl": 0.0390625, |
| "learning_rate": 5.5625e-07, |
| "loss": 0.0016, |
| "reward": 1.888314127922058, |
| "reward_std": 0.02232741191983223, |
| "rewards/accuracy_reward": 0.8883141279220581, |
| "rewards/format_reward": 1.0, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.05989837646484, |
| "epoch": 4.541666666666667, |
| "grad_norm": 4.368554014866076, |
| "kl": 0.044921875, |
| "learning_rate": 5.541666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9319076538085938, |
| "reward_std": 0.020289087668061256, |
| "rewards/accuracy_reward": 0.9319076538085938, |
| "rewards/format_reward": 1.0, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.23828125, |
| "epoch": 4.5625, |
| "grad_norm": 1.4440015070439494, |
| "kl": 0.035888671875, |
| "learning_rate": 5.520833333333334e-07, |
| "loss": 0.0016, |
| "reward": 1.852222204208374, |
| "reward_std": 0.030050039291381836, |
| "rewards/accuracy_reward": 0.8535243272781372, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.00911712646484, |
| "epoch": 4.583333333333333, |
| "grad_norm": 2.12920095214347, |
| "kl": 0.038818359375, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0016, |
| "reward": 1.8882272243499756, |
| "reward_std": 0.024995621293783188, |
| "rewards/accuracy_reward": 0.8882272243499756, |
| "rewards/format_reward": 1.0, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.91146087646484, |
| "epoch": 4.604166666666667, |
| "grad_norm": 3.2550572641319016, |
| "kl": 0.03955078125, |
| "learning_rate": 5.479166666666667e-07, |
| "loss": 0.0016, |
| "reward": 1.9003304243087769, |
| "reward_std": 0.023031365126371384, |
| "rewards/accuracy_reward": 0.9003303050994873, |
| "rewards/format_reward": 1.0, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.77474212646484, |
| "epoch": 4.625, |
| "grad_norm": 1.4248440110140324, |
| "kl": 0.039794921875, |
| "learning_rate": 5.458333333333332e-07, |
| "loss": 0.0016, |
| "reward": 1.873998999595642, |
| "reward_std": 0.02907104603946209, |
| "rewards/accuracy_reward": 0.8753010630607605, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.58984375, |
| "epoch": 4.645833333333333, |
| "grad_norm": 2.6245139424544983, |
| "kl": 0.03857421875, |
| "learning_rate": 5.4375e-07, |
| "loss": 0.0016, |
| "reward": 1.9035629034042358, |
| "reward_std": 0.022365760058164597, |
| "rewards/accuracy_reward": 0.9035629034042358, |
| "rewards/format_reward": 1.0, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.59896087646484, |
| "epoch": 4.666666666666667, |
| "grad_norm": 1.2926097030348538, |
| "kl": 0.03955078125, |
| "learning_rate": 5.416666666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.890451192855835, |
| "reward_std": 0.025147125124931335, |
| "rewards/accuracy_reward": 0.8904510736465454, |
| "rewards/format_reward": 1.0, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.26432800292969, |
| "epoch": 4.6875, |
| "grad_norm": 1.2010688063099724, |
| "kl": 0.03955078125, |
| "learning_rate": 5.395833333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.865971326828003, |
| "reward_std": 0.030050549656152725, |
| "rewards/accuracy_reward": 0.8672735095024109, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.88932800292969, |
| "epoch": 4.708333333333333, |
| "grad_norm": 1.4851010437040935, |
| "kl": 0.0380859375, |
| "learning_rate": 5.374999999999999e-07, |
| "loss": 0.0016, |
| "reward": 1.8917864561080933, |
| "reward_std": 0.021093130111694336, |
| "rewards/accuracy_reward": 0.8917864561080933, |
| "rewards/format_reward": 1.0, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.63932800292969, |
| "epoch": 4.729166666666667, |
| "grad_norm": 1.774445050351076, |
| "kl": 0.041748046875, |
| "learning_rate": 5.354166666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.8998801708221436, |
| "reward_std": 0.025802936404943466, |
| "rewards/accuracy_reward": 0.9011821150779724, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.80599212646484, |
| "epoch": 4.75, |
| "grad_norm": 1.9122212791313642, |
| "kl": 0.04150390625, |
| "learning_rate": 5.333333333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.898808479309082, |
| "reward_std": 0.02279862016439438, |
| "rewards/accuracy_reward": 0.898808479309082, |
| "rewards/format_reward": 1.0, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.49609375, |
| "epoch": 4.770833333333333, |
| "grad_norm": 1.8595252737006025, |
| "kl": 0.0400390625, |
| "learning_rate": 5.3125e-07, |
| "loss": 0.0016, |
| "reward": 1.8906606435775757, |
| "reward_std": 0.021603485569357872, |
| "rewards/accuracy_reward": 0.8906607031822205, |
| "rewards/format_reward": 1.0, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.00260925292969, |
| "epoch": 4.791666666666667, |
| "grad_norm": 3.9925980009816278, |
| "kl": 0.04833984375, |
| "learning_rate": 5.291666666666666e-07, |
| "loss": 0.002, |
| "reward": 1.896075963973999, |
| "reward_std": 0.02797180414199829, |
| "rewards/accuracy_reward": 0.8973779678344727, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.4609375, |
| "epoch": 4.8125, |
| "grad_norm": 3.455468575777609, |
| "kl": 0.052001953125, |
| "learning_rate": 5.270833333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9048645496368408, |
| "reward_std": 0.02417534589767456, |
| "rewards/accuracy_reward": 0.9048646688461304, |
| "rewards/format_reward": 1.0, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.59765625, |
| "epoch": 4.833333333333333, |
| "grad_norm": 1.5619005634713135, |
| "kl": 0.042236328125, |
| "learning_rate": 5.25e-07, |
| "loss": 0.0017, |
| "reward": 1.905022382736206, |
| "reward_std": 0.021135296672582626, |
| "rewards/accuracy_reward": 0.905022144317627, |
| "rewards/format_reward": 1.0, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.34245300292969, |
| "epoch": 4.854166666666667, |
| "grad_norm": 1.8685437866312475, |
| "kl": 0.048095703125, |
| "learning_rate": 5.229166666666667e-07, |
| "loss": 0.002, |
| "reward": 1.89357590675354, |
| "reward_std": 0.025921311229467392, |
| "rewards/accuracy_reward": 0.8948779106140137, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.43620300292969, |
| "epoch": 4.875, |
| "grad_norm": 5.452442844003646, |
| "kl": 0.043212890625, |
| "learning_rate": 5.208333333333334e-07, |
| "loss": 0.0018, |
| "reward": 1.911612868309021, |
| "reward_std": 0.024873455986380577, |
| "rewards/accuracy_reward": 0.9129147529602051, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.40495300292969, |
| "epoch": 4.895833333333333, |
| "grad_norm": 2.179930419928341, |
| "kl": 0.04296875, |
| "learning_rate": 5.1875e-07, |
| "loss": 0.0017, |
| "reward": 1.8822953701019287, |
| "reward_std": 0.027776187285780907, |
| "rewards/accuracy_reward": 0.8822951316833496, |
| "rewards/format_reward": 1.0, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.78646087646484, |
| "epoch": 4.916666666666667, |
| "grad_norm": 5.555743499581958, |
| "kl": 0.04931640625, |
| "learning_rate": 5.166666666666667e-07, |
| "loss": 0.0021, |
| "reward": 1.9162639379501343, |
| "reward_std": 0.021101072430610657, |
| "rewards/accuracy_reward": 0.9162638187408447, |
| "rewards/format_reward": 1.0, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.9765625, |
| "epoch": 4.9375, |
| "grad_norm": 1.457940457573027, |
| "kl": 0.03955078125, |
| "learning_rate": 5.145833333333332e-07, |
| "loss": 0.0016, |
| "reward": 1.9340026378631592, |
| "reward_std": 0.020655512809753418, |
| "rewards/accuracy_reward": 0.9340025782585144, |
| "rewards/format_reward": 1.0, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.7734375, |
| "epoch": 4.958333333333333, |
| "grad_norm": 1.4365694122319177, |
| "kl": 0.04345703125, |
| "learning_rate": 5.125e-07, |
| "loss": 0.0018, |
| "reward": 1.8954408168792725, |
| "reward_std": 0.02152765914797783, |
| "rewards/accuracy_reward": 0.8954406976699829, |
| "rewards/format_reward": 1.0, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.25367736816406, |
| "epoch": 4.979166666666667, |
| "grad_norm": 1.734243537392683, |
| "kl": 0.04248046875, |
| "learning_rate": 5.104166666666666e-07, |
| "loss": 0.0017, |
| "reward": 1.9107515811920166, |
| "reward_std": 0.018960019573569298, |
| "rewards/accuracy_reward": 0.9107515811920166, |
| "rewards/format_reward": 1.0, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.74739837646484, |
| "epoch": 5.020833333333333, |
| "grad_norm": 1.3870009697213672, |
| "kl": 0.04833984375, |
| "learning_rate": 5.083333333333333e-07, |
| "loss": 0.002, |
| "reward": 1.8969802856445312, |
| "reward_std": 0.029202213510870934, |
| "rewards/accuracy_reward": 0.8982824087142944, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.33464050292969, |
| "epoch": 5.041666666666667, |
| "grad_norm": 1.6927816472717638, |
| "kl": 0.0439453125, |
| "learning_rate": 5.062499999999999e-07, |
| "loss": 0.0018, |
| "reward": 1.9044766426086426, |
| "reward_std": 0.02422555536031723, |
| "rewards/accuracy_reward": 0.9044766426086426, |
| "rewards/format_reward": 1.0, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.39714050292969, |
| "epoch": 5.0625, |
| "grad_norm": 2.3914909546196594, |
| "kl": 0.04248046875, |
| "learning_rate": 5.041666666666667e-07, |
| "loss": 0.0018, |
| "reward": 1.9420578479766846, |
| "reward_std": 0.020115545019507408, |
| "rewards/accuracy_reward": 0.9420577883720398, |
| "rewards/format_reward": 1.0, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.46224212646484, |
| "epoch": 5.083333333333333, |
| "grad_norm": 2.834307847073615, |
| "kl": 0.046630859375, |
| "learning_rate": 5.020833333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9152748584747314, |
| "reward_std": 0.02309068851172924, |
| "rewards/accuracy_reward": 0.9152747392654419, |
| "rewards/format_reward": 1.0, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.2265625, |
| "epoch": 5.104166666666667, |
| "grad_norm": 2.1094761559134816, |
| "kl": 0.04052734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0017, |
| "reward": 1.8892682790756226, |
| "reward_std": 0.023487474769353867, |
| "rewards/accuracy_reward": 0.8892682790756226, |
| "rewards/format_reward": 1.0, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.31380462646484, |
| "epoch": 5.125, |
| "grad_norm": 1.4100911159096023, |
| "kl": 0.039306640625, |
| "learning_rate": 4.979166666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.8915185928344727, |
| "reward_std": 0.023427218198776245, |
| "rewards/accuracy_reward": 0.891518771648407, |
| "rewards/format_reward": 1.0, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.84114837646484, |
| "epoch": 5.145833333333333, |
| "grad_norm": 1.6579016766504264, |
| "kl": 0.04052734375, |
| "learning_rate": 4.958333333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.9029381275177002, |
| "reward_std": 0.023404449224472046, |
| "rewards/accuracy_reward": 0.9029380083084106, |
| "rewards/format_reward": 1.0, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.58333587646484, |
| "epoch": 5.166666666666667, |
| "grad_norm": 2.4190316509927547, |
| "kl": 0.0439453125, |
| "learning_rate": 4.9375e-07, |
| "loss": 0.0018, |
| "reward": 1.9102628231048584, |
| "reward_std": 0.02647389844059944, |
| "rewards/accuracy_reward": 0.9115647077560425, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.34245300292969, |
| "epoch": 5.1875, |
| "grad_norm": 1.6788038142728927, |
| "kl": 0.0390625, |
| "learning_rate": 4.916666666666666e-07, |
| "loss": 0.0016, |
| "reward": 1.8926337957382202, |
| "reward_std": 0.022663813084363937, |
| "rewards/accuracy_reward": 0.8926336765289307, |
| "rewards/format_reward": 1.0, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.6640625, |
| "epoch": 5.208333333333333, |
| "grad_norm": 2.1053897766649934, |
| "kl": 0.041015625, |
| "learning_rate": 4.895833333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.9033950567245483, |
| "reward_std": 0.024107707664370537, |
| "rewards/accuracy_reward": 0.9033951163291931, |
| "rewards/format_reward": 1.0, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.30599212646484, |
| "epoch": 5.229166666666667, |
| "grad_norm": 8.173004175170574, |
| "kl": 0.042236328125, |
| "learning_rate": 4.875e-07, |
| "loss": 0.0017, |
| "reward": 1.8888883590698242, |
| "reward_std": 0.028628483414649963, |
| "rewards/accuracy_reward": 0.8901904821395874, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.86328125, |
| "epoch": 5.25, |
| "grad_norm": 2.028487749549242, |
| "kl": 0.044921875, |
| "learning_rate": 4.854166666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9012870788574219, |
| "reward_std": 0.02197723090648651, |
| "rewards/accuracy_reward": 0.9012872576713562, |
| "rewards/format_reward": 1.0, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.8125, |
| "epoch": 5.270833333333333, |
| "grad_norm": 2.0303516292175523, |
| "kl": 0.03955078125, |
| "learning_rate": 4.833333333333333e-07, |
| "loss": 0.0016, |
| "reward": 1.8865455389022827, |
| "reward_std": 0.025701235979795456, |
| "rewards/accuracy_reward": 0.8865455985069275, |
| "rewards/format_reward": 1.0, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.77604675292969, |
| "epoch": 5.291666666666667, |
| "grad_norm": 1.5986544250252577, |
| "kl": 0.040771484375, |
| "learning_rate": 4.812499999999999e-07, |
| "loss": 0.0017, |
| "reward": 1.8878264427185059, |
| "reward_std": 0.023908209055662155, |
| "rewards/accuracy_reward": 0.8878263831138611, |
| "rewards/format_reward": 1.0, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.72005462646484, |
| "epoch": 5.3125, |
| "grad_norm": 2.0564085146093825, |
| "kl": 0.04541015625, |
| "learning_rate": 4.791666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.9011871814727783, |
| "reward_std": 0.026990963146090508, |
| "rewards/accuracy_reward": 0.902489185333252, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.52604675292969, |
| "epoch": 5.333333333333333, |
| "grad_norm": 2.637539554946258, |
| "kl": 0.046875, |
| "learning_rate": 4.770833333333334e-07, |
| "loss": 0.0019, |
| "reward": 1.904599666595459, |
| "reward_std": 0.02259230427443981, |
| "rewards/accuracy_reward": 0.9059017896652222, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.51823425292969, |
| "epoch": 5.354166666666667, |
| "grad_norm": 2.0291093041211545, |
| "kl": 0.043701171875, |
| "learning_rate": 4.7499999999999995e-07, |
| "loss": 0.0018, |
| "reward": 1.9127092361450195, |
| "reward_std": 0.02183235064148903, |
| "rewards/accuracy_reward": 0.9127092361450195, |
| "rewards/format_reward": 1.0, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.5234375, |
| "epoch": 5.375, |
| "grad_norm": 4.505797454206583, |
| "kl": 0.04345703125, |
| "learning_rate": 4.7291666666666666e-07, |
| "loss": 0.0018, |
| "reward": 1.9092210531234741, |
| "reward_std": 0.021370170637965202, |
| "rewards/accuracy_reward": 0.9092210531234741, |
| "rewards/format_reward": 1.0, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.19661712646484, |
| "epoch": 5.395833333333333, |
| "grad_norm": 1.8902834610451218, |
| "kl": 0.044677734375, |
| "learning_rate": 4.708333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.917715311050415, |
| "reward_std": 0.018400993198156357, |
| "rewards/accuracy_reward": 0.9177150726318359, |
| "rewards/format_reward": 1.0, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.390625, |
| "epoch": 5.416666666666667, |
| "grad_norm": 2.563147653109146, |
| "kl": 0.0458984375, |
| "learning_rate": 4.6874999999999996e-07, |
| "loss": 0.0019, |
| "reward": 1.8932042121887207, |
| "reward_std": 0.021255169063806534, |
| "rewards/accuracy_reward": 0.8932042121887207, |
| "rewards/format_reward": 1.0, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.11067962646484, |
| "epoch": 5.4375, |
| "grad_norm": 4.520917096171478, |
| "kl": 0.042236328125, |
| "learning_rate": 4.6666666666666666e-07, |
| "loss": 0.0018, |
| "reward": 1.9107736349105835, |
| "reward_std": 0.020730838179588318, |
| "rewards/accuracy_reward": 0.910773515701294, |
| "rewards/format_reward": 1.0, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.9140625, |
| "epoch": 5.458333333333333, |
| "grad_norm": 2.9043788526253866, |
| "kl": 0.046875, |
| "learning_rate": 4.645833333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.932241439819336, |
| "reward_std": 0.019916361197829247, |
| "rewards/accuracy_reward": 0.932241678237915, |
| "rewards/format_reward": 1.0, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.98698425292969, |
| "epoch": 5.479166666666667, |
| "grad_norm": 4.137371389224913, |
| "kl": 0.04443359375, |
| "learning_rate": 4.625e-07, |
| "loss": 0.0018, |
| "reward": 1.8949460983276367, |
| "reward_std": 0.021680889651179314, |
| "rewards/accuracy_reward": 0.8949460983276367, |
| "rewards/format_reward": 1.0, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.06120300292969, |
| "epoch": 5.5, |
| "grad_norm": 1.5956956245046428, |
| "kl": 0.046875, |
| "learning_rate": 4.604166666666666e-07, |
| "loss": 0.002, |
| "reward": 1.9305871725082397, |
| "reward_std": 0.016989264637231827, |
| "rewards/accuracy_reward": 0.9305871725082397, |
| "rewards/format_reward": 1.0, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.46875, |
| "epoch": 5.520833333333333, |
| "grad_norm": 2.3801100164721527, |
| "kl": 0.05126953125, |
| "learning_rate": 4.5833333333333327e-07, |
| "loss": 0.0021, |
| "reward": 1.9142370223999023, |
| "reward_std": 0.020726464688777924, |
| "rewards/accuracy_reward": 0.9142370223999023, |
| "rewards/format_reward": 1.0, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.06380462646484, |
| "epoch": 5.541666666666667, |
| "grad_norm": 1.7313384895006823, |
| "kl": 0.051025390625, |
| "learning_rate": 4.5624999999999997e-07, |
| "loss": 0.0021, |
| "reward": 1.8905744552612305, |
| "reward_std": 0.02504381351172924, |
| "rewards/accuracy_reward": 0.8905746340751648, |
| "rewards/format_reward": 1.0, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.88151550292969, |
| "epoch": 5.5625, |
| "grad_norm": 1.5486364631929457, |
| "kl": 0.046875, |
| "learning_rate": 4.541666666666666e-07, |
| "loss": 0.002, |
| "reward": 1.8817743062973022, |
| "reward_std": 0.02400146797299385, |
| "rewards/accuracy_reward": 0.8817743062973022, |
| "rewards/format_reward": 1.0, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.19921875, |
| "epoch": 5.583333333333333, |
| "grad_norm": 1.9468347826097554, |
| "kl": 0.0498046875, |
| "learning_rate": 4.5208333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9123249053955078, |
| "reward_std": 0.02088339626789093, |
| "rewards/accuracy_reward": 0.9123249053955078, |
| "rewards/format_reward": 1.0, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.50651550292969, |
| "epoch": 5.604166666666667, |
| "grad_norm": 1.8134017920844003, |
| "kl": 0.050537109375, |
| "learning_rate": 4.5e-07, |
| "loss": 0.002, |
| "reward": 1.9056270122528076, |
| "reward_std": 0.020590659230947495, |
| "rewards/accuracy_reward": 0.9056269526481628, |
| "rewards/format_reward": 1.0, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.43359375, |
| "epoch": 5.625, |
| "grad_norm": 2.097289435304608, |
| "kl": 0.058837890625, |
| "learning_rate": 4.479166666666667e-07, |
| "loss": 0.0024, |
| "reward": 1.8596689701080322, |
| "reward_std": 0.026524469256401062, |
| "rewards/accuracy_reward": 0.8609709143638611, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.8515625, |
| "epoch": 5.645833333333333, |
| "grad_norm": 2.9899940677112995, |
| "kl": 0.0556640625, |
| "learning_rate": 4.4583333333333334e-07, |
| "loss": 0.0023, |
| "reward": 1.891918420791626, |
| "reward_std": 0.02338644489645958, |
| "rewards/accuracy_reward": 0.8919183611869812, |
| "rewards/format_reward": 1.0, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.14714050292969, |
| "epoch": 5.666666666666667, |
| "grad_norm": 1.5447722254023164, |
| "kl": 0.052734375, |
| "learning_rate": 4.4374999999999993e-07, |
| "loss": 0.0022, |
| "reward": 1.8943192958831787, |
| "reward_std": 0.021093344315886497, |
| "rewards/accuracy_reward": 0.8943192362785339, |
| "rewards/format_reward": 1.0, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.02083587646484, |
| "epoch": 5.6875, |
| "grad_norm": 1.7252851771135564, |
| "kl": 0.05419921875, |
| "learning_rate": 4.4166666666666664e-07, |
| "loss": 0.0023, |
| "reward": 1.8994736671447754, |
| "reward_std": 0.025407809764146805, |
| "rewards/accuracy_reward": 0.9007757902145386, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.19792175292969, |
| "epoch": 5.708333333333333, |
| "grad_norm": 1.4821607784380106, |
| "kl": 0.05810546875, |
| "learning_rate": 4.395833333333333e-07, |
| "loss": 0.0025, |
| "reward": 1.9176604747772217, |
| "reward_std": 0.019861234351992607, |
| "rewards/accuracy_reward": 0.9176604151725769, |
| "rewards/format_reward": 1.0, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.72265625, |
| "epoch": 5.729166666666667, |
| "grad_norm": 6.345280883170612, |
| "kl": 0.052734375, |
| "learning_rate": 4.375e-07, |
| "loss": 0.0022, |
| "reward": 1.9127343893051147, |
| "reward_std": 0.019745318219065666, |
| "rewards/accuracy_reward": 0.9127345085144043, |
| "rewards/format_reward": 1.0, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.1640625, |
| "epoch": 5.75, |
| "grad_norm": 1.8167510177139423, |
| "kl": 0.052734375, |
| "learning_rate": 4.3541666666666664e-07, |
| "loss": 0.0022, |
| "reward": 1.9143961668014526, |
| "reward_std": 0.021544938907027245, |
| "rewards/accuracy_reward": 0.9143962264060974, |
| "rewards/format_reward": 1.0, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.9296875, |
| "epoch": 5.770833333333333, |
| "grad_norm": 2.4532452869571566, |
| "kl": 0.05322265625, |
| "learning_rate": 4.3333333333333335e-07, |
| "loss": 0.0021, |
| "reward": 1.8847317695617676, |
| "reward_std": 0.02402741275727749, |
| "rewards/accuracy_reward": 0.8847318887710571, |
| "rewards/format_reward": 1.0, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.16146087646484, |
| "epoch": 5.791666666666667, |
| "grad_norm": 4.133102609868267, |
| "kl": 0.04541015625, |
| "learning_rate": 4.3125e-07, |
| "loss": 0.0019, |
| "reward": 1.9220575094223022, |
| "reward_std": 0.01720447652041912, |
| "rewards/accuracy_reward": 0.9220575094223022, |
| "rewards/format_reward": 1.0, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.95573425292969, |
| "epoch": 5.8125, |
| "grad_norm": 7.211708520736478, |
| "kl": 0.04541015625, |
| "learning_rate": 4.291666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9195680618286133, |
| "reward_std": 0.019044464454054832, |
| "rewards/accuracy_reward": 0.9195680618286133, |
| "rewards/format_reward": 1.0, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.79427337646484, |
| "epoch": 5.833333333333333, |
| "grad_norm": 2.5193114976640394, |
| "kl": 0.047607421875, |
| "learning_rate": 4.270833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9032373428344727, |
| "reward_std": 0.01688932441174984, |
| "rewards/accuracy_reward": 0.903237521648407, |
| "rewards/format_reward": 1.0, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.97135925292969, |
| "epoch": 5.854166666666667, |
| "grad_norm": 1.922113400060024, |
| "kl": 0.046875, |
| "learning_rate": 4.2499999999999995e-07, |
| "loss": 0.002, |
| "reward": 1.8878214359283447, |
| "reward_std": 0.023371964693069458, |
| "rewards/accuracy_reward": 0.8878213763237, |
| "rewards/format_reward": 1.0, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.08073425292969, |
| "epoch": 5.875, |
| "grad_norm": 1.786331439327784, |
| "kl": 0.04541015625, |
| "learning_rate": 4.2291666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9193141460418701, |
| "reward_std": 0.02163059636950493, |
| "rewards/accuracy_reward": 0.9206160306930542, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.82421875, |
| "epoch": 5.895833333333333, |
| "grad_norm": 4.285948048806076, |
| "kl": 0.0419921875, |
| "learning_rate": 4.208333333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.915520429611206, |
| "reward_std": 0.021722108125686646, |
| "rewards/accuracy_reward": 0.9155203700065613, |
| "rewards/format_reward": 1.0, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.68229675292969, |
| "epoch": 5.916666666666667, |
| "grad_norm": 2.1862985744626102, |
| "kl": 0.045166015625, |
| "learning_rate": 4.1875e-07, |
| "loss": 0.0019, |
| "reward": 1.8867233991622925, |
| "reward_std": 0.022505465894937515, |
| "rewards/accuracy_reward": 0.8867233991622925, |
| "rewards/format_reward": 1.0, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.41536712646484, |
| "epoch": 5.9375, |
| "grad_norm": 2.6822943568035655, |
| "kl": 0.050537109375, |
| "learning_rate": 4.1666666666666667e-07, |
| "loss": 0.0021, |
| "reward": 1.9038598537445068, |
| "reward_std": 0.02783789113163948, |
| "rewards/accuracy_reward": 0.9051617980003357, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.13542175292969, |
| "epoch": 5.958333333333333, |
| "grad_norm": 1.8215228571783213, |
| "kl": 0.0498046875, |
| "learning_rate": 4.145833333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9209411144256592, |
| "reward_std": 0.020260518416762352, |
| "rewards/accuracy_reward": 0.9209408760070801, |
| "rewards/format_reward": 1.0, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.25901794433594, |
| "epoch": 5.979166666666667, |
| "grad_norm": 1.7303647275137022, |
| "kl": 0.04931640625, |
| "learning_rate": 4.1249999999999997e-07, |
| "loss": 0.0021, |
| "reward": 1.8940850496292114, |
| "reward_std": 0.02454877458512783, |
| "rewards/accuracy_reward": 0.8954200744628906, |
| "rewards/format_reward": 0.998664915561676, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.49870300292969, |
| "epoch": 6.020833333333333, |
| "grad_norm": 2.065626526970926, |
| "kl": 0.045654296875, |
| "learning_rate": 4.104166666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.8828470706939697, |
| "reward_std": 0.023679915815591812, |
| "rewards/accuracy_reward": 0.8841490745544434, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.63021087646484, |
| "epoch": 6.041666666666667, |
| "grad_norm": 2.407620241107693, |
| "kl": 0.044189453125, |
| "learning_rate": 4.083333333333333e-07, |
| "loss": 0.0018, |
| "reward": 1.922331690788269, |
| "reward_std": 0.02023524045944214, |
| "rewards/accuracy_reward": 0.9223315715789795, |
| "rewards/format_reward": 1.0, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.02734375, |
| "epoch": 6.0625, |
| "grad_norm": 2.583573023564423, |
| "kl": 0.043212890625, |
| "learning_rate": 4.0625e-07, |
| "loss": 0.0018, |
| "reward": 1.9238051176071167, |
| "reward_std": 0.01875409483909607, |
| "rewards/accuracy_reward": 0.9238051176071167, |
| "rewards/format_reward": 1.0, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.75130462646484, |
| "epoch": 6.083333333333333, |
| "grad_norm": 1.5921015904802127, |
| "kl": 0.04736328125, |
| "learning_rate": 4.041666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.90175199508667, |
| "reward_std": 0.01868622750043869, |
| "rewards/accuracy_reward": 0.9017519950866699, |
| "rewards/format_reward": 1.0, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.14453125, |
| "epoch": 6.104166666666667, |
| "grad_norm": 2.063415833551645, |
| "kl": 0.04638671875, |
| "learning_rate": 4.0208333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9288432598114014, |
| "reward_std": 0.017594818025827408, |
| "rewards/accuracy_reward": 0.9288431406021118, |
| "rewards/format_reward": 1.0, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.52995300292969, |
| "epoch": 6.125, |
| "grad_norm": 2.6980955335354597, |
| "kl": 0.043212890625, |
| "learning_rate": 4e-07, |
| "loss": 0.0018, |
| "reward": 1.9106167554855347, |
| "reward_std": 0.020342741161584854, |
| "rewards/accuracy_reward": 0.9106166362762451, |
| "rewards/format_reward": 1.0, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.89583587646484, |
| "epoch": 6.145833333333333, |
| "grad_norm": 1.7057280834613109, |
| "kl": 0.041259765625, |
| "learning_rate": 3.9791666666666663e-07, |
| "loss": 0.0017, |
| "reward": 1.9230403900146484, |
| "reward_std": 0.018507663160562515, |
| "rewards/accuracy_reward": 0.9230403900146484, |
| "rewards/format_reward": 1.0, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.31640625, |
| "epoch": 6.166666666666667, |
| "grad_norm": 1.7481050352203042, |
| "kl": 0.04296875, |
| "learning_rate": 3.958333333333333e-07, |
| "loss": 0.0018, |
| "reward": 1.8892457485198975, |
| "reward_std": 0.025482675060629845, |
| "rewards/accuracy_reward": 0.8905477523803711, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.57552337646484, |
| "epoch": 6.1875, |
| "grad_norm": 1.5790651747348239, |
| "kl": 0.045654296875, |
| "learning_rate": 3.9375e-07, |
| "loss": 0.0019, |
| "reward": 1.9172825813293457, |
| "reward_std": 0.01870821975171566, |
| "rewards/accuracy_reward": 0.9172827005386353, |
| "rewards/format_reward": 1.0, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.67708587646484, |
| "epoch": 6.208333333333333, |
| "grad_norm": 5.985924437288418, |
| "kl": 0.0458984375, |
| "learning_rate": 3.9166666666666664e-07, |
| "loss": 0.0019, |
| "reward": 1.9062340259552002, |
| "reward_std": 0.017811615020036697, |
| "rewards/accuracy_reward": 0.9062339663505554, |
| "rewards/format_reward": 1.0, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.33854675292969, |
| "epoch": 6.229166666666667, |
| "grad_norm": 29.636849516000904, |
| "kl": 0.046630859375, |
| "learning_rate": 3.8958333333333334e-07, |
| "loss": 0.0019, |
| "reward": 1.9097627401351929, |
| "reward_std": 0.025265123695135117, |
| "rewards/accuracy_reward": 0.9110648036003113, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.21745300292969, |
| "epoch": 6.25, |
| "grad_norm": 2.499873249076194, |
| "kl": 0.052001953125, |
| "learning_rate": 3.875e-07, |
| "loss": 0.0021, |
| "reward": 1.908249855041504, |
| "reward_std": 0.021753787994384766, |
| "rewards/accuracy_reward": 0.9082497358322144, |
| "rewards/format_reward": 1.0, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.96875, |
| "epoch": 6.270833333333333, |
| "grad_norm": 1.43250838119596, |
| "kl": 0.0478515625, |
| "learning_rate": 3.8541666666666665e-07, |
| "loss": 0.002, |
| "reward": 1.9135513305664062, |
| "reward_std": 0.02083742991089821, |
| "rewards/accuracy_reward": 0.9148534536361694, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.49479675292969, |
| "epoch": 6.291666666666667, |
| "grad_norm": 1.9939986843684863, |
| "kl": 0.046630859375, |
| "learning_rate": 3.8333333333333335e-07, |
| "loss": 0.002, |
| "reward": 1.9211857318878174, |
| "reward_std": 0.019976306706666946, |
| "rewards/accuracy_reward": 0.9211856722831726, |
| "rewards/format_reward": 1.0, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.37239837646484, |
| "epoch": 6.3125, |
| "grad_norm": 1.8430355626574486, |
| "kl": 0.047607421875, |
| "learning_rate": 3.8124999999999995e-07, |
| "loss": 0.002, |
| "reward": 1.9055722951889038, |
| "reward_std": 0.020207837224006653, |
| "rewards/accuracy_reward": 0.9055722951889038, |
| "rewards/format_reward": 1.0, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.99870300292969, |
| "epoch": 6.333333333333333, |
| "grad_norm": 4.534254461924897, |
| "kl": 0.044921875, |
| "learning_rate": 3.7916666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.8959176540374756, |
| "reward_std": 0.018412087112665176, |
| "rewards/accuracy_reward": 0.8959175944328308, |
| "rewards/format_reward": 1.0, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.14714050292969, |
| "epoch": 6.354166666666667, |
| "grad_norm": 1.5417912395172437, |
| "kl": 0.04638671875, |
| "learning_rate": 3.770833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9178366661071777, |
| "reward_std": 0.01834617182612419, |
| "rewards/accuracy_reward": 0.9178365468978882, |
| "rewards/format_reward": 1.0, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.67448425292969, |
| "epoch": 6.375, |
| "grad_norm": 10.381006533833029, |
| "kl": 0.043701171875, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0019, |
| "reward": 1.9170053005218506, |
| "reward_std": 0.020167209208011627, |
| "rewards/accuracy_reward": 0.9183073043823242, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.34635925292969, |
| "epoch": 6.395833333333333, |
| "grad_norm": 3.2103701478375366, |
| "kl": 0.046630859375, |
| "learning_rate": 3.7291666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9029948711395264, |
| "reward_std": 0.019157804548740387, |
| "rewards/accuracy_reward": 0.9029948711395264, |
| "rewards/format_reward": 1.0, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.34114837646484, |
| "epoch": 6.416666666666667, |
| "grad_norm": 2.1215276638392253, |
| "kl": 0.051025390625, |
| "learning_rate": 3.708333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9299815893173218, |
| "reward_std": 0.01765059307217598, |
| "rewards/accuracy_reward": 0.9299815893173218, |
| "rewards/format_reward": 1.0, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.96354675292969, |
| "epoch": 6.4375, |
| "grad_norm": 2.0251198297391815, |
| "kl": 0.04736328125, |
| "learning_rate": 3.6875e-07, |
| "loss": 0.002, |
| "reward": 1.901715636253357, |
| "reward_std": 0.024447208270430565, |
| "rewards/accuracy_reward": 0.9030176401138306, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.21224212646484, |
| "epoch": 6.458333333333333, |
| "grad_norm": 5.5850053870802645, |
| "kl": 0.04345703125, |
| "learning_rate": 3.666666666666666e-07, |
| "loss": 0.0018, |
| "reward": 1.9226205348968506, |
| "reward_std": 0.01778128370642662, |
| "rewards/accuracy_reward": 0.9226205348968506, |
| "rewards/format_reward": 1.0, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.30989837646484, |
| "epoch": 6.479166666666667, |
| "grad_norm": 2.1418335808540183, |
| "kl": 0.045654296875, |
| "learning_rate": 3.645833333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9199604988098145, |
| "reward_std": 0.018466424196958542, |
| "rewards/accuracy_reward": 0.919960618019104, |
| "rewards/format_reward": 1.0, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.81771087646484, |
| "epoch": 6.5, |
| "grad_norm": 1.5934526555915005, |
| "kl": 0.051513671875, |
| "learning_rate": 3.6249999999999997e-07, |
| "loss": 0.0021, |
| "reward": 1.9024890661239624, |
| "reward_std": 0.018778668716549873, |
| "rewards/accuracy_reward": 0.9024890661239624, |
| "rewards/format_reward": 1.0, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.77474212646484, |
| "epoch": 6.520833333333333, |
| "grad_norm": 9.90923642720643, |
| "kl": 0.04736328125, |
| "learning_rate": 3.604166666666666e-07, |
| "loss": 0.002, |
| "reward": 1.9020146131515503, |
| "reward_std": 0.021591586992144585, |
| "rewards/accuracy_reward": 0.9033166766166687, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.80859375, |
| "epoch": 6.541666666666667, |
| "grad_norm": 2.2902946965667996, |
| "kl": 0.040771484375, |
| "learning_rate": 3.583333333333333e-07, |
| "loss": 0.0017, |
| "reward": 1.9083236455917358, |
| "reward_std": 0.020869677886366844, |
| "rewards/accuracy_reward": 0.9083236455917358, |
| "rewards/format_reward": 1.0, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.97135925292969, |
| "epoch": 6.5625, |
| "grad_norm": 2.141619909298115, |
| "kl": 0.04443359375, |
| "learning_rate": 3.5625e-07, |
| "loss": 0.0019, |
| "reward": 1.895308494567871, |
| "reward_std": 0.021585416048765182, |
| "rewards/accuracy_reward": 0.8966106176376343, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.83984375, |
| "epoch": 6.583333333333333, |
| "grad_norm": 1.722945461090862, |
| "kl": 0.052490234375, |
| "learning_rate": 3.541666666666667e-07, |
| "loss": 0.0022, |
| "reward": 1.9142106771469116, |
| "reward_std": 0.022508492693305016, |
| "rewards/accuracy_reward": 0.91551274061203, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.95703125, |
| "epoch": 6.604166666666667, |
| "grad_norm": 3.6778187227400396, |
| "kl": 0.0498046875, |
| "learning_rate": 3.5208333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.8938100337982178, |
| "reward_std": 0.024937432259321213, |
| "rewards/accuracy_reward": 0.8951120376586914, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.52995300292969, |
| "epoch": 6.625, |
| "grad_norm": 2.535477979468292, |
| "kl": 0.0634765625, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0026, |
| "reward": 1.900233268737793, |
| "reward_std": 0.01887938380241394, |
| "rewards/accuracy_reward": 0.9002333879470825, |
| "rewards/format_reward": 1.0, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.04817962646484, |
| "epoch": 6.645833333333333, |
| "grad_norm": 2.288272855129929, |
| "kl": 0.045654296875, |
| "learning_rate": 3.4791666666666664e-07, |
| "loss": 0.0019, |
| "reward": 1.912428855895996, |
| "reward_std": 0.020974930375814438, |
| "rewards/accuracy_reward": 0.9124290347099304, |
| "rewards/format_reward": 1.0, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.02864837646484, |
| "epoch": 6.666666666666667, |
| "grad_norm": 2.0183407375033413, |
| "kl": 0.04736328125, |
| "learning_rate": 3.458333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9255565404891968, |
| "reward_std": 0.01740911416709423, |
| "rewards/accuracy_reward": 0.9255565404891968, |
| "rewards/format_reward": 1.0, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.26302337646484, |
| "epoch": 6.6875, |
| "grad_norm": 2.4818346839800007, |
| "kl": 0.04736328125, |
| "learning_rate": 3.4375e-07, |
| "loss": 0.0019, |
| "reward": 1.882767915725708, |
| "reward_std": 0.024225857108831406, |
| "rewards/accuracy_reward": 0.8827678561210632, |
| "rewards/format_reward": 1.0, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.06771087646484, |
| "epoch": 6.708333333333333, |
| "grad_norm": 1.4343972340076592, |
| "kl": 0.05322265625, |
| "learning_rate": 3.4166666666666664e-07, |
| "loss": 0.0022, |
| "reward": 1.9222266674041748, |
| "reward_std": 0.015834566205739975, |
| "rewards/accuracy_reward": 0.92222660779953, |
| "rewards/format_reward": 1.0, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.90234375, |
| "epoch": 6.729166666666667, |
| "grad_norm": 1.852771798485069, |
| "kl": 0.0517578125, |
| "learning_rate": 3.3958333333333335e-07, |
| "loss": 0.0022, |
| "reward": 1.9179143905639648, |
| "reward_std": 0.02144519053399563, |
| "rewards/accuracy_reward": 0.9192163944244385, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.6328125, |
| "epoch": 6.75, |
| "grad_norm": 3.6471691567389057, |
| "kl": 0.054443359375, |
| "learning_rate": 3.375e-07, |
| "loss": 0.0022, |
| "reward": 1.909096598625183, |
| "reward_std": 0.01967495307326317, |
| "rewards/accuracy_reward": 0.9090965986251831, |
| "rewards/format_reward": 1.0, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.61458587646484, |
| "epoch": 6.770833333333333, |
| "grad_norm": 4.178729179494792, |
| "kl": 0.047607421875, |
| "learning_rate": 3.3541666666666665e-07, |
| "loss": 0.002, |
| "reward": 1.923593521118164, |
| "reward_std": 0.016256026923656464, |
| "rewards/accuracy_reward": 0.9235934019088745, |
| "rewards/format_reward": 1.0, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.46875, |
| "epoch": 6.791666666666667, |
| "grad_norm": 3.850110305328207, |
| "kl": 0.05224609375, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0022, |
| "reward": 1.8894399404525757, |
| "reward_std": 0.0215081088244915, |
| "rewards/accuracy_reward": 0.8894399404525757, |
| "rewards/format_reward": 1.0, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.98567962646484, |
| "epoch": 6.8125, |
| "grad_norm": 1.5295790546923704, |
| "kl": 0.049560546875, |
| "learning_rate": 3.3124999999999995e-07, |
| "loss": 0.0021, |
| "reward": 1.902695894241333, |
| "reward_std": 0.022552501410245895, |
| "rewards/accuracy_reward": 0.9039978384971619, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.2109375, |
| "epoch": 6.833333333333333, |
| "grad_norm": 2.177500326990495, |
| "kl": 0.049560546875, |
| "learning_rate": 3.2916666666666666e-07, |
| "loss": 0.0021, |
| "reward": 1.9026316404342651, |
| "reward_std": 0.028222566470503807, |
| "rewards/accuracy_reward": 0.9039337038993835, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.26823425292969, |
| "epoch": 6.854166666666667, |
| "grad_norm": 2.25656186299227, |
| "kl": 0.049072265625, |
| "learning_rate": 3.270833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9238814115524292, |
| "reward_std": 0.01956191472709179, |
| "rewards/accuracy_reward": 0.9238814115524292, |
| "rewards/format_reward": 1.0, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.48177337646484, |
| "epoch": 6.875, |
| "grad_norm": 1.962758450391304, |
| "kl": 0.044677734375, |
| "learning_rate": 3.25e-07, |
| "loss": 0.0019, |
| "reward": 1.902562141418457, |
| "reward_std": 0.016970310360193253, |
| "rewards/accuracy_reward": 0.902562141418457, |
| "rewards/format_reward": 1.0, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.42317962646484, |
| "epoch": 6.895833333333333, |
| "grad_norm": 1.932226827561351, |
| "kl": 0.052978515625, |
| "learning_rate": 3.2291666666666666e-07, |
| "loss": 0.0022, |
| "reward": 1.9169180393218994, |
| "reward_std": 0.01541107427328825, |
| "rewards/accuracy_reward": 0.9169179797172546, |
| "rewards/format_reward": 1.0, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.95442962646484, |
| "epoch": 6.916666666666667, |
| "grad_norm": 2.123672528629966, |
| "kl": 0.0546875, |
| "learning_rate": 3.2083333333333337e-07, |
| "loss": 0.0023, |
| "reward": 1.9134833812713623, |
| "reward_std": 0.018955400213599205, |
| "rewards/accuracy_reward": 0.9134833812713623, |
| "rewards/format_reward": 1.0, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.23177337646484, |
| "epoch": 6.9375, |
| "grad_norm": 2.7379081437953436, |
| "kl": 0.04736328125, |
| "learning_rate": 3.1874999999999997e-07, |
| "loss": 0.0019, |
| "reward": 1.8997161388397217, |
| "reward_std": 0.019093122333288193, |
| "rewards/accuracy_reward": 0.8997160792350769, |
| "rewards/format_reward": 1.0, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.70052337646484, |
| "epoch": 6.958333333333333, |
| "grad_norm": 2.033398718348136, |
| "kl": 0.051025390625, |
| "learning_rate": 3.166666666666666e-07, |
| "loss": 0.0021, |
| "reward": 1.9058278799057007, |
| "reward_std": 0.020390968769788742, |
| "rewards/accuracy_reward": 0.9058279991149902, |
| "rewards/format_reward": 1.0, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.6568832397461, |
| "epoch": 6.979166666666667, |
| "grad_norm": 4.2350616184619625, |
| "kl": 0.0478515625, |
| "learning_rate": 3.145833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9187196493148804, |
| "reward_std": 0.019620845094323158, |
| "rewards/accuracy_reward": 0.9187195301055908, |
| "rewards/format_reward": 1.0, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.58984375, |
| "epoch": 7.020833333333333, |
| "grad_norm": 1.3225122948151316, |
| "kl": 0.04833984375, |
| "learning_rate": 3.1249999999999997e-07, |
| "loss": 0.002, |
| "reward": 1.904463768005371, |
| "reward_std": 0.016730796545743942, |
| "rewards/accuracy_reward": 0.9044637680053711, |
| "rewards/format_reward": 1.0, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.29036712646484, |
| "epoch": 7.041666666666667, |
| "grad_norm": 2.0001558004119135, |
| "kl": 0.04931640625, |
| "learning_rate": 3.104166666666667e-07, |
| "loss": 0.002, |
| "reward": 1.9282145500183105, |
| "reward_std": 0.020631009712815285, |
| "rewards/accuracy_reward": 0.9282145500183105, |
| "rewards/format_reward": 1.0, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.10026550292969, |
| "epoch": 7.0625, |
| "grad_norm": 1.7677360413399623, |
| "kl": 0.046142578125, |
| "learning_rate": 3.0833333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.913468360900879, |
| "reward_std": 0.01975328102707863, |
| "rewards/accuracy_reward": 0.9134685397148132, |
| "rewards/format_reward": 1.0, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.64583587646484, |
| "epoch": 7.083333333333333, |
| "grad_norm": 1.5863979383855535, |
| "kl": 0.044921875, |
| "learning_rate": 3.0625000000000003e-07, |
| "loss": 0.0019, |
| "reward": 1.9101800918579102, |
| "reward_std": 0.018298618495464325, |
| "rewards/accuracy_reward": 0.9101800918579102, |
| "rewards/format_reward": 1.0, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.95573425292969, |
| "epoch": 7.104166666666667, |
| "grad_norm": 1.7406147667004834, |
| "kl": 0.04296875, |
| "learning_rate": 3.0416666666666663e-07, |
| "loss": 0.0017, |
| "reward": 1.9128578901290894, |
| "reward_std": 0.025313010439276695, |
| "rewards/accuracy_reward": 0.9141599535942078, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.0390625, |
| "epoch": 7.125, |
| "grad_norm": 6.234665754184476, |
| "kl": 0.046875, |
| "learning_rate": 3.020833333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.8841427564620972, |
| "reward_std": 0.020634343847632408, |
| "rewards/accuracy_reward": 0.8841428160667419, |
| "rewards/format_reward": 1.0, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.65104675292969, |
| "epoch": 7.145833333333333, |
| "grad_norm": 2.942912834547579, |
| "kl": 0.053955078125, |
| "learning_rate": 3e-07, |
| "loss": 0.0022, |
| "reward": 1.9155168533325195, |
| "reward_std": 0.019924897700548172, |
| "rewards/accuracy_reward": 0.9155170321464539, |
| "rewards/format_reward": 1.0, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.85026550292969, |
| "epoch": 7.166666666666667, |
| "grad_norm": 1.658655338447552, |
| "kl": 0.048583984375, |
| "learning_rate": 2.9791666666666664e-07, |
| "loss": 0.002, |
| "reward": 1.9088094234466553, |
| "reward_std": 0.025105763226747513, |
| "rewards/accuracy_reward": 0.9101114273071289, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.1953125, |
| "epoch": 7.1875, |
| "grad_norm": 2.4406244402139365, |
| "kl": 0.04833984375, |
| "learning_rate": 2.9583333333333334e-07, |
| "loss": 0.002, |
| "reward": 1.9100263118743896, |
| "reward_std": 0.02519826404750347, |
| "rewards/accuracy_reward": 0.9113283157348633, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.19140625, |
| "epoch": 7.208333333333333, |
| "grad_norm": 4.271413410927959, |
| "kl": 0.048828125, |
| "learning_rate": 2.9375e-07, |
| "loss": 0.002, |
| "reward": 1.9093725681304932, |
| "reward_std": 0.01967555098235607, |
| "rewards/accuracy_reward": 0.9093725085258484, |
| "rewards/format_reward": 1.0, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.4140625, |
| "epoch": 7.229166666666667, |
| "grad_norm": 2.3763872048797414, |
| "kl": 0.0654296875, |
| "learning_rate": 2.916666666666667e-07, |
| "loss": 0.0027, |
| "reward": 1.90401029586792, |
| "reward_std": 0.023377878591418266, |
| "rewards/accuracy_reward": 0.9053124189376831, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.7265625, |
| "epoch": 7.25, |
| "grad_norm": 2.806628637706787, |
| "kl": 0.04443359375, |
| "learning_rate": 2.8958333333333335e-07, |
| "loss": 0.0019, |
| "reward": 1.8880078792572021, |
| "reward_std": 0.023502841591835022, |
| "rewards/accuracy_reward": 0.8880078196525574, |
| "rewards/format_reward": 1.0, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.31510925292969, |
| "epoch": 7.270833333333333, |
| "grad_norm": 3.43433922412685, |
| "kl": 0.0888671875, |
| "learning_rate": 2.8749999999999995e-07, |
| "loss": 0.0036, |
| "reward": 1.9123291969299316, |
| "reward_std": 0.020550193265080452, |
| "rewards/accuracy_reward": 0.9123293161392212, |
| "rewards/format_reward": 1.0, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.36589050292969, |
| "epoch": 7.291666666666667, |
| "grad_norm": 2.842653921619938, |
| "kl": 0.046142578125, |
| "learning_rate": 2.8541666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.9058549404144287, |
| "reward_std": 0.019686056300997734, |
| "rewards/accuracy_reward": 0.9058548808097839, |
| "rewards/format_reward": 1.0, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.89974212646484, |
| "epoch": 7.3125, |
| "grad_norm": 2.0405865834971473, |
| "kl": 0.046630859375, |
| "learning_rate": 2.833333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9217561483383179, |
| "reward_std": 0.018797121942043304, |
| "rewards/accuracy_reward": 0.9217562675476074, |
| "rewards/format_reward": 1.0, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.70182800292969, |
| "epoch": 7.333333333333333, |
| "grad_norm": 1.9486921558633905, |
| "kl": 0.0458984375, |
| "learning_rate": 2.8125e-07, |
| "loss": 0.0019, |
| "reward": 1.8922131061553955, |
| "reward_std": 0.02469293400645256, |
| "rewards/accuracy_reward": 0.892212986946106, |
| "rewards/format_reward": 1.0, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.19140625, |
| "epoch": 7.354166666666667, |
| "grad_norm": 2.5259874539949303, |
| "kl": 0.045654296875, |
| "learning_rate": 2.7916666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.905487060546875, |
| "reward_std": 0.023547440767288208, |
| "rewards/accuracy_reward": 0.905487060546875, |
| "rewards/format_reward": 1.0, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.25130462646484, |
| "epoch": 7.375, |
| "grad_norm": 6.590006203990273, |
| "kl": 0.055908203125, |
| "learning_rate": 2.770833333333333e-07, |
| "loss": 0.0023, |
| "reward": 1.931574821472168, |
| "reward_std": 0.019262373447418213, |
| "rewards/accuracy_reward": 0.9315750002861023, |
| "rewards/format_reward": 1.0, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.35677337646484, |
| "epoch": 7.395833333333333, |
| "grad_norm": 4.321196737881709, |
| "kl": 0.049560546875, |
| "learning_rate": 2.75e-07, |
| "loss": 0.0021, |
| "reward": 1.9387693405151367, |
| "reward_std": 0.017124010249972343, |
| "rewards/accuracy_reward": 0.9387692213058472, |
| "rewards/format_reward": 1.0, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.671875, |
| "epoch": 7.416666666666667, |
| "grad_norm": 2.8457346329200557, |
| "kl": 0.0458984375, |
| "learning_rate": 2.729166666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9095313549041748, |
| "reward_std": 0.021084271371364594, |
| "rewards/accuracy_reward": 0.90953129529953, |
| "rewards/format_reward": 1.0, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.71224212646484, |
| "epoch": 7.4375, |
| "grad_norm": 1.993210730812182, |
| "kl": 0.046875, |
| "learning_rate": 2.708333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9209964275360107, |
| "reward_std": 0.020304953679442406, |
| "rewards/accuracy_reward": 0.9209963083267212, |
| "rewards/format_reward": 1.0, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.25521087646484, |
| "epoch": 7.458333333333333, |
| "grad_norm": 2.4373953511615856, |
| "kl": 0.0458984375, |
| "learning_rate": 2.6874999999999997e-07, |
| "loss": 0.0019, |
| "reward": 1.9061431884765625, |
| "reward_std": 0.01994149014353752, |
| "rewards/accuracy_reward": 0.906143069267273, |
| "rewards/format_reward": 1.0, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.44140625, |
| "epoch": 7.479166666666667, |
| "grad_norm": 3.7778106185692635, |
| "kl": 0.053466796875, |
| "learning_rate": 2.6666666666666667e-07, |
| "loss": 0.0022, |
| "reward": 1.9190186262130737, |
| "reward_std": 0.020440340042114258, |
| "rewards/accuracy_reward": 0.9190186262130737, |
| "rewards/format_reward": 1.0, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.86589050292969, |
| "epoch": 7.5, |
| "grad_norm": 2.376594210526858, |
| "kl": 0.047119140625, |
| "learning_rate": 2.645833333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9218724966049194, |
| "reward_std": 0.02028195932507515, |
| "rewards/accuracy_reward": 0.9218723773956299, |
| "rewards/format_reward": 1.0, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.45052337646484, |
| "epoch": 7.520833333333333, |
| "grad_norm": 1.7750680835812318, |
| "kl": 0.048583984375, |
| "learning_rate": 2.625e-07, |
| "loss": 0.002, |
| "reward": 1.90578293800354, |
| "reward_std": 0.018642796203494072, |
| "rewards/accuracy_reward": 0.9057828187942505, |
| "rewards/format_reward": 1.0, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.1484375, |
| "epoch": 7.541666666666667, |
| "grad_norm": 2.022098717469963, |
| "kl": 0.04833984375, |
| "learning_rate": 2.604166666666667e-07, |
| "loss": 0.002, |
| "reward": 1.895311713218689, |
| "reward_std": 0.024539759382605553, |
| "rewards/accuracy_reward": 0.8966139554977417, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.81380462646484, |
| "epoch": 7.5625, |
| "grad_norm": 4.701010132350987, |
| "kl": 0.053466796875, |
| "learning_rate": 2.5833333333333333e-07, |
| "loss": 0.0022, |
| "reward": 1.9116785526275635, |
| "reward_std": 0.01904495432972908, |
| "rewards/accuracy_reward": 0.9116784930229187, |
| "rewards/format_reward": 1.0, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.61328125, |
| "epoch": 7.583333333333333, |
| "grad_norm": 2.7971888222519197, |
| "kl": 0.04638671875, |
| "learning_rate": 2.5625e-07, |
| "loss": 0.0019, |
| "reward": 1.9103072881698608, |
| "reward_std": 0.019077036529779434, |
| "rewards/accuracy_reward": 0.9103072881698608, |
| "rewards/format_reward": 1.0, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.35677337646484, |
| "epoch": 7.604166666666667, |
| "grad_norm": 3.340602697847883, |
| "kl": 0.049560546875, |
| "learning_rate": 2.5416666666666663e-07, |
| "loss": 0.002, |
| "reward": 1.9107370376586914, |
| "reward_std": 0.01656663417816162, |
| "rewards/accuracy_reward": 0.9107369780540466, |
| "rewards/format_reward": 1.0, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.40755462646484, |
| "epoch": 7.625, |
| "grad_norm": 3.4757576150836527, |
| "kl": 0.052978515625, |
| "learning_rate": 2.5208333333333334e-07, |
| "loss": 0.0022, |
| "reward": 1.897055983543396, |
| "reward_std": 0.02028050646185875, |
| "rewards/accuracy_reward": 0.897055983543396, |
| "rewards/format_reward": 1.0, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.87109375, |
| "epoch": 7.645833333333333, |
| "grad_norm": 1.7537209554556075, |
| "kl": 0.052001953125, |
| "learning_rate": 2.5e-07, |
| "loss": 0.0021, |
| "reward": 1.9118655920028687, |
| "reward_std": 0.019177807494997978, |
| "rewards/accuracy_reward": 0.9118657112121582, |
| "rewards/format_reward": 1.0, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.87760925292969, |
| "epoch": 7.666666666666667, |
| "grad_norm": 1.985054115096998, |
| "kl": 0.053955078125, |
| "learning_rate": 2.4791666666666664e-07, |
| "loss": 0.0022, |
| "reward": 1.9363340139389038, |
| "reward_std": 0.01809048466384411, |
| "rewards/accuracy_reward": 0.9363340139389038, |
| "rewards/format_reward": 1.0, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.9375, |
| "epoch": 7.6875, |
| "grad_norm": 2.640046167255212, |
| "kl": 0.055908203125, |
| "learning_rate": 2.458333333333333e-07, |
| "loss": 0.0023, |
| "reward": 1.8969202041625977, |
| "reward_std": 0.02410067245364189, |
| "rewards/accuracy_reward": 0.8982224464416504, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.78646087646484, |
| "epoch": 7.708333333333333, |
| "grad_norm": 2.0927423356674155, |
| "kl": 0.05322265625, |
| "learning_rate": 2.4375e-07, |
| "loss": 0.0023, |
| "reward": 1.934058427810669, |
| "reward_std": 0.020023031160235405, |
| "rewards/accuracy_reward": 0.9340583682060242, |
| "rewards/format_reward": 1.0, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.98177337646484, |
| "epoch": 7.729166666666667, |
| "grad_norm": 5.374648001232616, |
| "kl": 0.05517578125, |
| "learning_rate": 2.4166666666666665e-07, |
| "loss": 0.0023, |
| "reward": 1.9142836332321167, |
| "reward_std": 0.01773514598608017, |
| "rewards/accuracy_reward": 0.9142836928367615, |
| "rewards/format_reward": 1.0, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.72396087646484, |
| "epoch": 7.75, |
| "grad_norm": 1.5292733041237832, |
| "kl": 0.050537109375, |
| "learning_rate": 2.3958333333333335e-07, |
| "loss": 0.0021, |
| "reward": 1.9197324514389038, |
| "reward_std": 0.0220477432012558, |
| "rewards/accuracy_reward": 0.9210345149040222, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.1875, |
| "epoch": 7.770833333333333, |
| "grad_norm": 9.487726177733885, |
| "kl": 0.045654296875, |
| "learning_rate": 2.3749999999999998e-07, |
| "loss": 0.0019, |
| "reward": 1.927847146987915, |
| "reward_std": 0.017860591411590576, |
| "rewards/accuracy_reward": 0.9278470873832703, |
| "rewards/format_reward": 1.0, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.3203125, |
| "epoch": 7.791666666666667, |
| "grad_norm": 2.667778768562944, |
| "kl": 0.0458984375, |
| "learning_rate": 2.3541666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.9139199256896973, |
| "reward_std": 0.02642824873328209, |
| "rewards/accuracy_reward": 0.9178261756896973, |
| "rewards/format_reward": 0.99609375, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.01302337646484, |
| "epoch": 7.8125, |
| "grad_norm": 2.86218482085828, |
| "kl": 0.051025390625, |
| "learning_rate": 2.3333333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9041519165039062, |
| "reward_std": 0.02404339425265789, |
| "rewards/accuracy_reward": 0.9054540395736694, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.94401550292969, |
| "epoch": 7.833333333333333, |
| "grad_norm": 1.5489939957891778, |
| "kl": 0.05322265625, |
| "learning_rate": 2.3125e-07, |
| "loss": 0.0022, |
| "reward": 1.9263066053390503, |
| "reward_std": 0.019394386559724808, |
| "rewards/accuracy_reward": 0.9263066649436951, |
| "rewards/format_reward": 1.0, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.15755462646484, |
| "epoch": 7.854166666666667, |
| "grad_norm": 2.64116020814343, |
| "kl": 0.052490234375, |
| "learning_rate": 2.2916666666666663e-07, |
| "loss": 0.0021, |
| "reward": 1.9046939611434937, |
| "reward_std": 0.023857450112700462, |
| "rewards/accuracy_reward": 0.9059960842132568, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.32552337646484, |
| "epoch": 7.875, |
| "grad_norm": 1.80983212656234, |
| "kl": 0.044921875, |
| "learning_rate": 2.270833333333333e-07, |
| "loss": 0.0018, |
| "reward": 1.934501051902771, |
| "reward_std": 0.016163021326065063, |
| "rewards/accuracy_reward": 0.934501051902771, |
| "rewards/format_reward": 1.0, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.37890625, |
| "epoch": 7.895833333333333, |
| "grad_norm": 10.563112786998811, |
| "kl": 0.048583984375, |
| "learning_rate": 2.25e-07, |
| "loss": 0.002, |
| "reward": 1.8924764394760132, |
| "reward_std": 0.019610995426774025, |
| "rewards/accuracy_reward": 0.892476499080658, |
| "rewards/format_reward": 1.0, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.34896087646484, |
| "epoch": 7.916666666666667, |
| "grad_norm": 2.533818918571777, |
| "kl": 0.044921875, |
| "learning_rate": 2.2291666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.906684398651123, |
| "reward_std": 0.01782190427184105, |
| "rewards/accuracy_reward": 0.906684398651123, |
| "rewards/format_reward": 1.0, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.51823425292969, |
| "epoch": 7.9375, |
| "grad_norm": 2.2661373394338926, |
| "kl": 0.0546875, |
| "learning_rate": 2.2083333333333332e-07, |
| "loss": 0.0022, |
| "reward": 1.9244441986083984, |
| "reward_std": 0.01977790892124176, |
| "rewards/accuracy_reward": 0.9244440793991089, |
| "rewards/format_reward": 1.0, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.78385925292969, |
| "epoch": 7.958333333333333, |
| "grad_norm": 1.7111686099562637, |
| "kl": 0.051025390625, |
| "learning_rate": 2.1875e-07, |
| "loss": 0.0021, |
| "reward": 1.897660255432129, |
| "reward_std": 0.02316589280962944, |
| "rewards/accuracy_reward": 0.8976603746414185, |
| "rewards/format_reward": 1.0, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.38451385498047, |
| "epoch": 7.979166666666667, |
| "grad_norm": 3.8657178191371275, |
| "kl": 0.044921875, |
| "learning_rate": 2.1666666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.91867196559906, |
| "reward_std": 0.018634842708706856, |
| "rewards/accuracy_reward": 0.9186719655990601, |
| "rewards/format_reward": 1.0, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.22917175292969, |
| "epoch": 8.020833333333334, |
| "grad_norm": 1.9741658039132675, |
| "kl": 0.04638671875, |
| "learning_rate": 2.145833333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9112396240234375, |
| "reward_std": 0.01786581240594387, |
| "rewards/accuracy_reward": 0.9112398028373718, |
| "rewards/format_reward": 1.0, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.52604675292969, |
| "epoch": 8.041666666666666, |
| "grad_norm": 4.239952555901759, |
| "kl": 0.04931640625, |
| "learning_rate": 2.1249999999999998e-07, |
| "loss": 0.002, |
| "reward": 1.9145668745040894, |
| "reward_std": 0.03050382435321808, |
| "rewards/accuracy_reward": 0.9171710014343262, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.51171875, |
| "epoch": 8.0625, |
| "grad_norm": 3.1873677902649264, |
| "kl": 0.04638671875, |
| "learning_rate": 2.1041666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.9099664688110352, |
| "reward_std": 0.021909143775701523, |
| "rewards/accuracy_reward": 0.9099664688110352, |
| "rewards/format_reward": 1.0, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.73046875, |
| "epoch": 8.083333333333334, |
| "grad_norm": 1.4761981134253073, |
| "kl": 0.043212890625, |
| "learning_rate": 2.0833333333333333e-07, |
| "loss": 0.0018, |
| "reward": 1.908822774887085, |
| "reward_std": 0.026725394651293755, |
| "rewards/accuracy_reward": 0.9114267826080322, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.9765625, |
| "epoch": 8.104166666666666, |
| "grad_norm": 2.5298325743515413, |
| "kl": 0.055908203125, |
| "learning_rate": 2.0624999999999998e-07, |
| "loss": 0.0023, |
| "reward": 1.919610857963562, |
| "reward_std": 0.022303760051727295, |
| "rewards/accuracy_reward": 0.919610857963562, |
| "rewards/format_reward": 1.0, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.39453125, |
| "epoch": 8.125, |
| "grad_norm": 1.6701037952221194, |
| "kl": 0.04736328125, |
| "learning_rate": 2.0416666666666666e-07, |
| "loss": 0.002, |
| "reward": 1.8952322006225586, |
| "reward_std": 0.02632908523082733, |
| "rewards/accuracy_reward": 0.8978363275527954, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.75911712646484, |
| "epoch": 8.145833333333334, |
| "grad_norm": 1.8105877210751262, |
| "kl": 0.049560546875, |
| "learning_rate": 2.0208333333333334e-07, |
| "loss": 0.002, |
| "reward": 1.9192695617675781, |
| "reward_std": 0.02205723151564598, |
| "rewards/accuracy_reward": 0.9192695617675781, |
| "rewards/format_reward": 1.0, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.484375, |
| "epoch": 8.166666666666666, |
| "grad_norm": 3.6256736252714084, |
| "kl": 0.052001953125, |
| "learning_rate": 2e-07, |
| "loss": 0.0022, |
| "reward": 1.899370551109314, |
| "reward_std": 0.02411123923957348, |
| "rewards/accuracy_reward": 0.9006726145744324, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.23698425292969, |
| "epoch": 8.1875, |
| "grad_norm": 2.8889796981288867, |
| "kl": 0.05517578125, |
| "learning_rate": 1.9791666666666664e-07, |
| "loss": 0.0023, |
| "reward": 1.9239459037780762, |
| "reward_std": 0.020000584423542023, |
| "rewards/accuracy_reward": 0.9239459037780762, |
| "rewards/format_reward": 1.0, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.01953125, |
| "epoch": 8.208333333333334, |
| "grad_norm": 4.9653820626533385, |
| "kl": 0.052734375, |
| "learning_rate": 1.9583333333333332e-07, |
| "loss": 0.0022, |
| "reward": 1.9158563613891602, |
| "reward_std": 0.025936102494597435, |
| "rewards/accuracy_reward": 0.9171584844589233, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.20964050292969, |
| "epoch": 8.229166666666666, |
| "grad_norm": 2.290735973233346, |
| "kl": 0.055908203125, |
| "learning_rate": 1.9375e-07, |
| "loss": 0.0023, |
| "reward": 1.920623540878296, |
| "reward_std": 0.016118617728352547, |
| "rewards/accuracy_reward": 0.9206234216690063, |
| "rewards/format_reward": 1.0, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.79817962646484, |
| "epoch": 8.25, |
| "grad_norm": 2.14149502952453, |
| "kl": 0.05078125, |
| "learning_rate": 1.9166666666666668e-07, |
| "loss": 0.0021, |
| "reward": 1.8982393741607666, |
| "reward_std": 0.02737743966281414, |
| "rewards/accuracy_reward": 0.8995413780212402, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.703125, |
| "epoch": 8.270833333333334, |
| "grad_norm": 1.7520909830046585, |
| "kl": 0.049560546875, |
| "learning_rate": 1.8958333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9203259944915771, |
| "reward_std": 0.017365001142024994, |
| "rewards/accuracy_reward": 0.9203259348869324, |
| "rewards/format_reward": 1.0, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.1953125, |
| "epoch": 8.291666666666666, |
| "grad_norm": 3.8682316676739044, |
| "kl": 0.05322265625, |
| "learning_rate": 1.875e-07, |
| "loss": 0.0023, |
| "reward": 1.90675950050354, |
| "reward_std": 0.020597189664840698, |
| "rewards/accuracy_reward": 0.9067594408988953, |
| "rewards/format_reward": 1.0, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.93489837646484, |
| "epoch": 8.3125, |
| "grad_norm": 2.4384512365064483, |
| "kl": 0.047607421875, |
| "learning_rate": 1.8541666666666666e-07, |
| "loss": 0.002, |
| "reward": 1.923602819442749, |
| "reward_std": 0.01742154359817505, |
| "rewards/accuracy_reward": 0.9249049425125122, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.72005462646484, |
| "epoch": 8.333333333333334, |
| "grad_norm": 2.234427870328764, |
| "kl": 0.051513671875, |
| "learning_rate": 1.833333333333333e-07, |
| "loss": 0.0021, |
| "reward": 1.9096324443817139, |
| "reward_std": 0.021227438002824783, |
| "rewards/accuracy_reward": 0.9096323847770691, |
| "rewards/format_reward": 1.0, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.46745300292969, |
| "epoch": 8.354166666666666, |
| "grad_norm": 1.9160175504578056, |
| "kl": 0.05078125, |
| "learning_rate": 1.8124999999999999e-07, |
| "loss": 0.0022, |
| "reward": 1.9217007160186768, |
| "reward_std": 0.01802412047982216, |
| "rewards/accuracy_reward": 0.9230027198791504, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.35807800292969, |
| "epoch": 8.375, |
| "grad_norm": 5.1139313564913955, |
| "kl": 0.044921875, |
| "learning_rate": 1.7916666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9158090353012085, |
| "reward_std": 0.021614177152514458, |
| "rewards/accuracy_reward": 0.9171112775802612, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.94271087646484, |
| "epoch": 8.395833333333334, |
| "grad_norm": 1.8921820475793412, |
| "kl": 0.046875, |
| "learning_rate": 1.7708333333333334e-07, |
| "loss": 0.002, |
| "reward": 1.9103114604949951, |
| "reward_std": 0.01914086937904358, |
| "rewards/accuracy_reward": 0.9103114604949951, |
| "rewards/format_reward": 1.0, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.96614837646484, |
| "epoch": 8.416666666666666, |
| "grad_norm": 2.8573255442328405, |
| "kl": 0.050537109375, |
| "learning_rate": 1.75e-07, |
| "loss": 0.0021, |
| "reward": 1.9096262454986572, |
| "reward_std": 0.01799336075782776, |
| "rewards/accuracy_reward": 0.9096261262893677, |
| "rewards/format_reward": 1.0, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.53646087646484, |
| "epoch": 8.4375, |
| "grad_norm": 2.533540112681752, |
| "kl": 0.044677734375, |
| "learning_rate": 1.7291666666666664e-07, |
| "loss": 0.0019, |
| "reward": 1.9281089305877686, |
| "reward_std": 0.01919987052679062, |
| "rewards/accuracy_reward": 0.928108811378479, |
| "rewards/format_reward": 1.0, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.0234375, |
| "epoch": 8.458333333333334, |
| "grad_norm": 2.2470205832432666, |
| "kl": 0.04443359375, |
| "learning_rate": 1.7083333333333332e-07, |
| "loss": 0.0018, |
| "reward": 1.9159855842590332, |
| "reward_std": 0.021353445947170258, |
| "rewards/accuracy_reward": 0.9159855842590332, |
| "rewards/format_reward": 1.0, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.69792175292969, |
| "epoch": 8.479166666666666, |
| "grad_norm": 2.60464995508122, |
| "kl": 0.046875, |
| "learning_rate": 1.6875e-07, |
| "loss": 0.002, |
| "reward": 1.9079928398132324, |
| "reward_std": 0.022192446514964104, |
| "rewards/accuracy_reward": 0.909294843673706, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.828125, |
| "epoch": 8.5, |
| "grad_norm": 2.421415014993462, |
| "kl": 0.053466796875, |
| "learning_rate": 1.6666666666666665e-07, |
| "loss": 0.0022, |
| "reward": 1.9306447505950928, |
| "reward_std": 0.01638518087565899, |
| "rewards/accuracy_reward": 0.9306447505950928, |
| "rewards/format_reward": 1.0, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.39714050292969, |
| "epoch": 8.520833333333334, |
| "grad_norm": 1.7082957831764727, |
| "kl": 0.056640625, |
| "learning_rate": 1.6458333333333333e-07, |
| "loss": 0.0024, |
| "reward": 1.9131463766098022, |
| "reward_std": 0.023682190105319023, |
| "rewards/accuracy_reward": 0.9144482612609863, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.69010925292969, |
| "epoch": 8.541666666666666, |
| "grad_norm": 2.2438919940917197, |
| "kl": 0.04638671875, |
| "learning_rate": 1.625e-07, |
| "loss": 0.002, |
| "reward": 1.9307001829147339, |
| "reward_std": 0.016804661601781845, |
| "rewards/accuracy_reward": 0.9307002425193787, |
| "rewards/format_reward": 1.0, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.14192962646484, |
| "epoch": 8.5625, |
| "grad_norm": 3.264026183321021, |
| "kl": 0.04638671875, |
| "learning_rate": 1.6041666666666668e-07, |
| "loss": 0.0019, |
| "reward": 1.8919553756713867, |
| "reward_std": 0.02204691618680954, |
| "rewards/accuracy_reward": 0.8932574987411499, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.13542175292969, |
| "epoch": 8.583333333333334, |
| "grad_norm": 2.309073599276998, |
| "kl": 0.04443359375, |
| "learning_rate": 1.583333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9184863567352295, |
| "reward_std": 0.016750024631619453, |
| "rewards/accuracy_reward": 0.918486475944519, |
| "rewards/format_reward": 1.0, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.02604675292969, |
| "epoch": 8.604166666666666, |
| "grad_norm": 1.920449334945207, |
| "kl": 0.046875, |
| "learning_rate": 1.5624999999999999e-07, |
| "loss": 0.0019, |
| "reward": 1.8996871709823608, |
| "reward_std": 0.019097616896033287, |
| "rewards/accuracy_reward": 0.8996869921684265, |
| "rewards/format_reward": 1.0, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.38671875, |
| "epoch": 8.625, |
| "grad_norm": 6.108556482978297, |
| "kl": 0.047607421875, |
| "learning_rate": 1.5416666666666666e-07, |
| "loss": 0.002, |
| "reward": 1.8946789503097534, |
| "reward_std": 0.021756049245595932, |
| "rewards/accuracy_reward": 0.8946789503097534, |
| "rewards/format_reward": 1.0, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.61589050292969, |
| "epoch": 8.645833333333334, |
| "grad_norm": 1.4312273043036572, |
| "kl": 0.04833984375, |
| "learning_rate": 1.5208333333333332e-07, |
| "loss": 0.002, |
| "reward": 1.930631399154663, |
| "reward_std": 0.019629666581749916, |
| "rewards/accuracy_reward": 0.9319334030151367, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.0546875, |
| "epoch": 8.666666666666666, |
| "grad_norm": 2.0186934249559645, |
| "kl": 0.046142578125, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0019, |
| "reward": 1.910029411315918, |
| "reward_std": 0.01880134642124176, |
| "rewards/accuracy_reward": 0.9100292921066284, |
| "rewards/format_reward": 1.0, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.23958587646484, |
| "epoch": 8.6875, |
| "grad_norm": 2.388178370024915, |
| "kl": 0.046630859375, |
| "learning_rate": 1.4791666666666667e-07, |
| "loss": 0.0019, |
| "reward": 1.9040935039520264, |
| "reward_std": 0.023898255079984665, |
| "rewards/accuracy_reward": 0.9040936231613159, |
| "rewards/format_reward": 1.0, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.42317962646484, |
| "epoch": 8.708333333333334, |
| "grad_norm": 3.09323894478763, |
| "kl": 0.045166015625, |
| "learning_rate": 1.4583333333333335e-07, |
| "loss": 0.0019, |
| "reward": 1.9278287887573242, |
| "reward_std": 0.01652323268353939, |
| "rewards/accuracy_reward": 0.9278289675712585, |
| "rewards/format_reward": 1.0, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 111.14714050292969, |
| "epoch": 8.729166666666666, |
| "grad_norm": 8.95075477806658, |
| "kl": 0.046875, |
| "learning_rate": 1.4374999999999997e-07, |
| "loss": 0.002, |
| "reward": 1.9176443815231323, |
| "reward_std": 0.020662346854805946, |
| "rewards/accuracy_reward": 0.9202485084533691, |
| "rewards/format_reward": 0.9973958730697632, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.69140625, |
| "epoch": 8.75, |
| "grad_norm": 14.290195535948639, |
| "kl": 0.04638671875, |
| "learning_rate": 1.4166666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.9256818294525146, |
| "reward_std": 0.01680697686970234, |
| "rewards/accuracy_reward": 0.9256815910339355, |
| "rewards/format_reward": 1.0, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.74089050292969, |
| "epoch": 8.770833333333334, |
| "grad_norm": 3.382470297102888, |
| "kl": 0.04833984375, |
| "learning_rate": 1.3958333333333333e-07, |
| "loss": 0.002, |
| "reward": 1.9325473308563232, |
| "reward_std": 0.015486609190702438, |
| "rewards/accuracy_reward": 0.9325472116470337, |
| "rewards/format_reward": 1.0, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.37890625, |
| "epoch": 8.791666666666666, |
| "grad_norm": 1.5498505485516252, |
| "kl": 0.04638671875, |
| "learning_rate": 1.375e-07, |
| "loss": 0.002, |
| "reward": 1.9133939743041992, |
| "reward_std": 0.019088715314865112, |
| "rewards/accuracy_reward": 0.9133939743041992, |
| "rewards/format_reward": 1.0, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.13151550292969, |
| "epoch": 8.8125, |
| "grad_norm": 2.1982092482674496, |
| "kl": 0.047119140625, |
| "learning_rate": 1.3541666666666666e-07, |
| "loss": 0.0019, |
| "reward": 1.9054036140441895, |
| "reward_std": 0.023469921201467514, |
| "rewards/accuracy_reward": 0.9067057371139526, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.67578125, |
| "epoch": 8.833333333333334, |
| "grad_norm": 2.5337968426277158, |
| "kl": 0.052490234375, |
| "learning_rate": 1.3333333333333334e-07, |
| "loss": 0.0022, |
| "reward": 1.9113550186157227, |
| "reward_std": 0.017122842371463776, |
| "rewards/accuracy_reward": 0.9113550186157227, |
| "rewards/format_reward": 1.0, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.7421875, |
| "epoch": 8.854166666666666, |
| "grad_norm": 2.957658114504722, |
| "kl": 0.049072265625, |
| "learning_rate": 1.3125e-07, |
| "loss": 0.002, |
| "reward": 1.9356334209442139, |
| "reward_std": 0.01688789203763008, |
| "rewards/accuracy_reward": 0.9356333613395691, |
| "rewards/format_reward": 1.0, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.45442962646484, |
| "epoch": 8.875, |
| "grad_norm": 3.414030918508783, |
| "kl": 0.048095703125, |
| "learning_rate": 1.2916666666666667e-07, |
| "loss": 0.002, |
| "reward": 1.9134725332260132, |
| "reward_std": 0.01754312589764595, |
| "rewards/accuracy_reward": 0.9134725332260132, |
| "rewards/format_reward": 1.0, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.20833587646484, |
| "epoch": 8.895833333333334, |
| "grad_norm": 3.3858814535237918, |
| "kl": 0.04931640625, |
| "learning_rate": 1.2708333333333332e-07, |
| "loss": 0.002, |
| "reward": 1.8799127340316772, |
| "reward_std": 0.0196706410497427, |
| "rewards/accuracy_reward": 0.8799127340316772, |
| "rewards/format_reward": 1.0, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.65885925292969, |
| "epoch": 8.916666666666666, |
| "grad_norm": 4.177315577481865, |
| "kl": 0.048583984375, |
| "learning_rate": 1.25e-07, |
| "loss": 0.002, |
| "reward": 1.9059925079345703, |
| "reward_std": 0.017467858269810677, |
| "rewards/accuracy_reward": 0.9059926867485046, |
| "rewards/format_reward": 1.0, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.44010925292969, |
| "epoch": 8.9375, |
| "grad_norm": 1.8879259079304014, |
| "kl": 0.045654296875, |
| "learning_rate": 1.2291666666666665e-07, |
| "loss": 0.0019, |
| "reward": 1.9416245222091675, |
| "reward_std": 0.014588016085326672, |
| "rewards/accuracy_reward": 0.9416245818138123, |
| "rewards/format_reward": 1.0, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.62370300292969, |
| "epoch": 8.958333333333334, |
| "grad_norm": 2.993225929254523, |
| "kl": 0.053955078125, |
| "learning_rate": 1.2083333333333332e-07, |
| "loss": 0.0023, |
| "reward": 1.9129104614257812, |
| "reward_std": 0.018522052094340324, |
| "rewards/accuracy_reward": 0.9129105806350708, |
| "rewards/format_reward": 1.0, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.63018035888672, |
| "epoch": 8.979166666666666, |
| "grad_norm": 1.868194038764531, |
| "kl": 0.05908203125, |
| "learning_rate": 1.1874999999999999e-07, |
| "loss": 0.0024, |
| "reward": 1.8931113481521606, |
| "reward_std": 0.019662605598568916, |
| "rewards/accuracy_reward": 0.8931112885475159, |
| "rewards/format_reward": 1.0, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.67708587646484, |
| "epoch": 9.020833333333334, |
| "grad_norm": 4.908308927568047, |
| "kl": 0.053955078125, |
| "learning_rate": 1.1666666666666667e-07, |
| "loss": 0.0022, |
| "reward": 1.9184527397155762, |
| "reward_std": 0.022821567952632904, |
| "rewards/accuracy_reward": 0.9197548627853394, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.43229675292969, |
| "epoch": 9.041666666666666, |
| "grad_norm": 2.0304888444214884, |
| "kl": 0.047119140625, |
| "learning_rate": 1.1458333333333332e-07, |
| "loss": 0.002, |
| "reward": 1.9061381816864014, |
| "reward_std": 0.017391815781593323, |
| "rewards/accuracy_reward": 0.9061381220817566, |
| "rewards/format_reward": 1.0, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.78385925292969, |
| "epoch": 9.0625, |
| "grad_norm": 4.941122982616481, |
| "kl": 0.05029296875, |
| "learning_rate": 1.125e-07, |
| "loss": 0.0021, |
| "reward": 1.9194796085357666, |
| "reward_std": 0.01858523301780224, |
| "rewards/accuracy_reward": 0.9194795489311218, |
| "rewards/format_reward": 1.0, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.11198425292969, |
| "epoch": 9.083333333333334, |
| "grad_norm": 1.5041501011347416, |
| "kl": 0.055419921875, |
| "learning_rate": 1.1041666666666666e-07, |
| "loss": 0.0023, |
| "reward": 1.926844596862793, |
| "reward_std": 0.017708610743284225, |
| "rewards/accuracy_reward": 0.926844596862793, |
| "rewards/format_reward": 1.0, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.85546875, |
| "epoch": 9.104166666666666, |
| "grad_norm": 5.314885280931172, |
| "kl": 0.0498046875, |
| "learning_rate": 1.0833333333333334e-07, |
| "loss": 0.0021, |
| "reward": 1.9273128509521484, |
| "reward_std": 0.01629455015063286, |
| "rewards/accuracy_reward": 0.927312970161438, |
| "rewards/format_reward": 1.0, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.00651550292969, |
| "epoch": 9.125, |
| "grad_norm": 1.6763167113014938, |
| "kl": 0.059326171875, |
| "learning_rate": 1.0624999999999999e-07, |
| "loss": 0.0024, |
| "reward": 1.9027307033538818, |
| "reward_std": 0.023923706263303757, |
| "rewards/accuracy_reward": 0.9040327072143555, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.30729675292969, |
| "epoch": 9.145833333333334, |
| "grad_norm": 2.331935816678969, |
| "kl": 0.046875, |
| "learning_rate": 1.0416666666666667e-07, |
| "loss": 0.002, |
| "reward": 1.914282202720642, |
| "reward_std": 0.01946648769080639, |
| "rewards/accuracy_reward": 0.9142822027206421, |
| "rewards/format_reward": 1.0, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.20833587646484, |
| "epoch": 9.166666666666666, |
| "grad_norm": 2.7235761953830515, |
| "kl": 0.046875, |
| "learning_rate": 1.0208333333333333e-07, |
| "loss": 0.0019, |
| "reward": 1.9132049083709717, |
| "reward_std": 0.018830081447958946, |
| "rewards/accuracy_reward": 0.9132048487663269, |
| "rewards/format_reward": 1.0, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.61328125, |
| "epoch": 9.1875, |
| "grad_norm": 5.239801858902832, |
| "kl": 0.045166015625, |
| "learning_rate": 1e-07, |
| "loss": 0.0019, |
| "reward": 1.9167590141296387, |
| "reward_std": 0.01652991585433483, |
| "rewards/accuracy_reward": 0.9167590141296387, |
| "rewards/format_reward": 1.0, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.30989837646484, |
| "epoch": 9.208333333333334, |
| "grad_norm": 2.0958692476606453, |
| "kl": 0.047607421875, |
| "learning_rate": 9.791666666666666e-08, |
| "loss": 0.002, |
| "reward": 1.9000282287597656, |
| "reward_std": 0.017917610704898834, |
| "rewards/accuracy_reward": 0.9000282287597656, |
| "rewards/format_reward": 1.0, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.13021087646484, |
| "epoch": 9.229166666666666, |
| "grad_norm": 3.688730357890573, |
| "kl": 0.047119140625, |
| "learning_rate": 9.583333333333334e-08, |
| "loss": 0.002, |
| "reward": 1.915367841720581, |
| "reward_std": 0.01575218327343464, |
| "rewards/accuracy_reward": 0.915367841720581, |
| "rewards/format_reward": 1.0, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.29817962646484, |
| "epoch": 9.25, |
| "grad_norm": 4.777726769533225, |
| "kl": 0.0498046875, |
| "learning_rate": 9.375e-08, |
| "loss": 0.002, |
| "reward": 1.9215753078460693, |
| "reward_std": 0.018969135358929634, |
| "rewards/accuracy_reward": 0.9215752482414246, |
| "rewards/format_reward": 1.0, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.67578125, |
| "epoch": 9.270833333333334, |
| "grad_norm": 3.166832895506651, |
| "kl": 0.1455078125, |
| "learning_rate": 9.166666666666665e-08, |
| "loss": 0.0059, |
| "reward": 1.8940961360931396, |
| "reward_std": 0.022732451558113098, |
| "rewards/accuracy_reward": 0.8940961360931396, |
| "rewards/format_reward": 1.0, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.04427337646484, |
| "epoch": 9.291666666666666, |
| "grad_norm": 7.685144828516677, |
| "kl": 0.053466796875, |
| "learning_rate": 8.958333333333333e-08, |
| "loss": 0.0022, |
| "reward": 1.9240249395370483, |
| "reward_std": 0.020782217383384705, |
| "rewards/accuracy_reward": 0.9253270030021667, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.99609375, |
| "epoch": 9.3125, |
| "grad_norm": 2.378060228855226, |
| "kl": 0.047119140625, |
| "learning_rate": 8.75e-08, |
| "loss": 0.0019, |
| "reward": 1.9038584232330322, |
| "reward_std": 0.018619615584611893, |
| "rewards/accuracy_reward": 0.9038585424423218, |
| "rewards/format_reward": 1.0, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.53515625, |
| "epoch": 9.333333333333334, |
| "grad_norm": 1.7692084618793151, |
| "kl": 0.04150390625, |
| "learning_rate": 8.541666666666666e-08, |
| "loss": 0.0018, |
| "reward": 1.9091638326644897, |
| "reward_std": 0.022435273975133896, |
| "rewards/accuracy_reward": 0.9104660749435425, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.88542175292969, |
| "epoch": 9.354166666666666, |
| "grad_norm": 1.9024254121008084, |
| "kl": 0.05615234375, |
| "learning_rate": 8.333333333333333e-08, |
| "loss": 0.0023, |
| "reward": 1.9238141775131226, |
| "reward_std": 0.022633202373981476, |
| "rewards/accuracy_reward": 0.9251161813735962, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.61589050292969, |
| "epoch": 9.375, |
| "grad_norm": 1.6956193954619285, |
| "kl": 0.047119140625, |
| "learning_rate": 8.125e-08, |
| "loss": 0.002, |
| "reward": 1.9281830787658691, |
| "reward_std": 0.016372021287679672, |
| "rewards/accuracy_reward": 0.9281830787658691, |
| "rewards/format_reward": 1.0, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.97786712646484, |
| "epoch": 9.395833333333334, |
| "grad_norm": 1.7595623551426245, |
| "kl": 0.046142578125, |
| "learning_rate": 7.916666666666665e-08, |
| "loss": 0.0019, |
| "reward": 1.8898437023162842, |
| "reward_std": 0.020021602511405945, |
| "rewards/accuracy_reward": 0.8898436427116394, |
| "rewards/format_reward": 1.0, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.08464050292969, |
| "epoch": 9.416666666666666, |
| "grad_norm": 2.5345231096237626, |
| "kl": 0.045654296875, |
| "learning_rate": 7.708333333333333e-08, |
| "loss": 0.0019, |
| "reward": 1.9146665334701538, |
| "reward_std": 0.018744416534900665, |
| "rewards/accuracy_reward": 0.9146665334701538, |
| "rewards/format_reward": 1.0, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.6875, |
| "epoch": 9.4375, |
| "grad_norm": 3.953395223073277, |
| "kl": 0.05224609375, |
| "learning_rate": 7.5e-08, |
| "loss": 0.0022, |
| "reward": 1.901958703994751, |
| "reward_std": 0.02426784299314022, |
| "rewards/accuracy_reward": 0.9032607078552246, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.35026550292969, |
| "epoch": 9.458333333333334, |
| "grad_norm": 2.9548245945269294, |
| "kl": 0.046875, |
| "learning_rate": 7.291666666666667e-08, |
| "loss": 0.0019, |
| "reward": 1.9318149089813232, |
| "reward_std": 0.01830216310918331, |
| "rewards/accuracy_reward": 0.9318150281906128, |
| "rewards/format_reward": 1.0, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.83203125, |
| "epoch": 9.479166666666666, |
| "grad_norm": 2.485294511659845, |
| "kl": 0.050048828125, |
| "learning_rate": 7.083333333333333e-08, |
| "loss": 0.0021, |
| "reward": 1.94561767578125, |
| "reward_std": 0.014377261511981487, |
| "rewards/accuracy_reward": 0.9456178545951843, |
| "rewards/format_reward": 1.0, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.19921875, |
| "epoch": 9.5, |
| "grad_norm": 3.3111964897684425, |
| "kl": 0.072265625, |
| "learning_rate": 6.875e-08, |
| "loss": 0.003, |
| "reward": 1.932363510131836, |
| "reward_std": 0.02322392538189888, |
| "rewards/accuracy_reward": 0.9336656332015991, |
| "rewards/format_reward": 0.9986979365348816, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.52214050292969, |
| "epoch": 9.520833333333334, |
| "grad_norm": 1.8991735989887744, |
| "kl": 0.04541015625, |
| "learning_rate": 6.666666666666667e-08, |
| "loss": 0.002, |
| "reward": 1.928739070892334, |
| "reward_std": 0.017306815832853317, |
| "rewards/accuracy_reward": 0.928739070892334, |
| "rewards/format_reward": 1.0, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.90104675292969, |
| "epoch": 9.541666666666666, |
| "grad_norm": 2.0677038589369747, |
| "kl": 0.0478515625, |
| "learning_rate": 6.458333333333333e-08, |
| "loss": 0.002, |
| "reward": 1.9004323482513428, |
| "reward_std": 0.018483035266399384, |
| "rewards/accuracy_reward": 0.900432288646698, |
| "rewards/format_reward": 1.0, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.74479675292969, |
| "epoch": 9.5625, |
| "grad_norm": 2.4393059130616748, |
| "kl": 0.04736328125, |
| "learning_rate": 6.25e-08, |
| "loss": 0.002, |
| "reward": 1.9109472036361694, |
| "reward_std": 0.01986522786319256, |
| "rewards/accuracy_reward": 0.9109472036361694, |
| "rewards/format_reward": 1.0, |
| "step": 450 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 480, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|