diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,38034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0002, + "grad_norm": 1.0032002925872803, + "kl": 0.0004780956369359046, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 11536.0, + "reward": 0.79803466796875, + "reward_std": 0.024305492639541626, + "rewards//mean": 0.79803466796875, + "rewards//std": 0.02694382146000862, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0004, + "grad_norm": 1.2452327013015747, + "kl": 0.000407114301196998, + "learning_rate": 2e-08, + "loss": -0.0006, + "num_tokens": 23054.0, + "reward": 0.77716064453125, + "reward_std": 0.026728997007012367, + "rewards//mean": 0.77716064453125, + "rewards//std": 0.02854783833026886, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0006, + "grad_norm": 1.1469886302947998, + "kl": 0.00040853865357348695, + "learning_rate": 4e-08, + "loss": 0.0007, + "num_tokens": 34663.0, + "reward": 0.79791259765625, + "reward_std": 0.023661676794290543, + "rewards//mean": 0.79791259765625, + "rewards//std": 0.02678661234676838, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0008, + "grad_norm": 1.0362390279769897, + "kl": 0.0004155077913310379, + "learning_rate": 6e-08, + "loss": 0.0, + "num_tokens": 46319.0, + "reward": 0.76654052734375, + "reward_std": 0.026976488530635834, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.02831196039915085, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.001, + "grad_norm": 1.2705215215682983, + "kl": 0.00042918400140479207, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 57951.0, + "reward": 0.7061767578125, + "reward_std": 0.03839603438973427, + "rewards//mean": 0.7061767578125, + "rewards//std": 0.04136603698134422, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0012, + "grad_norm": 1.0415380001068115, + "kl": 0.0004256443935446441, + "learning_rate": 1e-07, + "loss": 0.0, + "num_tokens": 69567.0, + "reward": 0.78363037109375, + "reward_std": 0.023982666432857513, + "rewards//mean": 0.78363037109375, + "rewards//std": 0.028962144628167152, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0014, + "grad_norm": 0.9738340973854065, + "kl": 0.00037688305019401014, + "learning_rate": 1.2e-07, + "loss": 0.0, + "num_tokens": 81215.0, + "reward": 0.7799072265625, + "reward_std": 0.02770727500319481, + "rewards//mean": 0.7799072265625, + "rewards//std": 0.028950447216629982, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0016, + "grad_norm": 0.9812470078468323, + "kl": 0.0003702049871208146, + "learning_rate": 1.4e-07, + "loss": 0.0021, + "num_tokens": 92787.0, + "reward": 0.7767333984375, + "reward_std": 0.017952006310224533, + "rewards//mean": 0.7767333984375, + "rewards//std": 0.021978598088026047, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0018, + "grad_norm": 1.1836854219436646, + "kl": 0.0004490012943278998, + "learning_rate": 1.6e-07, + "loss": 0.0, + "num_tokens": 104395.0, + "reward": 0.7958984375, + "reward_std": 0.02408256009221077, + "rewards//mean": 0.7958984375, + "rewards//std": 0.025209931656718254, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.002, + "grad_norm": 1.0419762134552002, + "kl": 0.000443739234469831, + "learning_rate": 1.8e-07, + "loss": 0.0, + "num_tokens": 115971.0, + "reward": 0.7962646484375, + "reward_std": 0.01983659714460373, + "rewards//mean": 0.7962646484375, + "rewards//std": 0.022458242252469063, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0022, + "grad_norm": 1.0875407457351685, + "kl": 0.0004412107227835804, + "learning_rate": 2e-07, + "loss": 0.0, + "num_tokens": 127723.0, + "reward": 0.775390625, + "reward_std": 0.026773778721690178, + "rewards//mean": 0.775390625, + "rewards//std": 0.028628919273614883, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0024, + "grad_norm": 1.026057243347168, + "kl": 0.0004478457267396152, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "num_tokens": 139307.0, + "reward": 0.7791748046875, + "reward_std": 0.021914469078183174, + "rewards//mean": 0.7791748046875, + "rewards//std": 0.024606755003333092, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0026, + "grad_norm": 1.0839637517929077, + "kl": 0.00041695138861541636, + "learning_rate": 2.4e-07, + "loss": -0.0004, + "num_tokens": 150823.0, + "reward": 0.76849365234375, + "reward_std": 0.027295563369989395, + "rewards//mean": 0.76849365234375, + "rewards//std": 0.029316440224647522, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0028, + "grad_norm": 1.1106576919555664, + "kl": 0.00044290057121543214, + "learning_rate": 2.6e-07, + "loss": -0.0003, + "num_tokens": 162381.0, + "reward": 0.76885986328125, + "reward_std": 0.022853918373584747, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.02404506504535675, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.003, + "grad_norm": 1.0103557109832764, + "kl": 0.0003988853713963181, + "learning_rate": 2.8e-07, + "loss": 0.0, + "num_tokens": 174029.0, + "reward": 0.7911376953125, + "reward_std": 0.027279861271381378, + "rewards//mean": 0.7911376953125, + "rewards//std": 0.03135807812213898, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0032, + "grad_norm": 1.001014232635498, + "kl": 0.0004324003530200571, + "learning_rate": 3e-07, + "loss": 0.0, + "num_tokens": 185573.0, + "reward": 0.75823974609375, + "reward_std": 0.028175704181194305, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.030687816441059113, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0034, + "grad_norm": 1.0996702909469604, + "kl": 0.0004953897150699049, + "learning_rate": 3.2e-07, + "loss": 0.0, + "num_tokens": 197157.0, + "reward": 0.7860107421875, + "reward_std": 0.027306145057082176, + "rewards//mean": 0.7860107421875, + "rewards//std": 0.030879411846399307, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0036, + "grad_norm": 1.091248869895935, + "kl": 0.0003930546226911247, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "num_tokens": 208741.0, + "reward": 0.79180908203125, + "reward_std": 0.029533306136727333, + "rewards//mean": 0.79180908203125, + "rewards//std": 0.032516200095415115, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0038, + "grad_norm": 1.0470415353775024, + "kl": 0.0004306385926611256, + "learning_rate": 3.6e-07, + "loss": -0.0074, + "num_tokens": 220298.0, + "reward": 0.79376220703125, + "reward_std": 0.024944156408309937, + "rewards//mean": 0.79376220703125, + "rewards//std": 0.028511757031083107, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.004, + "grad_norm": 1.0079525709152222, + "kl": 0.0004520138609223068, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "num_tokens": 231802.0, + "reward": 0.79541015625, + "reward_std": 0.017426297068595886, + "rewards//mean": 0.79541015625, + "rewards//std": 0.022266332060098648, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0042, + "grad_norm": 0.9105224609375, + "kl": 0.00041493010940030217, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 243354.0, + "reward": 0.75286865234375, + "reward_std": 0.022243333980441093, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.02703860215842724, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0044, + "grad_norm": 1.0261293649673462, + "kl": 0.0004729063075501472, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "num_tokens": 254890.0, + "reward": 0.790771484375, + "reward_std": 0.02005748450756073, + "rewards//mean": 0.790771484375, + "rewards//std": 0.023270267993211746, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0046, + "grad_norm": 0.9665905833244324, + "kl": 0.00039523342638858594, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0039, + "num_tokens": 266517.0, + "reward": 0.7850341796875, + "reward_std": 0.02490047924220562, + "rewards//mean": 0.7850341796875, + "rewards//std": 0.02690144069492817, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0048, + "grad_norm": 1.0953789949417114, + "kl": 0.0004080493854417, + "learning_rate": 4.6e-07, + "loss": -0.0029, + "num_tokens": 278065.0, + "reward": 0.800537109375, + "reward_std": 0.026891034096479416, + "rewards//mean": 0.800537109375, + "rewards//std": 0.028355836868286133, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.005, + "grad_norm": 1.0074764490127563, + "kl": 0.0004477243637666106, + "learning_rate": 4.8e-07, + "loss": 0.0, + "num_tokens": 289585.0, + "reward": 0.77471923828125, + "reward_std": 0.030465159565210342, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.034786101430654526, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0052, + "grad_norm": 1.0062299966812134, + "kl": 0.0005353202286642045, + "learning_rate": 5e-07, + "loss": 0.0001, + "num_tokens": 301193.0, + "reward": 0.7786865234375, + "reward_std": 0.02002078853547573, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.02277948334813118, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0054, + "grad_norm": 1.0474389791488647, + "kl": 0.00039401414687745273, + "learning_rate": 5.2e-07, + "loss": -0.0068, + "num_tokens": 312651.0, + "reward": 0.79034423828125, + "reward_std": 0.019780851900577545, + "rewards//mean": 0.79034423828125, + "rewards//std": 0.024978820234537125, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0056, + "grad_norm": 1.0718964338302612, + "kl": 0.00045608119035023265, + "learning_rate": 5.4e-07, + "loss": 0.0092, + "num_tokens": 324316.0, + "reward": 0.799560546875, + "reward_std": 0.02452939935028553, + "rewards//mean": 0.799560546875, + "rewards//std": 0.027585163712501526, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0058, + "grad_norm": 0.9650251865386963, + "kl": 0.0004629575414583087, + "learning_rate": 5.6e-07, + "loss": 0.0, + "num_tokens": 335948.0, + "reward": 0.7984619140625, + "reward_std": 0.025783387944102287, + "rewards//mean": 0.7984619140625, + "rewards//std": 0.027785014361143112, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.006, + "grad_norm": 0.9607970714569092, + "kl": 0.0004378036319394596, + "learning_rate": 5.8e-07, + "loss": -0.0016, + "num_tokens": 347536.0, + "reward": 0.7843017578125, + "reward_std": 0.026795413345098495, + "rewards//mean": 0.7843017578125, + "rewards//std": 0.03300260379910469, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0062, + "grad_norm": 1.0197336673736572, + "kl": 0.0004547602729871869, + "learning_rate": 6e-07, + "loss": 0.0, + "num_tokens": 359024.0, + "reward": 0.77569580078125, + "reward_std": 0.021471459418535233, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.02761087566614151, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0064, + "grad_norm": 1.0109747648239136, + "kl": 0.0004543978502624668, + "learning_rate": 6.2e-07, + "loss": 0.0013, + "num_tokens": 370557.0, + "reward": 0.817626953125, + "reward_std": 0.019293474033474922, + "rewards//mean": 0.817626953125, + "rewards//std": 0.030693797394633293, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0066, + "grad_norm": 1.064644455909729, + "kl": 0.0004356145436759107, + "learning_rate": 6.4e-07, + "loss": -0.0189, + "num_tokens": 382179.0, + "reward": 0.77215576171875, + "reward_std": 0.023840943351387978, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.028203211724758148, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0068, + "grad_norm": 0.9957301020622253, + "kl": 0.00042641197796911, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 393747.0, + "reward": 0.782958984375, + "reward_std": 0.023168984800577164, + "rewards//mean": 0.782958984375, + "rewards//std": 0.027567598968744278, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.007, + "grad_norm": 0.9983273148536682, + "kl": 0.0003891971427947283, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "num_tokens": 405419.0, + "reward": 0.80389404296875, + "reward_std": 0.017665725201368332, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.026622792705893517, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0072, + "grad_norm": 1.0639199018478394, + "kl": 0.0004166060534771532, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 416963.0, + "reward": 0.79071044921875, + "reward_std": 0.026995990425348282, + "rewards//mean": 0.79071044921875, + "rewards//std": 0.02771594002842903, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0074, + "grad_norm": 1.1130720376968384, + "kl": 0.00047251302748918533, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 428603.0, + "reward": 0.7882080078125, + "reward_std": 0.027695732191205025, + "rewards//mean": 0.7882080078125, + "rewards//std": 0.032237835228443146, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0076, + "grad_norm": 1.0068720579147339, + "kl": 0.000439065886894241, + "learning_rate": 7.4e-07, + "loss": 0.0, + "num_tokens": 440219.0, + "reward": 0.7789306640625, + "reward_std": 0.024551186710596085, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.028937894850969315, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0078, + "grad_norm": 1.0283150672912598, + "kl": 0.00044043277739547193, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "num_tokens": 451875.0, + "reward": 0.81707763671875, + "reward_std": 0.02156119793653488, + "rewards//mean": 0.81707763671875, + "rewards//std": 0.022963067516684532, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.008, + "grad_norm": 1.1717981100082397, + "kl": 0.00041474829777143896, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "num_tokens": 463451.0, + "reward": 0.77880859375, + "reward_std": 0.02803177759051323, + "rewards//mean": 0.77880859375, + "rewards//std": 0.03091101534664631, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0082, + "grad_norm": 1.1008245944976807, + "kl": 0.00043035196722485125, + "learning_rate": 8e-07, + "loss": 0.0, + "num_tokens": 475043.0, + "reward": 0.7664794921875, + "reward_std": 0.03066561371088028, + "rewards//mean": 0.7664794921875, + "rewards//std": 0.038528524339199066, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0084, + "grad_norm": 0.8630216121673584, + "kl": 0.00038830412086099386, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "num_tokens": 486603.0, + "reward": 0.77850341796875, + "reward_std": 0.019993852823972702, + "rewards//mean": 0.77850341796875, + "rewards//std": 0.02410668134689331, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0086, + "grad_norm": 1.26260507106781, + "kl": 0.00043443931281217374, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0053, + "num_tokens": 498131.0, + "reward": 0.76751708984375, + "reward_std": 0.02363547310233116, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.030883517116308212, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0088, + "grad_norm": 0.9733297824859619, + "kl": 0.00041065775440074503, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0131, + "num_tokens": 509759.0, + "reward": 0.7841796875, + "reward_std": 0.02677779272198677, + "rewards//mean": 0.7841796875, + "rewards//std": 0.029388703405857086, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.009, + "grad_norm": 1.031009554862976, + "kl": 0.0003811084316112101, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 521351.0, + "reward": 0.8021240234375, + "reward_std": 0.0262990090996027, + "rewards//mean": 0.8021240234375, + "rewards//std": 0.028647683560848236, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0092, + "grad_norm": 1.061417818069458, + "kl": 0.0004516712506301701, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 532967.0, + "reward": 0.79156494140625, + "reward_std": 0.026183973997831345, + "rewards//mean": 0.79156494140625, + "rewards//std": 0.030276626348495483, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0094, + "grad_norm": 0.9763254523277283, + "kl": 0.00044392107520252466, + "learning_rate": 9.2e-07, + "loss": 0.0, + "num_tokens": 544543.0, + "reward": 0.78955078125, + "reward_std": 0.03148472309112549, + "rewards//mean": 0.78955078125, + "rewards//std": 0.036594267934560776, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0096, + "grad_norm": 1.1541022062301636, + "kl": 0.0004645702720154077, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "num_tokens": 556151.0, + "reward": 0.77117919921875, + "reward_std": 0.022975638508796692, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.026418449357151985, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0098, + "grad_norm": 0.9314018487930298, + "kl": 0.0004480852803681046, + "learning_rate": 9.6e-07, + "loss": 0.0, + "num_tokens": 567655.0, + "reward": 0.7828369140625, + "reward_std": 0.02631896734237671, + "rewards//mean": 0.7828369140625, + "rewards//std": 0.03058980591595173, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.01, + "grad_norm": 1.03190279006958, + "kl": 0.0004517025809036568, + "learning_rate": 9.8e-07, + "loss": -0.0004, + "num_tokens": 579181.0, + "reward": 0.77392578125, + "reward_std": 0.02523089200258255, + "rewards//mean": 0.77392578125, + "rewards//std": 0.03195888176560402, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0102, + "grad_norm": 1.275802731513977, + "kl": 0.0004933062882628292, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 590773.0, + "reward": 0.81719970703125, + "reward_std": 0.025812089443206787, + "rewards//mean": 0.81719970703125, + "rewards//std": 0.031197095289826393, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0104, + "grad_norm": 1.073560357093811, + "kl": 0.0004377171862870455, + "learning_rate": 9.999998993000298e-07, + "loss": 0.0009, + "num_tokens": 602330.0, + "reward": 0.7783203125, + "reward_std": 0.021733129397034645, + "rewards//mean": 0.7783203125, + "rewards//std": 0.02302042953670025, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0106, + "grad_norm": 1.1954807043075562, + "kl": 0.00047752540558576584, + "learning_rate": 9.999995972001601e-07, + "loss": 0.0, + "num_tokens": 613826.0, + "reward": 0.81591796875, + "reward_std": 0.023050827905535698, + "rewards//mean": 0.81591796875, + "rewards//std": 0.03379330039024353, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0108, + "grad_norm": 1.0675877332687378, + "kl": 0.0005111820064485073, + "learning_rate": 9.999990937005123e-07, + "loss": 0.0001, + "num_tokens": 625290.0, + "reward": 0.772705078125, + "reward_std": 0.02172166109085083, + "rewards//mean": 0.772705078125, + "rewards//std": 0.025900613516569138, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.011, + "grad_norm": 1.1736376285552979, + "kl": 0.00045154252438806, + "learning_rate": 9.999983888012896e-07, + "loss": 0.0, + "num_tokens": 636898.0, + "reward": 0.7733154296875, + "reward_std": 0.02385832369327545, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.029198279604315758, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0112, + "grad_norm": 0.9508896470069885, + "kl": 0.0005138024353072979, + "learning_rate": 9.999974825027754e-07, + "loss": -0.0002, + "num_tokens": 648472.0, + "reward": 0.777587890625, + "reward_std": 0.023099493235349655, + "rewards//mean": 0.777587890625, + "rewards//std": 0.029189204797148705, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0114, + "grad_norm": 1.0896598100662231, + "kl": 0.0005863557453267276, + "learning_rate": 9.999963748053354e-07, + "loss": 0.0001, + "num_tokens": 660048.0, + "reward": 0.7994384765625, + "reward_std": 0.02097148261964321, + "rewards//mean": 0.7994384765625, + "rewards//std": 0.026062294840812683, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0116, + "grad_norm": 0.9406888484954834, + "kl": 0.0005287651438266039, + "learning_rate": 9.99995065709415e-07, + "loss": -0.0014, + "num_tokens": 671593.0, + "reward": 0.77886962890625, + "reward_std": 0.026260022073984146, + "rewards//mean": 0.77886962890625, + "rewards//std": 0.028861617669463158, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0118, + "grad_norm": 0.9371205568313599, + "kl": 0.000455894332844764, + "learning_rate": 9.999935552155421e-07, + "loss": 0.0, + "num_tokens": 683185.0, + "reward": 0.79461669921875, + "reward_std": 0.02584458701312542, + "rewards//mean": 0.79461669921875, + "rewards//std": 0.027898849919438362, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.012, + "grad_norm": 1.0464869737625122, + "kl": 0.00044593046186491847, + "learning_rate": 9.99991843324325e-07, + "loss": 0.0, + "num_tokens": 694785.0, + "reward": 0.80535888671875, + "reward_std": 0.02092009410262108, + "rewards//mean": 0.80535888671875, + "rewards//std": 0.025539757683873177, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0122, + "grad_norm": 1.0295146703720093, + "kl": 0.00046380768981180154, + "learning_rate": 9.999899300364532e-07, + "loss": -0.0012, + "num_tokens": 706294.0, + "reward": 0.74822998046875, + "reward_std": 0.027481555938720703, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.03254784271121025, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.0124, + "grad_norm": 1.0317684412002563, + "kl": 0.0005455511709442362, + "learning_rate": 9.999878153526972e-07, + "loss": -0.0048, + "num_tokens": 717798.0, + "reward": 0.78125, + "reward_std": 0.025627389550209045, + "rewards//mean": 0.78125, + "rewards//std": 0.02726835571229458, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0126, + "grad_norm": 1.0729941129684448, + "kl": 0.00045381151721812785, + "learning_rate": 9.999854992739093e-07, + "loss": 0.0, + "num_tokens": 729366.0, + "reward": 0.7891845703125, + "reward_std": 0.02221279963850975, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.024851610884070396, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0128, + "grad_norm": 0.967627763748169, + "kl": 0.0005410933517850935, + "learning_rate": 9.999829818010219e-07, + "loss": 0.0001, + "num_tokens": 740974.0, + "reward": 0.77410888671875, + "reward_std": 0.018198326230049133, + "rewards//mean": 0.77410888671875, + "rewards//std": 0.02245108038187027, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.013, + "grad_norm": 0.8912996053695679, + "kl": 0.000532777929038275, + "learning_rate": 9.999802629350491e-07, + "loss": -0.001, + "num_tokens": 752547.0, + "reward": 0.7686767578125, + "reward_std": 0.024927247315645218, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.027049588039517403, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0132, + "grad_norm": 0.9547525644302368, + "kl": 0.000567901908652857, + "learning_rate": 9.999773426770863e-07, + "loss": 0.0001, + "num_tokens": 764083.0, + "reward": 0.7891845703125, + "reward_std": 0.02082321234047413, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.03106318786740303, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0134, + "grad_norm": 1.055927038192749, + "kl": 0.0005380832008086145, + "learning_rate": 9.999742210283097e-07, + "loss": 0.0001, + "num_tokens": 775723.0, + "reward": 0.7799072265625, + "reward_std": 0.025382796302437782, + "rewards//mean": 0.7799072265625, + "rewards//std": 0.02799776755273342, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0136, + "grad_norm": 1.0073267221450806, + "kl": 0.000618477410171181, + "learning_rate": 9.999708979899767e-07, + "loss": 0.0001, + "num_tokens": 787299.0, + "reward": 0.74822998046875, + "reward_std": 0.023253660649061203, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.02575107291340828, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0138, + "grad_norm": 0.9209610223770142, + "kl": 0.0005787500922451727, + "learning_rate": 9.999673735634259e-07, + "loss": 0.0044, + "num_tokens": 798839.0, + "reward": 0.77374267578125, + "reward_std": 0.025014236569404602, + "rewards//mean": 0.77374267578125, + "rewards//std": 0.02990587055683136, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.014, + "grad_norm": 0.9837480187416077, + "kl": 0.0004812025581486523, + "learning_rate": 9.999636477500764e-07, + "loss": 0.0, + "num_tokens": 810423.0, + "reward": 0.7242431640625, + "reward_std": 0.027096208184957504, + "rewards//mean": 0.7242431640625, + "rewards//std": 0.02849084511399269, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0142, + "grad_norm": 1.054513931274414, + "kl": 0.0005800646031275392, + "learning_rate": 9.999597205514296e-07, + "loss": 0.0001, + "num_tokens": 821927.0, + "reward": 0.76715087890625, + "reward_std": 0.02407689392566681, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.028165077790617943, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0144, + "grad_norm": 1.0837604999542236, + "kl": 0.0006065164052415639, + "learning_rate": 9.999555919690672e-07, + "loss": -0.0049, + "num_tokens": 833361.0, + "reward": 0.7779541015625, + "reward_std": 0.027434196323156357, + "rewards//mean": 0.7779541015625, + "rewards//std": 0.03595810756087303, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0146, + "grad_norm": 1.000943899154663, + "kl": 0.0007239706174004823, + "learning_rate": 9.99951262004652e-07, + "loss": 0.0001, + "num_tokens": 844945.0, + "reward": 0.76153564453125, + "reward_std": 0.018486179411411285, + "rewards//mean": 0.76153564453125, + "rewards//std": 0.021388955414295197, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0148, + "grad_norm": 1.0534002780914307, + "kl": 0.0006256884953472763, + "learning_rate": 9.999467306599285e-07, + "loss": 0.0001, + "num_tokens": 856545.0, + "reward": 0.78863525390625, + "reward_std": 0.025323180481791496, + "rewards//mean": 0.78863525390625, + "rewards//std": 0.02810050919651985, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.015, + "grad_norm": 1.0482524633407593, + "kl": 0.0005512565912795253, + "learning_rate": 9.999419979367214e-07, + "loss": 0.0002, + "num_tokens": 868176.0, + "reward": 0.7774658203125, + "reward_std": 0.034943222999572754, + "rewards//mean": 0.7774658203125, + "rewards//std": 0.04027128219604492, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0152, + "grad_norm": 1.075704574584961, + "kl": 0.0005089160695206374, + "learning_rate": 9.999370638369376e-07, + "loss": 0.0001, + "num_tokens": 879872.0, + "reward": 0.7825927734375, + "reward_std": 0.026704978197813034, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.027780653908848763, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0154, + "grad_norm": 0.9593507051467896, + "kl": 0.0005675847933162004, + "learning_rate": 9.99931928362564e-07, + "loss": 0.0001, + "num_tokens": 891448.0, + "reward": 0.75299072265625, + "reward_std": 0.025164734572172165, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.028036870062351227, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0156, + "grad_norm": 1.0475672483444214, + "kl": 0.000680763041600585, + "learning_rate": 9.999265915156696e-07, + "loss": 0.0001, + "num_tokens": 902997.0, + "reward": 0.77685546875, + "reward_std": 0.022688014432787895, + "rewards//mean": 0.77685546875, + "rewards//std": 0.025501275435090065, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0158, + "grad_norm": 0.9479108452796936, + "kl": 0.0005459161475300789, + "learning_rate": 9.999210532984038e-07, + "loss": 0.0001, + "num_tokens": 914573.0, + "reward": 0.78399658203125, + "reward_std": 0.02836792543530464, + "rewards//mean": 0.78399658203125, + "rewards//std": 0.03024711273610592, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.016, + "grad_norm": 1.0425359010696411, + "kl": 0.000688684158376418, + "learning_rate": 9.999153137129977e-07, + "loss": 0.005, + "num_tokens": 926161.0, + "reward": 0.80126953125, + "reward_std": 0.02532920613884926, + "rewards//mean": 0.80126953125, + "rewards//std": 0.029483327642083168, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0162, + "grad_norm": 1.2687329053878784, + "kl": 0.0007346177808358334, + "learning_rate": 9.999093727617628e-07, + "loss": 0.0054, + "num_tokens": 937699.0, + "reward": 0.7745361328125, + "reward_std": 0.023173650726675987, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.026185141876339912, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0164, + "grad_norm": 1.044936180114746, + "kl": 0.0005363688615034334, + "learning_rate": 9.999032304470924e-07, + "loss": -0.0091, + "num_tokens": 949314.0, + "reward": 0.787109375, + "reward_std": 0.02587766945362091, + "rewards//mean": 0.787109375, + "rewards//std": 0.027638891711831093, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0166, + "grad_norm": 0.9152664542198181, + "kl": 0.0005534780793823302, + "learning_rate": 9.998968867714608e-07, + "loss": 0.0001, + "num_tokens": 960826.0, + "reward": 0.7813720703125, + "reward_std": 0.0286535806953907, + "rewards//mean": 0.7813720703125, + "rewards//std": 0.038304708898067474, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0168, + "grad_norm": 1.0190411806106567, + "kl": 0.000642721355688991, + "learning_rate": 9.998903417374226e-07, + "loss": -0.0028, + "num_tokens": 972484.0, + "reward": 0.79168701171875, + "reward_std": 0.021379197016358376, + "rewards//mean": 0.79168701171875, + "rewards//std": 0.024260664358735085, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.017, + "grad_norm": 1.0226621627807617, + "kl": 0.0006434479146264493, + "learning_rate": 9.998835953476147e-07, + "loss": 0.0001, + "num_tokens": 984108.0, + "reward": 0.78643798828125, + "reward_std": 0.025598857551813126, + "rewards//mean": 0.78643798828125, + "rewards//std": 0.03149745985865593, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0172, + "grad_norm": 1.0187103748321533, + "kl": 0.000668648979626596, + "learning_rate": 9.998766476047545e-07, + "loss": 0.0001, + "num_tokens": 995692.0, + "reward": 0.79339599609375, + "reward_std": 0.024962294846773148, + "rewards//mean": 0.79339599609375, + "rewards//std": 0.03137466683983803, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0174, + "grad_norm": 0.9565930962562561, + "kl": 0.0006780105759389699, + "learning_rate": 9.998694985116404e-07, + "loss": 0.0001, + "num_tokens": 1007396.0, + "reward": 0.8016357421875, + "reward_std": 0.031955696642398834, + "rewards//mean": 0.8016357421875, + "rewards//std": 0.03708713874220848, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0176, + "grad_norm": 1.1048840284347534, + "kl": 0.0006080983439460397, + "learning_rate": 9.99862148071152e-07, + "loss": 0.0001, + "num_tokens": 1018964.0, + "reward": 0.79400634765625, + "reward_std": 0.032646212726831436, + "rewards//mean": 0.79400634765625, + "rewards//std": 0.04059638828039169, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0178, + "grad_norm": 0.9914064407348633, + "kl": 0.0005614705441985279, + "learning_rate": 9.998545962862501e-07, + "loss": 0.0001, + "num_tokens": 1030508.0, + "reward": 0.77838134765625, + "reward_std": 0.024556931108236313, + "rewards//mean": 0.77838134765625, + "rewards//std": 0.029710352420806885, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.018, + "grad_norm": 1.0337227582931519, + "kl": 0.0008954341465141624, + "learning_rate": 9.998468431599767e-07, + "loss": 0.0001, + "num_tokens": 1042020.0, + "reward": 0.7742919921875, + "reward_std": 0.02597588673233986, + "rewards//mean": 0.7742919921875, + "rewards//std": 0.028060413897037506, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0182, + "grad_norm": 1.0370957851409912, + "kl": 0.0005623717734124511, + "learning_rate": 9.998388886954545e-07, + "loss": 0.0001, + "num_tokens": 1053572.0, + "reward": 0.76031494140625, + "reward_std": 0.017384860664606094, + "rewards//mean": 0.76031494140625, + "rewards//std": 0.02488471008837223, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0184, + "grad_norm": 1.1301162242889404, + "kl": 0.0005997827975079417, + "learning_rate": 9.998307328958877e-07, + "loss": 0.0004, + "num_tokens": 1065198.0, + "reward": 0.7950439453125, + "reward_std": 0.02988332509994507, + "rewards//mean": 0.7950439453125, + "rewards//std": 0.031550582498311996, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0186, + "grad_norm": 0.9675880074501038, + "kl": 0.0006305756978690624, + "learning_rate": 9.998223757645617e-07, + "loss": 0.0001, + "num_tokens": 1076694.0, + "reward": 0.8011474609375, + "reward_std": 0.0217256061732769, + "rewards//mean": 0.8011474609375, + "rewards//std": 0.02945224940776825, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0188, + "grad_norm": 0.901665985584259, + "kl": 0.0007376274152193218, + "learning_rate": 9.998138173048423e-07, + "loss": 0.0001, + "num_tokens": 1088278.0, + "reward": 0.79254150390625, + "reward_std": 0.026609892025589943, + "rewards//mean": 0.79254150390625, + "rewards//std": 0.02674928866326809, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.019, + "grad_norm": 1.06537926197052, + "kl": 0.0006385745655279607, + "learning_rate": 9.99805057520177e-07, + "loss": -0.0038, + "num_tokens": 1099845.0, + "reward": 0.82525634765625, + "reward_std": 0.021092217415571213, + "rewards//mean": 0.82525634765625, + "rewards//std": 0.02492239698767662, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0192, + "grad_norm": 1.083445429801941, + "kl": 0.0006971871189307421, + "learning_rate": 9.997960964140945e-07, + "loss": 0.0001, + "num_tokens": 1111341.0, + "reward": 0.79345703125, + "reward_std": 0.02896975539624691, + "rewards//mean": 0.79345703125, + "rewards//std": 0.030245674774050713, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0194, + "grad_norm": 1.076715111732483, + "kl": 0.0006535321081173606, + "learning_rate": 9.99786933990204e-07, + "loss": -0.0025, + "num_tokens": 1122829.0, + "reward": 0.778564453125, + "reward_std": 0.02306465432047844, + "rewards//mean": 0.778564453125, + "rewards//std": 0.027026351541280746, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0196, + "grad_norm": 1.0992696285247803, + "kl": 0.0007488660121452995, + "learning_rate": 9.997775702521965e-07, + "loss": 0.0008, + "num_tokens": 1134351.0, + "reward": 0.7811279296875, + "reward_std": 0.025325408205389977, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.02959376573562622, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0198, + "grad_norm": 0.9488234519958496, + "kl": 0.00062160289962776, + "learning_rate": 9.997680052038434e-07, + "loss": 0.0001, + "num_tokens": 1145967.0, + "reward": 0.8046875, + "reward_std": 0.0258016437292099, + "rewards//mean": 0.8046875, + "rewards//std": 0.02717938832938671, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.02, + "grad_norm": 1.1617958545684814, + "kl": 0.0006840423739049584, + "learning_rate": 9.997582388489973e-07, + "loss": 0.0001, + "num_tokens": 1157591.0, + "reward": 0.80316162109375, + "reward_std": 0.019280632957816124, + "rewards//mean": 0.80316162109375, + "rewards//std": 0.02218042127788067, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0202, + "grad_norm": 1.007063627243042, + "kl": 0.0007131188176572323, + "learning_rate": 9.997482711915925e-07, + "loss": 0.0001, + "num_tokens": 1169167.0, + "reward": 0.80047607421875, + "reward_std": 0.02441065013408661, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.02841070294380188, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0204, + "grad_norm": 1.1481879949569702, + "kl": 0.0007947582635097206, + "learning_rate": 9.99738102235644e-07, + "loss": 0.0001, + "num_tokens": 1180767.0, + "reward": 0.7999267578125, + "reward_std": 0.028446532785892487, + "rewards//mean": 0.7999267578125, + "rewards//std": 0.029052751138806343, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0206, + "grad_norm": 1.0125106573104858, + "kl": 0.000584149791393429, + "learning_rate": 9.997277319852474e-07, + "loss": -0.002, + "num_tokens": 1192236.0, + "reward": 0.75726318359375, + "reward_std": 0.021114790812134743, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.021930592134594917, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0208, + "grad_norm": 0.9280813336372375, + "kl": 0.0006463157187681645, + "learning_rate": 9.997171604445802e-07, + "loss": 0.0001, + "num_tokens": 1203940.0, + "reward": 0.7811279296875, + "reward_std": 0.019756892696022987, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.02534378133714199, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.021, + "grad_norm": 1.1672253608703613, + "kl": 0.0007268701156135648, + "learning_rate": 9.997063876179007e-07, + "loss": 0.0001, + "num_tokens": 1215556.0, + "reward": 0.7855224609375, + "reward_std": 0.02292565628886223, + "rewards//mean": 0.7855224609375, + "rewards//std": 0.027221407741308212, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0212, + "grad_norm": 1.0228865146636963, + "kl": 0.0007429054967360571, + "learning_rate": 9.996954135095478e-07, + "loss": -0.0003, + "num_tokens": 1227074.0, + "reward": 0.772705078125, + "reward_std": 0.026300232857465744, + "rewards//mean": 0.772705078125, + "rewards//std": 0.028526155278086662, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0214, + "grad_norm": 0.9938479065895081, + "kl": 0.0007216637313831598, + "learning_rate": 9.996842381239422e-07, + "loss": 0.0001, + "num_tokens": 1238610.0, + "reward": 0.78662109375, + "reward_std": 0.028536584228277206, + "rewards//mean": 0.78662109375, + "rewards//std": 0.028138259425759315, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0216, + "grad_norm": 0.9797035455703735, + "kl": 0.0007567866632598452, + "learning_rate": 9.996728614655853e-07, + "loss": 0.0032, + "num_tokens": 1250205.0, + "reward": 0.78033447265625, + "reward_std": 0.022089680656790733, + "rewards//mean": 0.78033447265625, + "rewards//std": 0.02732817642390728, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0218, + "grad_norm": 1.0019795894622803, + "kl": 0.0009185494272969663, + "learning_rate": 9.996612835390594e-07, + "loss": 0.0001, + "num_tokens": 1261813.0, + "reward": 0.78515625, + "reward_std": 0.018684202805161476, + "rewards//mean": 0.78515625, + "rewards//std": 0.024724895134568214, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.022, + "grad_norm": 1.0843669176101685, + "kl": 0.0008846650598570704, + "learning_rate": 9.996495043490283e-07, + "loss": 0.0001, + "num_tokens": 1273397.0, + "reward": 0.7869873046875, + "reward_std": 0.023270703852176666, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.03032938949763775, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0222, + "grad_norm": 1.0688378810882568, + "kl": 0.0008623726898804307, + "learning_rate": 9.996375239002368e-07, + "loss": 0.0001, + "num_tokens": 1285005.0, + "reward": 0.78009033203125, + "reward_std": 0.023679332807660103, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.028979387134313583, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0224, + "grad_norm": 1.0023386478424072, + "kl": 0.0011414361651986837, + "learning_rate": 9.996253421975102e-07, + "loss": 0.0001, + "num_tokens": 1296613.0, + "reward": 0.81494140625, + "reward_std": 0.028789427131414413, + "rewards//mean": 0.81494140625, + "rewards//std": 0.03268573805689812, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0226, + "grad_norm": 0.9800978899002075, + "kl": 0.001048477555741556, + "learning_rate": 9.996129592457556e-07, + "loss": 0.0043, + "num_tokens": 1308127.0, + "reward": 0.81640625, + "reward_std": 0.025129372254014015, + "rewards//mean": 0.81640625, + "rewards//std": 0.02992769330739975, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0228, + "grad_norm": 1.0748845338821411, + "kl": 0.0008972941723186523, + "learning_rate": 9.996003750499607e-07, + "loss": 0.0001, + "num_tokens": 1319671.0, + "reward": 0.80609130859375, + "reward_std": 0.02805209904909134, + "rewards//mean": 0.80609130859375, + "rewards//std": 0.03090997412800789, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.023, + "grad_norm": 0.9468462467193604, + "kl": 0.0012133549607824534, + "learning_rate": 9.995875896151944e-07, + "loss": 0.0001, + "num_tokens": 1331319.0, + "reward": 0.79931640625, + "reward_std": 0.023228216916322708, + "rewards//mean": 0.79931640625, + "rewards//std": 0.026589294895529747, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0232, + "grad_norm": 0.9656848311424255, + "kl": 0.0009128780075116083, + "learning_rate": 9.99574602946607e-07, + "loss": 0.001, + "num_tokens": 1342825.0, + "reward": 0.8038330078125, + "reward_std": 0.027980148792266846, + "rewards//mean": 0.8038330078125, + "rewards//std": 0.0327669158577919, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0234, + "grad_norm": 1.091425895690918, + "kl": 0.0008740312332520261, + "learning_rate": 9.99561415049429e-07, + "loss": 0.0011, + "num_tokens": 1354382.0, + "reward": 0.78094482421875, + "reward_std": 0.025229131802916527, + "rewards//mean": 0.78094482421875, + "rewards//std": 0.028308216482400894, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0236, + "grad_norm": 0.9921692609786987, + "kl": 0.0009811834825086407, + "learning_rate": 9.99548025928973e-07, + "loss": 0.0017, + "num_tokens": 1365971.0, + "reward": 0.78350830078125, + "reward_std": 0.025840595364570618, + "rewards//mean": 0.78350830078125, + "rewards//std": 0.02721327356994152, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0238, + "grad_norm": 1.0975821018218994, + "kl": 0.0012758941447827965, + "learning_rate": 9.995344355906318e-07, + "loss": 0.0001, + "num_tokens": 1377507.0, + "reward": 0.7860107421875, + "reward_std": 0.025157637894153595, + "rewards//mean": 0.7860107421875, + "rewards//std": 0.027263637632131577, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.024, + "grad_norm": 1.149405837059021, + "kl": 0.0009670757572166622, + "learning_rate": 9.995206440398796e-07, + "loss": 0.0001, + "num_tokens": 1389131.0, + "reward": 0.7857666015625, + "reward_std": 0.02165774442255497, + "rewards//mean": 0.7857666015625, + "rewards//std": 0.02416226826608181, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0242, + "grad_norm": 1.020729899406433, + "kl": 0.0008752426074352115, + "learning_rate": 9.995066512822718e-07, + "loss": 0.0001, + "num_tokens": 1400835.0, + "reward": 0.781494140625, + "reward_std": 0.0295531265437603, + "rewards//mean": 0.781494140625, + "rewards//std": 0.034840505570173264, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0244, + "grad_norm": 1.0369584560394287, + "kl": 0.0009374480287078768, + "learning_rate": 9.994924573234446e-07, + "loss": 0.0001, + "num_tokens": 1412419.0, + "reward": 0.8021240234375, + "reward_std": 0.025799574330449104, + "rewards//mean": 0.8021240234375, + "rewards//std": 0.02950565330684185, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0246, + "grad_norm": 1.1742593050003052, + "kl": 0.0011316849268041551, + "learning_rate": 9.994780621691154e-07, + "loss": 0.0001, + "num_tokens": 1424035.0, + "reward": 0.81951904296875, + "reward_std": 0.019540857523679733, + "rewards//mean": 0.81951904296875, + "rewards//std": 0.021856609731912613, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0248, + "grad_norm": 1.0541960000991821, + "kl": 0.0010330154618714005, + "learning_rate": 9.994634658250824e-07, + "loss": 0.0001, + "num_tokens": 1435587.0, + "reward": 0.79388427734375, + "reward_std": 0.02638779953122139, + "rewards//mean": 0.79388427734375, + "rewards//std": 0.02839737944304943, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.025, + "grad_norm": 0.966884195804596, + "kl": 0.0011270601826254278, + "learning_rate": 9.994486682972252e-07, + "loss": 0.0001, + "num_tokens": 1447171.0, + "reward": 0.8128662109375, + "reward_std": 0.02109691873192787, + "rewards//mean": 0.8128662109375, + "rewards//std": 0.02275288663804531, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0252, + "grad_norm": 1.1111615896224976, + "kl": 0.0012122523621656, + "learning_rate": 9.99433669591504e-07, + "loss": 0.0001, + "num_tokens": 1458723.0, + "reward": 0.80377197265625, + "reward_std": 0.030984841287136078, + "rewards//mean": 0.80377197265625, + "rewards//std": 0.039578329771757126, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0254, + "grad_norm": 1.0731996297836304, + "kl": 0.001484197418903932, + "learning_rate": 9.994184697139604e-07, + "loss": 0.0001, + "num_tokens": 1470283.0, + "reward": 0.82110595703125, + "reward_std": 0.018746808171272278, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.021158406510949135, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0256, + "grad_norm": 0.9753751754760742, + "kl": 0.001205368316732347, + "learning_rate": 9.99403068670717e-07, + "loss": 0.0001, + "num_tokens": 1481811.0, + "reward": 0.795654296875, + "reward_std": 0.025789041072130203, + "rewards//mean": 0.795654296875, + "rewards//std": 0.028670132160186768, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0258, + "grad_norm": 1.0944480895996094, + "kl": 0.0015931284870021045, + "learning_rate": 9.993874664679772e-07, + "loss": 0.0002, + "num_tokens": 1493331.0, + "reward": 0.80804443359375, + "reward_std": 0.025768376886844635, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.027585096657276154, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.026, + "grad_norm": 1.0858913660049438, + "kl": 0.001247147301910445, + "learning_rate": 9.993716631120258e-07, + "loss": 0.0001, + "num_tokens": 1504883.0, + "reward": 0.77362060546875, + "reward_std": 0.025001801550388336, + "rewards//mean": 0.77362060546875, + "rewards//std": 0.029777023941278458, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0262, + "grad_norm": 0.9832647442817688, + "kl": 0.0012248472485225648, + "learning_rate": 9.99355658609228e-07, + "loss": 0.0001, + "num_tokens": 1516539.0, + "reward": 0.7835693359375, + "reward_std": 0.02203105017542839, + "rewards//mean": 0.7835693359375, + "rewards//std": 0.022072069346904755, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0264, + "grad_norm": 1.1532351970672607, + "kl": 0.0016239402611972764, + "learning_rate": 9.993394529660306e-07, + "loss": -0.0083, + "num_tokens": 1528106.0, + "reward": 0.7939453125, + "reward_std": 0.02449902333319187, + "rewards//mean": 0.7939453125, + "rewards//std": 0.026134507730603218, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0266, + "grad_norm": 1.0743811130523682, + "kl": 0.00150086305802688, + "learning_rate": 9.993230461889615e-07, + "loss": 0.0002, + "num_tokens": 1539626.0, + "reward": 0.75286865234375, + "reward_std": 0.029542606323957443, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.03554501757025719, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0268, + "grad_norm": 1.0970526933670044, + "kl": 0.0016112506855279207, + "learning_rate": 9.993064382846289e-07, + "loss": -0.0091, + "num_tokens": 1551150.0, + "reward": 0.79534912109375, + "reward_std": 0.025676142424345016, + "rewards//mean": 0.79534912109375, + "rewards//std": 0.026970213279128075, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.027, + "grad_norm": 1.006722331047058, + "kl": 0.0018630806589499116, + "learning_rate": 9.992896292597228e-07, + "loss": -0.0252, + "num_tokens": 1562628.0, + "reward": 0.79071044921875, + "reward_std": 0.022746920585632324, + "rewards//mean": 0.79071044921875, + "rewards//std": 0.024830510839819908, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0272, + "grad_norm": 0.9579416513442993, + "kl": 0.001146607093687635, + "learning_rate": 9.992726191210137e-07, + "loss": 0.0047, + "num_tokens": 1574169.0, + "reward": 0.757568359375, + "reward_std": 0.018275555223226547, + "rewards//mean": 0.757568359375, + "rewards//std": 0.019058961421251297, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0274, + "grad_norm": 1.0534205436706543, + "kl": 0.0011900178215000778, + "learning_rate": 9.992554078753533e-07, + "loss": -0.0017, + "num_tokens": 1585710.0, + "reward": 0.8111572265625, + "reward_std": 0.025155656039714813, + "rewards//mean": 0.8111572265625, + "rewards//std": 0.031065138056874275, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0276, + "grad_norm": 1.1682957410812378, + "kl": 0.0015846183523535728, + "learning_rate": 9.992379955296745e-07, + "loss": -0.0113, + "num_tokens": 1597312.0, + "reward": 0.7825927734375, + "reward_std": 0.02667314186692238, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.031506411731243134, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0278, + "grad_norm": 0.9936683773994446, + "kl": 0.0013473591534420848, + "learning_rate": 9.992203820909905e-07, + "loss": 0.0001, + "num_tokens": 1608896.0, + "reward": 0.80841064453125, + "reward_std": 0.026514634490013123, + "rewards//mean": 0.80841064453125, + "rewards//std": 0.028901973739266396, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.028, + "grad_norm": 1.037082552909851, + "kl": 0.0013977802882436663, + "learning_rate": 9.992025675663965e-07, + "loss": 0.0001, + "num_tokens": 1620536.0, + "reward": 0.8101806640625, + "reward_std": 0.02804284356534481, + "rewards//mean": 0.8101806640625, + "rewards//std": 0.028422754257917404, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0282, + "grad_norm": 1.1799076795578003, + "kl": 0.001647633471293375, + "learning_rate": 9.991845519630676e-07, + "loss": 0.0002, + "num_tokens": 1632176.0, + "reward": 0.7801513671875, + "reward_std": 0.020243562757968903, + "rewards//mean": 0.7801513671875, + "rewards//std": 0.024522947147488594, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0284, + "grad_norm": 1.173966407775879, + "kl": 0.0019308491027913988, + "learning_rate": 9.991663352882613e-07, + "loss": 0.0002, + "num_tokens": 1643872.0, + "reward": 0.808349609375, + "reward_std": 0.020979978144168854, + "rewards//mean": 0.808349609375, + "rewards//std": 0.022226866334676743, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0286, + "grad_norm": 1.1129895448684692, + "kl": 0.0015910310903564095, + "learning_rate": 9.991479175493148e-07, + "loss": 0.0002, + "num_tokens": 1655448.0, + "reward": 0.8052978515625, + "reward_std": 0.023709788918495178, + "rewards//mean": 0.8052978515625, + "rewards//std": 0.027789371088147163, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0288, + "grad_norm": 1.073409080505371, + "kl": 0.0014154893578961492, + "learning_rate": 9.991292987536468e-07, + "loss": 0.0001, + "num_tokens": 1667096.0, + "reward": 0.8223876953125, + "reward_std": 0.029593907296657562, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.03247363120317459, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.029, + "grad_norm": 1.0845848321914673, + "kl": 0.0017709041421767324, + "learning_rate": 9.991104789087569e-07, + "loss": -0.0056, + "num_tokens": 1678585.0, + "reward": 0.7994384765625, + "reward_std": 0.02198037877678871, + "rewards//mean": 0.7994384765625, + "rewards//std": 0.022436663508415222, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0292, + "grad_norm": 1.0564552545547485, + "kl": 0.0016774717660155147, + "learning_rate": 9.990914580222255e-07, + "loss": 0.0002, + "num_tokens": 1690249.0, + "reward": 0.79022216796875, + "reward_std": 0.026936372742056847, + "rewards//mean": 0.79022216796875, + "rewards//std": 0.027915123850107193, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0294, + "grad_norm": 1.1047219038009644, + "kl": 0.0015970502863638103, + "learning_rate": 9.990722361017149e-07, + "loss": -0.0069, + "num_tokens": 1701863.0, + "reward": 0.78485107421875, + "reward_std": 0.01902250200510025, + "rewards//mean": 0.78485107421875, + "rewards//std": 0.02796388603746891, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0296, + "grad_norm": 1.1744790077209473, + "kl": 0.001885298639535904, + "learning_rate": 9.990528131549671e-07, + "loss": -0.0026, + "num_tokens": 1713460.0, + "reward": 0.76971435546875, + "reward_std": 0.02327745035290718, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.02712077647447586, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0298, + "grad_norm": 0.9864264130592346, + "kl": 0.001316507055889815, + "learning_rate": 9.990331891898058e-07, + "loss": 0.0001, + "num_tokens": 1725092.0, + "reward": 0.8062744140625, + "reward_std": 0.024695411324501038, + "rewards//mean": 0.8062744140625, + "rewards//std": 0.02774139493703842, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.03, + "grad_norm": 1.0861222743988037, + "kl": 0.0018997594888787717, + "learning_rate": 9.990133642141357e-07, + "loss": 0.0011, + "num_tokens": 1736686.0, + "reward": 0.765380859375, + "reward_std": 0.027488838881254196, + "rewards//mean": 0.765380859375, + "rewards//std": 0.029478194192051888, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0302, + "grad_norm": 1.0571084022521973, + "kl": 0.0017590886855032295, + "learning_rate": 9.989933382359422e-07, + "loss": 0.0002, + "num_tokens": 1748230.0, + "reward": 0.77294921875, + "reward_std": 0.018908997997641563, + "rewards//mean": 0.77294921875, + "rewards//std": 0.022983575239777565, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0304, + "grad_norm": 0.9903735518455505, + "kl": 0.0019041918130824342, + "learning_rate": 9.989731112632916e-07, + "loss": 0.0062, + "num_tokens": 1759803.0, + "reward": 0.77984619140625, + "reward_std": 0.027062054723501205, + "rewards//mean": 0.77984619140625, + "rewards//std": 0.031614989042282104, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0306, + "grad_norm": 1.1543848514556885, + "kl": 0.0023104670108295977, + "learning_rate": 9.989526833043316e-07, + "loss": 0.0002, + "num_tokens": 1771435.0, + "reward": 0.806640625, + "reward_std": 0.024279791861772537, + "rewards//mean": 0.806640625, + "rewards//std": 0.027143720537424088, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0308, + "grad_norm": 1.044055700302124, + "kl": 0.001976058498257771, + "learning_rate": 9.989320543672903e-07, + "loss": 0.0002, + "num_tokens": 1783011.0, + "reward": 0.756591796875, + "reward_std": 0.031205516308546066, + "rewards//mean": 0.756591796875, + "rewards//std": 0.038756709545850754, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.031, + "grad_norm": 1.0268096923828125, + "kl": 0.0020445024711079895, + "learning_rate": 9.989112244604771e-07, + "loss": 0.0002, + "num_tokens": 1794699.0, + "reward": 0.79425048828125, + "reward_std": 0.022723814472556114, + "rewards//mean": 0.79425048828125, + "rewards//std": 0.026929209008812904, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0312, + "grad_norm": 1.0990660190582275, + "kl": 0.0023416701587848365, + "learning_rate": 9.988901935922825e-07, + "loss": 0.0046, + "num_tokens": 1806292.0, + "reward": 0.7861328125, + "reward_std": 0.02161153219640255, + "rewards//mean": 0.7861328125, + "rewards//std": 0.022819651290774345, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0314, + "grad_norm": 1.091678261756897, + "kl": 0.002328589034732431, + "learning_rate": 9.988689617711776e-07, + "loss": -0.0085, + "num_tokens": 1817743.0, + "reward": 0.79998779296875, + "reward_std": 0.025330033153295517, + "rewards//mean": 0.79998779296875, + "rewards//std": 0.02808380499482155, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0316, + "grad_norm": 1.18397057056427, + "kl": 0.002129078085999936, + "learning_rate": 9.988475290057143e-07, + "loss": 0.0038, + "num_tokens": 1829333.0, + "reward": 0.80133056640625, + "reward_std": 0.03218778967857361, + "rewards//mean": 0.80133056640625, + "rewards//std": 0.03398888185620308, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0318, + "grad_norm": 1.067722201347351, + "kl": 0.002679636614629999, + "learning_rate": 9.988258953045262e-07, + "loss": 0.0003, + "num_tokens": 1840885.0, + "reward": 0.79339599609375, + "reward_std": 0.030402163043618202, + "rewards//mean": 0.79339599609375, + "rewards//std": 0.031819261610507965, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.032, + "grad_norm": 1.2133910655975342, + "kl": 0.002700318524148315, + "learning_rate": 9.988040606763272e-07, + "loss": -0.0004, + "num_tokens": 1852415.0, + "reward": 0.7880859375, + "reward_std": 0.02968628704547882, + "rewards//mean": 0.7880859375, + "rewards//std": 0.0307657178491354, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0322, + "grad_norm": 1.0541502237319946, + "kl": 0.002327424823306501, + "learning_rate": 9.98782025129912e-07, + "loss": 0.0002, + "num_tokens": 1863999.0, + "reward": 0.80535888671875, + "reward_std": 0.028069326654076576, + "rewards//mean": 0.80535888671875, + "rewards//std": 0.032034989446401596, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0324, + "grad_norm": 1.0605329275131226, + "kl": 0.0021783143165521324, + "learning_rate": 9.987597886741568e-07, + "loss": 0.0004, + "num_tokens": 1875606.0, + "reward": 0.77490234375, + "reward_std": 0.02751433104276657, + "rewards//mean": 0.77490234375, + "rewards//std": 0.02947511151432991, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0326, + "grad_norm": 1.147852897644043, + "kl": 0.002471390733262524, + "learning_rate": 9.987373513180184e-07, + "loss": -0.0004, + "num_tokens": 1887097.0, + "reward": 0.78143310546875, + "reward_std": 0.01953718066215515, + "rewards//mean": 0.78143310546875, + "rewards//std": 0.021195577457547188, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0328, + "grad_norm": 1.035252332687378, + "kl": 0.002560808847192675, + "learning_rate": 9.987147130705347e-07, + "loss": 0.0003, + "num_tokens": 1898601.0, + "reward": 0.79052734375, + "reward_std": 0.026871398091316223, + "rewards//mean": 0.79052734375, + "rewards//std": 0.029516169801354408, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.033, + "grad_norm": 1.0787065029144287, + "kl": 0.0025223355914931744, + "learning_rate": 9.98691873940824e-07, + "loss": 0.0004, + "num_tokens": 1910225.0, + "reward": 0.78863525390625, + "reward_std": 0.024823077023029327, + "rewards//mean": 0.78863525390625, + "rewards//std": 0.02790156379342079, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0332, + "grad_norm": 1.1916779279708862, + "kl": 0.003289496700745076, + "learning_rate": 9.98668833938086e-07, + "loss": 0.0003, + "num_tokens": 1921777.0, + "reward": 0.78369140625, + "reward_std": 0.02712547965347767, + "rewards//mean": 0.78369140625, + "rewards//std": 0.03220800310373306, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0334, + "grad_norm": 1.1592134237289429, + "kl": 0.00308210207731463, + "learning_rate": 9.986455930716016e-07, + "loss": -0.0011, + "num_tokens": 1933323.0, + "reward": 0.81036376953125, + "reward_std": 0.02917584404349327, + "rewards//mean": 0.81036376953125, + "rewards//std": 0.03623128682374954, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0336, + "grad_norm": 1.0254720449447632, + "kl": 0.002713382651563734, + "learning_rate": 9.986221513507318e-07, + "loss": -0.0025, + "num_tokens": 1944920.0, + "reward": 0.789306640625, + "reward_std": 0.025018319487571716, + "rewards//mean": 0.789306640625, + "rewards//std": 0.02815009281039238, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0338, + "grad_norm": 1.2482420206069946, + "kl": 0.003524379659211263, + "learning_rate": 9.985985087849191e-07, + "loss": 0.0004, + "num_tokens": 1956552.0, + "reward": 0.78399658203125, + "reward_std": 0.019669707864522934, + "rewards//mean": 0.78399658203125, + "rewards//std": 0.02609443850815296, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.034, + "grad_norm": 1.042280673980713, + "kl": 0.003599055315135047, + "learning_rate": 9.985746653836866e-07, + "loss": 0.0006, + "num_tokens": 1968209.0, + "reward": 0.78985595703125, + "reward_std": 0.02310457080602646, + "rewards//mean": 0.78985595703125, + "rewards//std": 0.03127463534474373, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0342, + "grad_norm": 1.090175986289978, + "kl": 0.002962962113087997, + "learning_rate": 9.985506211566386e-07, + "loss": 0.0003, + "num_tokens": 1979785.0, + "reward": 0.8165283203125, + "reward_std": 0.0215543694794178, + "rewards//mean": 0.8165283203125, + "rewards//std": 0.025171173736453056, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0344, + "grad_norm": 1.1816318035125732, + "kl": 0.0033234066795557737, + "learning_rate": 9.9852637611346e-07, + "loss": -0.0019, + "num_tokens": 1991368.0, + "reward": 0.78863525390625, + "reward_std": 0.02782921865582466, + "rewards//mean": 0.78863525390625, + "rewards//std": 0.03265324607491493, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0346, + "grad_norm": 1.1635127067565918, + "kl": 0.0033395921054761857, + "learning_rate": 9.98501930263917e-07, + "loss": 0.0052, + "num_tokens": 2002913.0, + "reward": 0.76849365234375, + "reward_std": 0.025203054770827293, + "rewards//mean": 0.76849365234375, + "rewards//std": 0.027920003980398178, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0348, + "grad_norm": 1.1554718017578125, + "kl": 0.0033306548139080405, + "learning_rate": 9.984772836178556e-07, + "loss": 0.0003, + "num_tokens": 2014649.0, + "reward": 0.81298828125, + "reward_std": 0.027196455746889114, + "rewards//mean": 0.81298828125, + "rewards//std": 0.030753906816244125, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.035, + "grad_norm": 1.0409566164016724, + "kl": 0.003496909572277218, + "learning_rate": 9.984524361852043e-07, + "loss": 0.0003, + "num_tokens": 2026257.0, + "reward": 0.7855224609375, + "reward_std": 0.026588259264826775, + "rewards//mean": 0.7855224609375, + "rewards//std": 0.03225473314523697, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0352, + "grad_norm": 1.1623202562332153, + "kl": 0.0036879870167467743, + "learning_rate": 9.984273879759712e-07, + "loss": 0.0046, + "num_tokens": 2037933.0, + "reward": 0.78582763671875, + "reward_std": 0.027925759553909302, + "rewards//mean": 0.78582763671875, + "rewards//std": 0.029940271750092506, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0354, + "grad_norm": 1.17938232421875, + "kl": 0.003348253609146923, + "learning_rate": 9.984021390002457e-07, + "loss": 0.0032, + "num_tokens": 2049607.0, + "reward": 0.8037109375, + "reward_std": 0.022836018353700638, + "rewards//mean": 0.8037109375, + "rewards//std": 0.02772638387978077, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0356, + "grad_norm": 1.0644232034683228, + "kl": 0.004052124946611002, + "learning_rate": 9.983766892681985e-07, + "loss": 0.0116, + "num_tokens": 2061210.0, + "reward": 0.8017578125, + "reward_std": 0.022080417722463608, + "rewards//mean": 0.8017578125, + "rewards//std": 0.025572409853339195, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0358, + "grad_norm": 1.1476185321807861, + "kl": 0.003145837690681219, + "learning_rate": 9.983510387900802e-07, + "loss": 0.0003, + "num_tokens": 2072802.0, + "reward": 0.78070068359375, + "reward_std": 0.02601863071322441, + "rewards//mean": 0.78070068359375, + "rewards//std": 0.02969098463654518, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.036, + "grad_norm": 1.134774923324585, + "kl": 0.003907572419848293, + "learning_rate": 9.983251875762232e-07, + "loss": 0.0029, + "num_tokens": 2084404.0, + "reward": 0.812744140625, + "reward_std": 0.02902880683541298, + "rewards//mean": 0.812744140625, + "rewards//std": 0.0333704799413681, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0362, + "grad_norm": 1.0783344507217407, + "kl": 0.0037602077063638717, + "learning_rate": 9.982991356370403e-07, + "loss": 0.0004, + "num_tokens": 2095980.0, + "reward": 0.80108642578125, + "reward_std": 0.02613934688270092, + "rewards//mean": 0.80108642578125, + "rewards//std": 0.030051296576857567, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0364, + "grad_norm": 1.204670786857605, + "kl": 0.004715879389550537, + "learning_rate": 9.98272882983025e-07, + "loss": 0.0005, + "num_tokens": 2107612.0, + "reward": 0.80523681640625, + "reward_std": 0.0229114331305027, + "rewards//mean": 0.80523681640625, + "rewards//std": 0.026810338720679283, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0366, + "grad_norm": 1.0657269954681396, + "kl": 0.004348902846686542, + "learning_rate": 9.982464296247522e-07, + "loss": 0.0004, + "num_tokens": 2119260.0, + "reward": 0.78936767578125, + "reward_std": 0.02182740904390812, + "rewards//mean": 0.78936767578125, + "rewards//std": 0.024629736319184303, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0368, + "grad_norm": 1.1691079139709473, + "kl": 0.005267721076961607, + "learning_rate": 9.98219775572877e-07, + "loss": 0.0005, + "num_tokens": 2130716.0, + "reward": 0.787841796875, + "reward_std": 0.028710367158055305, + "rewards//mean": 0.787841796875, + "rewards//std": 0.030264686793088913, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.037, + "grad_norm": 1.1636656522750854, + "kl": 0.003953753301175311, + "learning_rate": 9.981929208381357e-07, + "loss": 0.0004, + "num_tokens": 2142324.0, + "reward": 0.80401611328125, + "reward_std": 0.034032125025987625, + "rewards//mean": 0.80401611328125, + "rewards//std": 0.038891956210136414, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0372, + "grad_norm": 1.1542350053787231, + "kl": 0.003596391237806529, + "learning_rate": 9.981658654313456e-07, + "loss": 0.0004, + "num_tokens": 2153892.0, + "reward": 0.8023681640625, + "reward_std": 0.017362188547849655, + "rewards//mean": 0.8023681640625, + "rewards//std": 0.01935959793627262, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0374, + "grad_norm": 1.0391958951950073, + "kl": 0.004509820661041886, + "learning_rate": 9.981386093634045e-07, + "loss": 0.0031, + "num_tokens": 2165503.0, + "reward": 0.79315185546875, + "reward_std": 0.015514733269810677, + "rewards//mean": 0.79315185546875, + "rewards//std": 0.017777537927031517, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0376, + "grad_norm": 1.1463459730148315, + "kl": 0.004014673846540973, + "learning_rate": 9.98111152645291e-07, + "loss": 0.0003, + "num_tokens": 2177036.0, + "reward": 0.8017578125, + "reward_std": 0.022883962839841843, + "rewards//mean": 0.8017578125, + "rewards//std": 0.027312731370329857, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0378, + "grad_norm": 1.2099896669387817, + "kl": 0.004536258318694308, + "learning_rate": 9.98083495288065e-07, + "loss": 0.0005, + "num_tokens": 2188532.0, + "reward": 0.809814453125, + "reward_std": 0.022024860605597496, + "rewards//mean": 0.809814453125, + "rewards//std": 0.02986997365951538, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.038, + "grad_norm": 1.111228108406067, + "kl": 0.004867967450991273, + "learning_rate": 9.980556373028665e-07, + "loss": 0.0005, + "num_tokens": 2200100.0, + "reward": 0.79119873046875, + "reward_std": 0.02506091073155403, + "rewards//mean": 0.79119873046875, + "rewards//std": 0.02887262962758541, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0382, + "grad_norm": 1.110686182975769, + "kl": 0.004581064131343737, + "learning_rate": 9.98027578700917e-07, + "loss": 0.0018, + "num_tokens": 2211680.0, + "reward": 0.79254150390625, + "reward_std": 0.02677498199045658, + "rewards//mean": 0.79254150390625, + "rewards//std": 0.03160541132092476, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0384, + "grad_norm": 1.1514862775802612, + "kl": 0.003833886352367699, + "learning_rate": 9.979993194935182e-07, + "loss": 0.0004, + "num_tokens": 2223232.0, + "reward": 0.81341552734375, + "reward_std": 0.026915811002254486, + "rewards//mean": 0.81341552734375, + "rewards//std": 0.03344925865530968, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0386, + "grad_norm": 1.3335530757904053, + "kl": 0.005032151122577488, + "learning_rate": 9.979708596920529e-07, + "loss": 0.0005, + "num_tokens": 2234824.0, + "reward": 0.78338623046875, + "reward_std": 0.021854188293218613, + "rewards//mean": 0.78338623046875, + "rewards//std": 0.02573225647211075, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0388, + "grad_norm": 1.044563889503479, + "kl": 0.004707010317360982, + "learning_rate": 9.97942199307985e-07, + "loss": 0.0011, + "num_tokens": 2246409.0, + "reward": 0.79632568359375, + "reward_std": 0.023135868832468987, + "rewards//mean": 0.79632568359375, + "rewards//std": 0.025015760213136673, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.039, + "grad_norm": 1.2477060556411743, + "kl": 0.005167574243387207, + "learning_rate": 9.97913338352859e-07, + "loss": 0.005, + "num_tokens": 2258010.0, + "reward": 0.79266357421875, + "reward_std": 0.02553083561360836, + "rewards//mean": 0.79266357421875, + "rewards//std": 0.025932665914297104, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0392, + "grad_norm": 1.1134164333343506, + "kl": 0.004844380513532087, + "learning_rate": 9.978842768382998e-07, + "loss": 0.0005, + "num_tokens": 2269570.0, + "reward": 0.7744140625, + "reward_std": 0.020273501053452492, + "rewards//mean": 0.7744140625, + "rewards//std": 0.025046074762940407, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0394, + "grad_norm": 1.242315411567688, + "kl": 0.004023640969535336, + "learning_rate": 9.978550147760131e-07, + "loss": -0.0005, + "num_tokens": 2281120.0, + "reward": 0.797119140625, + "reward_std": 0.023823058232665062, + "rewards//mean": 0.797119140625, + "rewards//std": 0.028355836868286133, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0396, + "grad_norm": 1.239900827407837, + "kl": 0.00605322583578527, + "learning_rate": 9.978255521777862e-07, + "loss": 0.008, + "num_tokens": 2292663.0, + "reward": 0.79241943359375, + "reward_std": 0.024822881445288658, + "rewards//mean": 0.79241943359375, + "rewards//std": 0.028670065104961395, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0398, + "grad_norm": 1.1667896509170532, + "kl": 0.0040574619197286665, + "learning_rate": 9.977958890554866e-07, + "loss": 0.0004, + "num_tokens": 2304207.0, + "reward": 0.774658203125, + "reward_std": 0.01931469514966011, + "rewards//mean": 0.774658203125, + "rewards//std": 0.01941152662038803, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.04, + "grad_norm": 1.2873828411102295, + "kl": 0.004814039159100503, + "learning_rate": 9.97766025421062e-07, + "loss": 0.0005, + "num_tokens": 2315767.0, + "reward": 0.79608154296875, + "reward_std": 0.023867832496762276, + "rewards//mean": 0.79608154296875, + "rewards//std": 0.024775581434369087, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0402, + "grad_norm": 1.0735102891921997, + "kl": 0.004797437693923712, + "learning_rate": 9.977359612865422e-07, + "loss": 0.0005, + "num_tokens": 2327335.0, + "reward": 0.7845458984375, + "reward_std": 0.020987410098314285, + "rewards//mean": 0.7845458984375, + "rewards//std": 0.03149872273206711, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0404, + "grad_norm": 1.0586689710617065, + "kl": 0.004395019088406116, + "learning_rate": 9.977056966640367e-07, + "loss": 0.0022, + "num_tokens": 2338882.0, + "reward": 0.80029296875, + "reward_std": 0.030116254463791847, + "rewards//mean": 0.80029296875, + "rewards//std": 0.03578445687890053, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0406, + "grad_norm": 1.1076661348342896, + "kl": 0.005255013064015657, + "learning_rate": 9.976752315657359e-07, + "loss": 0.0005, + "num_tokens": 2350426.0, + "reward": 0.76617431640625, + "reward_std": 0.020543277263641357, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.023029552772641182, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0408, + "grad_norm": 1.1474426984786987, + "kl": 0.0045064809382893145, + "learning_rate": 9.976445660039117e-07, + "loss": -0.0012, + "num_tokens": 2361958.0, + "reward": 0.79937744140625, + "reward_std": 0.023174695670604706, + "rewards//mean": 0.79937744140625, + "rewards//std": 0.02506895549595356, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.041, + "grad_norm": 1.0058320760726929, + "kl": 0.004038142564240843, + "learning_rate": 9.976136999909155e-07, + "loss": 0.0016, + "num_tokens": 2373560.0, + "reward": 0.80419921875, + "reward_std": 0.023688359186053276, + "rewards//mean": 0.80419921875, + "rewards//std": 0.03275975584983826, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0412, + "grad_norm": 1.1400823593139648, + "kl": 0.004990266927052289, + "learning_rate": 9.975826335391805e-07, + "loss": 0.0012, + "num_tokens": 2385059.0, + "reward": 0.8006591796875, + "reward_std": 0.027357935905456543, + "rewards//mean": 0.8006591796875, + "rewards//std": 0.028356635943055153, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0414, + "grad_norm": 1.1322851181030273, + "kl": 0.0050254149828106165, + "learning_rate": 9.975513666612203e-07, + "loss": 0.0005, + "num_tokens": 2396715.0, + "reward": 0.80303955078125, + "reward_std": 0.023408669978380203, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.02761964686214924, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0416, + "grad_norm": 1.156887173652649, + "kl": 0.006264922441914678, + "learning_rate": 9.975198993696291e-07, + "loss": 0.0007, + "num_tokens": 2408345.0, + "reward": 0.7900390625, + "reward_std": 0.02577778697013855, + "rewards//mean": 0.7900390625, + "rewards//std": 0.02993578463792801, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0418, + "grad_norm": 1.0964040756225586, + "kl": 0.005038937786594033, + "learning_rate": 9.97488231677082e-07, + "loss": 0.0005, + "num_tokens": 2420033.0, + "reward": 0.82012939453125, + "reward_std": 0.030358947813510895, + "rewards//mean": 0.82012939453125, + "rewards//std": 0.03777920454740524, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.042, + "grad_norm": 1.1427661180496216, + "kl": 0.005642639705911279, + "learning_rate": 9.974563635963347e-07, + "loss": -0.0017, + "num_tokens": 2431621.0, + "reward": 0.8011474609375, + "reward_std": 0.03087637759745121, + "rewards//mean": 0.8011474609375, + "rewards//std": 0.032842595130205154, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0422, + "grad_norm": 1.088405966758728, + "kl": 0.006400410900823772, + "learning_rate": 9.974242951402235e-07, + "loss": 0.0021, + "num_tokens": 2443138.0, + "reward": 0.802490234375, + "reward_std": 0.02888779155910015, + "rewards//mean": 0.802490234375, + "rewards//std": 0.030487943440675735, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0424, + "grad_norm": 1.3871897459030151, + "kl": 0.006332711025606841, + "learning_rate": 9.973920263216657e-07, + "loss": 0.0006, + "num_tokens": 2454650.0, + "reward": 0.80938720703125, + "reward_std": 0.030836954712867737, + "rewards//mean": 0.80938720703125, + "rewards//std": 0.03532586619257927, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0426, + "grad_norm": 1.216434121131897, + "kl": 0.006591260491404682, + "learning_rate": 9.97359557153659e-07, + "loss": -0.0097, + "num_tokens": 2466127.0, + "reward": 0.785400390625, + "reward_std": 0.0265701562166214, + "rewards//mean": 0.785400390625, + "rewards//std": 0.03343573957681656, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0428, + "grad_norm": 1.3622052669525146, + "kl": 0.006696914904750884, + "learning_rate": 9.973268876492825e-07, + "loss": 0.0007, + "num_tokens": 2477655.0, + "reward": 0.7823486328125, + "reward_std": 0.025113951414823532, + "rewards//mean": 0.7823486328125, + "rewards//std": 0.028692036867141724, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.043, + "grad_norm": 1.2157143354415894, + "kl": 0.006106137385359034, + "learning_rate": 9.972940178216952e-07, + "loss": -0.0001, + "num_tokens": 2489202.0, + "reward": 0.77764892578125, + "reward_std": 0.020541518926620483, + "rewards//mean": 0.77764892578125, + "rewards//std": 0.020941229537129402, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0432, + "grad_norm": 1.1486895084381104, + "kl": 0.00555938872275874, + "learning_rate": 9.972609476841365e-07, + "loss": 0.0012, + "num_tokens": 2500718.0, + "reward": 0.81988525390625, + "reward_std": 0.021576950326561928, + "rewards//mean": 0.81988525390625, + "rewards//std": 0.022521764039993286, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0434, + "grad_norm": 1.1288295984268188, + "kl": 0.006536835164297372, + "learning_rate": 9.97227677249928e-07, + "loss": -0.0123, + "num_tokens": 2512321.0, + "reward": 0.8167724609375, + "reward_std": 0.024629496037960052, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.02652745135128498, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0436, + "grad_norm": 1.0769072771072388, + "kl": 0.005527187837287784, + "learning_rate": 9.971942065324702e-07, + "loss": 0.0002, + "num_tokens": 2523835.0, + "reward": 0.78857421875, + "reward_std": 0.02540578693151474, + "rewards//mean": 0.78857421875, + "rewards//std": 0.02815547026693821, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0438, + "grad_norm": 1.073999047279358, + "kl": 0.006144662562292069, + "learning_rate": 9.971605355452457e-07, + "loss": 0.0011, + "num_tokens": 2535464.0, + "reward": 0.8070068359375, + "reward_std": 0.02890004776418209, + "rewards//mean": 0.8070068359375, + "rewards//std": 0.03115466982126236, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.044, + "grad_norm": 1.1830289363861084, + "kl": 0.007042144483420998, + "learning_rate": 9.97126664301817e-07, + "loss": 0.0007, + "num_tokens": 2546944.0, + "reward": 0.7838134765625, + "reward_std": 0.030589783564209938, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.032205890864133835, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0442, + "grad_norm": 1.1041203737258911, + "kl": 0.007857985561713576, + "learning_rate": 9.970925928158272e-07, + "loss": 0.0008, + "num_tokens": 2558600.0, + "reward": 0.78802490234375, + "reward_std": 0.023140374571084976, + "rewards//mean": 0.78802490234375, + "rewards//std": 0.027553249150514603, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0444, + "grad_norm": 1.0675026178359985, + "kl": 0.006377070269081742, + "learning_rate": 9.970583211010007e-07, + "loss": 0.0006, + "num_tokens": 2570304.0, + "reward": 0.77301025390625, + "reward_std": 0.024956172332167625, + "rewards//mean": 0.77301025390625, + "rewards//std": 0.025287797674536705, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.109375, + "epoch": 0.0446, + "grad_norm": 1.1941890716552734, + "kl": 0.006894238060340285, + "learning_rate": 9.970238491711415e-07, + "loss": 0.0035, + "num_tokens": 2581831.0, + "reward": 0.78460693359375, + "reward_std": 0.028214259073138237, + "rewards//mean": 0.78460693359375, + "rewards//std": 0.03433627635240555, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0448, + "grad_norm": 1.224466323852539, + "kl": 0.0073683239170350134, + "learning_rate": 9.969891770401356e-07, + "loss": 0.0007, + "num_tokens": 2593543.0, + "reward": 0.80438232421875, + "reward_std": 0.024753011763095856, + "rewards//mean": 0.80438232421875, + "rewards//std": 0.026109516620635986, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.045, + "grad_norm": 1.1376057863235474, + "kl": 0.008224138524383307, + "learning_rate": 9.969543047219486e-07, + "loss": -0.001, + "num_tokens": 2605069.0, + "reward": 0.80120849609375, + "reward_std": 0.0202799029648304, + "rewards//mean": 0.80120849609375, + "rewards//std": 0.026375440880656242, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0452, + "grad_norm": 1.1367589235305786, + "kl": 0.00797433068510145, + "learning_rate": 9.96919232230627e-07, + "loss": 0.0008, + "num_tokens": 2616645.0, + "reward": 0.7869873046875, + "reward_std": 0.0214262455701828, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.028322450816631317, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0454, + "grad_norm": 1.0910896062850952, + "kl": 0.007356781919952482, + "learning_rate": 9.968839595802981e-07, + "loss": 0.0007, + "num_tokens": 2628197.0, + "reward": 0.77410888671875, + "reward_std": 0.021274691447615623, + "rewards//mean": 0.77410888671875, + "rewards//std": 0.02271917834877968, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0456, + "grad_norm": 1.0990172624588013, + "kl": 0.007296336523722857, + "learning_rate": 9.968484867851697e-07, + "loss": 0.0102, + "num_tokens": 2639796.0, + "reward": 0.81915283203125, + "reward_std": 0.02545160800218582, + "rewards//mean": 0.81915283203125, + "rewards//std": 0.02757466770708561, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0458, + "grad_norm": 1.1540635824203491, + "kl": 0.0076573657570406795, + "learning_rate": 9.968128138595302e-07, + "loss": 0.0008, + "num_tokens": 2651388.0, + "reward": 0.783935546875, + "reward_std": 0.026717711240053177, + "rewards//mean": 0.783935546875, + "rewards//std": 0.037246186286211014, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.046, + "grad_norm": 1.1302785873413086, + "kl": 0.007483624096494168, + "learning_rate": 9.967769408177488e-07, + "loss": 0.0153, + "num_tokens": 2662917.0, + "reward": 0.787353515625, + "reward_std": 0.029370015487074852, + "rewards//mean": 0.787353515625, + "rewards//std": 0.03247293457388878, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0462, + "grad_norm": 1.223042607307434, + "kl": 0.0077705884468741715, + "learning_rate": 9.967408676742751e-07, + "loss": 0.0008, + "num_tokens": 2674509.0, + "reward": 0.79876708984375, + "reward_std": 0.02704700455069542, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.02886476367712021, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0464, + "grad_norm": 1.1419559717178345, + "kl": 0.006994698429480195, + "learning_rate": 9.967045944436393e-07, + "loss": 0.0059, + "num_tokens": 2686179.0, + "reward": 0.81634521484375, + "reward_std": 0.024558382108807564, + "rewards//mean": 0.81634521484375, + "rewards//std": 0.030389411374926567, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0466, + "grad_norm": 1.1804438829421997, + "kl": 0.007548230933025479, + "learning_rate": 9.96668121140452e-07, + "loss": 0.0008, + "num_tokens": 2697883.0, + "reward": 0.7706298828125, + "reward_std": 0.022971414029598236, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.027493644505739212, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0468, + "grad_norm": 1.1975220441818237, + "kl": 0.008245188975706697, + "learning_rate": 9.966314477794052e-07, + "loss": 0.0065, + "num_tokens": 2709392.0, + "reward": 0.79461669921875, + "reward_std": 0.022853471338748932, + "rewards//mean": 0.79461669921875, + "rewards//std": 0.024937577545642853, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.047, + "grad_norm": 1.272891640663147, + "kl": 0.00855067535303533, + "learning_rate": 9.965945743752705e-07, + "loss": -0.0, + "num_tokens": 2720891.0, + "reward": 0.8162841796875, + "reward_std": 0.03155401721596718, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.031696103513240814, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0472, + "grad_norm": 1.2365138530731201, + "kl": 0.006588739401195198, + "learning_rate": 9.965575009429005e-07, + "loss": 0.0007, + "num_tokens": 2732475.0, + "reward": 0.81170654296875, + "reward_std": 0.03280176222324371, + "rewards//mean": 0.81170654296875, + "rewards//std": 0.03853299096226692, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0474, + "grad_norm": 1.1724237203598022, + "kl": 0.010627396462950855, + "learning_rate": 9.965202274972286e-07, + "loss": 0.0011, + "num_tokens": 2743955.0, + "reward": 0.80169677734375, + "reward_std": 0.022364530712366104, + "rewards//mean": 0.80169677734375, + "rewards//std": 0.03697774186730385, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0476, + "grad_norm": 1.1021744012832642, + "kl": 0.008590694749727845, + "learning_rate": 9.964827540532684e-07, + "loss": 0.0015, + "num_tokens": 2755511.0, + "reward": 0.81048583984375, + "reward_std": 0.025258341804146767, + "rewards//mean": 0.81048583984375, + "rewards//std": 0.02929784543812275, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0478, + "grad_norm": 1.2837520837783813, + "kl": 0.010990241542458534, + "learning_rate": 9.964450806261144e-07, + "loss": 0.0048, + "num_tokens": 2767061.0, + "reward": 0.7939453125, + "reward_std": 0.025663424283266068, + "rewards//mean": 0.7939453125, + "rewards//std": 0.03100099228322506, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.048, + "grad_norm": 1.1424111127853394, + "kl": 0.008943518914747983, + "learning_rate": 9.96407207230941e-07, + "loss": 0.0009, + "num_tokens": 2778693.0, + "reward": 0.7860107421875, + "reward_std": 0.027687296271324158, + "rewards//mean": 0.7860107421875, + "rewards//std": 0.0291692316532135, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0482, + "grad_norm": 1.244860053062439, + "kl": 0.010807477694470435, + "learning_rate": 9.963691338830042e-07, + "loss": 0.0011, + "num_tokens": 2790245.0, + "reward": 0.7744140625, + "reward_std": 0.02533801831305027, + "rewards//mean": 0.7744140625, + "rewards//std": 0.029992366209626198, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0484, + "grad_norm": 1.0469504594802856, + "kl": 0.008776394824963063, + "learning_rate": 9.963308605976396e-07, + "loss": 0.0009, + "num_tokens": 2801861.0, + "reward": 0.77630615234375, + "reward_std": 0.021249521523714066, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.02194577269256115, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0486, + "grad_norm": 1.2696715593338013, + "kl": 0.0097695171716623, + "learning_rate": 9.962923873902636e-07, + "loss": 0.001, + "num_tokens": 2813341.0, + "reward": 0.78961181640625, + "reward_std": 0.027372390031814575, + "rewards//mean": 0.78961181640625, + "rewards//std": 0.029222311452031136, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0488, + "grad_norm": 1.3831356763839722, + "kl": 0.011655497830361128, + "learning_rate": 9.962537142763732e-07, + "loss": 0.0012, + "num_tokens": 2824901.0, + "reward": 0.80224609375, + "reward_std": 0.024543657898902893, + "rewards//mean": 0.80224609375, + "rewards//std": 0.03213271498680115, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.049, + "grad_norm": 1.2406865358352661, + "kl": 0.008833583386149257, + "learning_rate": 9.962148412715463e-07, + "loss": 0.0009, + "num_tokens": 2836581.0, + "reward": 0.76800537109375, + "reward_std": 0.02859538421034813, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.03664795309305191, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0492, + "grad_norm": 1.2430874109268188, + "kl": 0.011043741717003286, + "learning_rate": 9.961757683914405e-07, + "loss": 0.0061, + "num_tokens": 2848205.0, + "reward": 0.8192138671875, + "reward_std": 0.02882375754415989, + "rewards//mean": 0.8192138671875, + "rewards//std": 0.034809861332178116, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0494, + "grad_norm": 1.1279609203338623, + "kl": 0.009550661663524806, + "learning_rate": 9.961364956517946e-07, + "loss": 0.0116, + "num_tokens": 2859743.0, + "reward": 0.79693603515625, + "reward_std": 0.02572210505604744, + "rewards//mean": 0.79693603515625, + "rewards//std": 0.02784399501979351, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0496, + "grad_norm": 1.1215286254882812, + "kl": 0.010010259924456477, + "learning_rate": 9.960970230684275e-07, + "loss": 0.0014, + "num_tokens": 2871316.0, + "reward": 0.81695556640625, + "reward_std": 0.023750539869070053, + "rewards//mean": 0.81695556640625, + "rewards//std": 0.024058280512690544, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0498, + "grad_norm": 1.1543965339660645, + "kl": 0.010341011919081211, + "learning_rate": 9.960573506572389e-07, + "loss": -0.0048, + "num_tokens": 2882814.0, + "reward": 0.77532958984375, + "reward_std": 0.017231859266757965, + "rewards//mean": 0.77532958984375, + "rewards//std": 0.020195694640278816, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.05, + "grad_norm": 1.1222604513168335, + "kl": 0.010390961018856615, + "learning_rate": 9.960174784342087e-07, + "loss": 0.001, + "num_tokens": 2894390.0, + "reward": 0.79083251953125, + "reward_std": 0.023132719099521637, + "rewards//mean": 0.79083251953125, + "rewards//std": 0.025161173194646835, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0502, + "grad_norm": 1.0844975709915161, + "kl": 0.01104224193841219, + "learning_rate": 9.959774064153975e-07, + "loss": 0.0011, + "num_tokens": 2905926.0, + "reward": 0.8009033203125, + "reward_std": 0.03250252455472946, + "rewards//mean": 0.8009033203125, + "rewards//std": 0.036043886095285416, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0504, + "grad_norm": 1.153923511505127, + "kl": 0.012111757416278124, + "learning_rate": 9.959371346169465e-07, + "loss": 0.0012, + "num_tokens": 2917565.0, + "reward": 0.7862548828125, + "reward_std": 0.023423906415700912, + "rewards//mean": 0.7862548828125, + "rewards//std": 0.029923392459750175, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0506, + "grad_norm": 1.287606954574585, + "kl": 0.01207148830872029, + "learning_rate": 9.95896663055077e-07, + "loss": 0.0036, + "num_tokens": 2929054.0, + "reward": 0.81048583984375, + "reward_std": 0.022416427731513977, + "rewards//mean": 0.81048583984375, + "rewards//std": 0.02980598621070385, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0508, + "grad_norm": 1.1231026649475098, + "kl": 0.011375492031220347, + "learning_rate": 9.958559917460907e-07, + "loss": 0.0011, + "num_tokens": 2940694.0, + "reward": 0.78875732421875, + "reward_std": 0.02648017555475235, + "rewards//mean": 0.78875732421875, + "rewards//std": 0.03395635262131691, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.051, + "grad_norm": 1.2053015232086182, + "kl": 0.009938678645994514, + "learning_rate": 9.958151207063703e-07, + "loss": 0.0068, + "num_tokens": 2952306.0, + "reward": 0.7789306640625, + "reward_std": 0.021076750010252, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.029779374599456787, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0512, + "grad_norm": 1.1794986724853516, + "kl": 0.015238574356772006, + "learning_rate": 9.957740499523785e-07, + "loss": 0.0029, + "num_tokens": 2963799.0, + "reward": 0.82135009765625, + "reward_std": 0.027614358812570572, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.030712470412254333, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0514, + "grad_norm": 1.1652145385742188, + "kl": 0.009949094615876675, + "learning_rate": 9.957327795006588e-07, + "loss": -0.0007, + "num_tokens": 2975387.0, + "reward": 0.8143310546875, + "reward_std": 0.024211276322603226, + "rewards//mean": 0.8143310546875, + "rewards//std": 0.02735675685107708, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0516, + "grad_norm": 1.2373652458190918, + "kl": 0.013123494223691523, + "learning_rate": 9.956913093678348e-07, + "loss": 0.0043, + "num_tokens": 2986908.0, + "reward": 0.79638671875, + "reward_std": 0.030045200139284134, + "rewards//mean": 0.79638671875, + "rewards//std": 0.034495554864406586, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0518, + "grad_norm": 1.1934781074523926, + "kl": 0.012349338852800429, + "learning_rate": 9.956496395706105e-07, + "loss": 0.0028, + "num_tokens": 2998560.0, + "reward": 0.8133544921875, + "reward_std": 0.025776367634534836, + "rewards//mean": 0.8133544921875, + "rewards//std": 0.03328031301498413, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.052, + "grad_norm": 1.337214708328247, + "kl": 0.013292047311551869, + "learning_rate": 9.956077701257707e-07, + "loss": 0.0079, + "num_tokens": 3010185.0, + "reward": 0.78729248046875, + "reward_std": 0.02457977831363678, + "rewards//mean": 0.78729248046875, + "rewards//std": 0.02752411551773548, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0522, + "grad_norm": 1.1447737216949463, + "kl": 0.01271663117222488, + "learning_rate": 9.955657010501806e-07, + "loss": 0.0013, + "num_tokens": 3021793.0, + "reward": 0.78466796875, + "reward_std": 0.024064302444458008, + "rewards//mean": 0.78466796875, + "rewards//std": 0.025718795135617256, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0524, + "grad_norm": 1.255728006362915, + "kl": 0.011377447634004056, + "learning_rate": 9.955234323607851e-07, + "loss": 0.0035, + "num_tokens": 3033420.0, + "reward": 0.81072998046875, + "reward_std": 0.02416277304291725, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.027699550613760948, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0526, + "grad_norm": 1.2344557046890259, + "kl": 0.013957957737147808, + "learning_rate": 9.954809640746105e-07, + "loss": 0.0034, + "num_tokens": 3045008.0, + "reward": 0.8026123046875, + "reward_std": 0.027825985103845596, + "rewards//mean": 0.8026123046875, + "rewards//std": 0.028236806392669678, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0528, + "grad_norm": 1.1971732378005981, + "kl": 0.013251342228613794, + "learning_rate": 9.954382962087627e-07, + "loss": 0.0045, + "num_tokens": 3056577.0, + "reward": 0.81353759765625, + "reward_std": 0.029722675681114197, + "rewards//mean": 0.81353759765625, + "rewards//std": 0.032025065273046494, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.053, + "grad_norm": 1.2461051940917969, + "kl": 0.01512534893117845, + "learning_rate": 9.953954287804284e-07, + "loss": 0.0015, + "num_tokens": 3068049.0, + "reward": 0.7987060546875, + "reward_std": 0.029201455414295197, + "rewards//mean": 0.7987060546875, + "rewards//std": 0.0343722440302372, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0532, + "grad_norm": 1.3071945905685425, + "kl": 0.014019567985087633, + "learning_rate": 9.953523618068748e-07, + "loss": 0.0014, + "num_tokens": 3079545.0, + "reward": 0.7666015625, + "reward_std": 0.02329763025045395, + "rewards//mean": 0.7666015625, + "rewards//std": 0.02574232779443264, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0534, + "grad_norm": 1.1817084550857544, + "kl": 0.014454333344474435, + "learning_rate": 9.95309095305449e-07, + "loss": -0.0007, + "num_tokens": 3091057.0, + "reward": 0.80657958984375, + "reward_std": 0.030988654121756554, + "rewards//mean": 0.80657958984375, + "rewards//std": 0.036604143679142, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0536, + "grad_norm": 1.1785385608673096, + "kl": 0.014867257908917964, + "learning_rate": 9.952656292935788e-07, + "loss": -0.0067, + "num_tokens": 3102678.0, + "reward": 0.8118896484375, + "reward_std": 0.025450963526964188, + "rewards//mean": 0.8118896484375, + "rewards//std": 0.03180480748414993, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0538, + "grad_norm": 1.0894451141357422, + "kl": 0.013448692625388503, + "learning_rate": 9.952219637887725e-07, + "loss": 0.0036, + "num_tokens": 3114179.0, + "reward": 0.81121826171875, + "reward_std": 0.018522003665566444, + "rewards//mean": 0.81121826171875, + "rewards//std": 0.026503117755055428, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.054, + "grad_norm": 1.2969520092010498, + "kl": 0.01466450176667422, + "learning_rate": 9.951780988086183e-07, + "loss": 0.0015, + "num_tokens": 3125723.0, + "reward": 0.7994384765625, + "reward_std": 0.02653959020972252, + "rewards//mean": 0.7994384765625, + "rewards//std": 0.02815305069088936, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0542, + "grad_norm": 1.2147493362426758, + "kl": 0.013885351247154176, + "learning_rate": 9.95134034370785e-07, + "loss": 0.0058, + "num_tokens": 3137290.0, + "reward": 0.8013916015625, + "reward_std": 0.023777958005666733, + "rewards//mean": 0.8013916015625, + "rewards//std": 0.025115784257650375, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0544, + "grad_norm": 1.2643368244171143, + "kl": 0.014398536295630038, + "learning_rate": 9.95089770493022e-07, + "loss": 0.0014, + "num_tokens": 3148906.0, + "reward": 0.801025390625, + "reward_std": 0.026011131703853607, + "rewards//mean": 0.801025390625, + "rewards//std": 0.028167296200990677, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0546, + "grad_norm": 1.322754979133606, + "kl": 0.01594191463664174, + "learning_rate": 9.950453071931588e-07, + "loss": 0.0076, + "num_tokens": 3160453.0, + "reward": 0.79681396484375, + "reward_std": 0.0320931002497673, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.039478760212659836, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0548, + "grad_norm": 1.2571158409118652, + "kl": 0.014130353461951017, + "learning_rate": 9.950006444891048e-07, + "loss": 0.0112, + "num_tokens": 3171978.0, + "reward": 0.73260498046875, + "reward_std": 0.030454525724053383, + "rewards//mean": 0.73260498046875, + "rewards//std": 0.03725304454565048, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.055, + "grad_norm": 1.2741409540176392, + "kl": 0.01795995549764484, + "learning_rate": 9.949557823988506e-07, + "loss": 0.0009, + "num_tokens": 3183580.0, + "reward": 0.816650390625, + "reward_std": 0.023825470358133316, + "rewards//mean": 0.816650390625, + "rewards//std": 0.024811681360006332, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.453125, + "epoch": 0.0552, + "grad_norm": 1.2093415260314941, + "kl": 0.014109914191067219, + "learning_rate": 9.949107209404663e-07, + "loss": -0.0005, + "num_tokens": 3195129.0, + "reward": 0.78509521484375, + "reward_std": 0.01757126860320568, + "rewards//mean": 0.78509521484375, + "rewards//std": 0.021748993545770645, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0554, + "grad_norm": 1.2230463027954102, + "kl": 0.01492247637361288, + "learning_rate": 9.94865460132103e-07, + "loss": -0.0015, + "num_tokens": 3206693.0, + "reward": 0.751708984375, + "reward_std": 0.029397983103990555, + "rewards//mean": 0.751708984375, + "rewards//std": 0.03515883535146713, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0556, + "grad_norm": 1.1926078796386719, + "kl": 0.015882232692092657, + "learning_rate": 9.948199999919912e-07, + "loss": 0.0043, + "num_tokens": 3218312.0, + "reward": 0.810791015625, + "reward_std": 0.019539013504981995, + "rewards//mean": 0.810791015625, + "rewards//std": 0.022987527772784233, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0558, + "grad_norm": 1.1551114320755005, + "kl": 0.017128022387623787, + "learning_rate": 9.947743405384428e-07, + "loss": 0.002, + "num_tokens": 3229855.0, + "reward": 0.7977294921875, + "reward_std": 0.024719813838601112, + "rewards//mean": 0.7977294921875, + "rewards//std": 0.03512157127261162, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.056, + "grad_norm": 1.1642647981643677, + "kl": 0.01898731803521514, + "learning_rate": 9.947284817898492e-07, + "loss": 0.0019, + "num_tokens": 3241415.0, + "reward": 0.78839111328125, + "reward_std": 0.0184599868953228, + "rewards//mean": 0.78839111328125, + "rewards//std": 0.023193951696157455, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0562, + "grad_norm": 1.1425892114639282, + "kl": 0.017053907038643956, + "learning_rate": 9.946824237646824e-07, + "loss": 0.0024, + "num_tokens": 3252982.0, + "reward": 0.7890625, + "reward_std": 0.03209477663040161, + "rewards//mean": 0.7890625, + "rewards//std": 0.03771870210766792, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0564, + "grad_norm": 1.289402723312378, + "kl": 0.01572962081991136, + "learning_rate": 9.946361664814943e-07, + "loss": 0.0021, + "num_tokens": 3264490.0, + "reward": 0.79931640625, + "reward_std": 0.020042400807142258, + "rewards//mean": 0.79931640625, + "rewards//std": 0.022418100386857986, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0566, + "grad_norm": 1.1876749992370605, + "kl": 0.022140919812954962, + "learning_rate": 9.945897099589173e-07, + "loss": 0.0025, + "num_tokens": 3276030.0, + "reward": 0.7906494140625, + "reward_std": 0.021284427493810654, + "rewards//mean": 0.7906494140625, + "rewards//std": 0.022301318123936653, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0568, + "grad_norm": 1.366654396057129, + "kl": 0.01802377635613084, + "learning_rate": 9.945430542156646e-07, + "loss": -0.0005, + "num_tokens": 3287547.0, + "reward": 0.79937744140625, + "reward_std": 0.032744791358709335, + "rewards//mean": 0.79937744140625, + "rewards//std": 0.04402047023177147, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.057, + "grad_norm": 1.2828047275543213, + "kl": 0.01531179016456008, + "learning_rate": 9.944961992705286e-07, + "loss": 0.0002, + "num_tokens": 3299024.0, + "reward": 0.7967529296875, + "reward_std": 0.027618657797574997, + "rewards//mean": 0.7967529296875, + "rewards//std": 0.035760547965765, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0572, + "grad_norm": 1.0858430862426758, + "kl": 0.02132353908382356, + "learning_rate": 9.944491451423827e-07, + "loss": 0.0021, + "num_tokens": 3310632.0, + "reward": 0.79541015625, + "reward_std": 0.015554318204522133, + "rewards//mean": 0.79541015625, + "rewards//std": 0.016682064160704613, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0574, + "grad_norm": 1.2256423234939575, + "kl": 0.020300982054322958, + "learning_rate": 9.944018918501805e-07, + "loss": 0.0024, + "num_tokens": 3322230.0, + "reward": 0.798095703125, + "reward_std": 0.025434371083974838, + "rewards//mean": 0.798095703125, + "rewards//std": 0.030312664806842804, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0576, + "grad_norm": 1.4023293256759644, + "kl": 0.01897532306611538, + "learning_rate": 9.94354439412955e-07, + "loss": 0.0019, + "num_tokens": 3333827.0, + "reward": 0.79205322265625, + "reward_std": 0.0313570462167263, + "rewards//mean": 0.79205322265625, + "rewards//std": 0.034888215363025665, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0578, + "grad_norm": 1.0838700532913208, + "kl": 0.01536969211883843, + "learning_rate": 9.943067878498209e-07, + "loss": 0.0015, + "num_tokens": 3345483.0, + "reward": 0.80908203125, + "reward_std": 0.022052453830838203, + "rewards//mean": 0.80908203125, + "rewards//std": 0.02603701688349247, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.058, + "grad_norm": 1.140939712524414, + "kl": 0.0168439979897812, + "learning_rate": 9.942589371799714e-07, + "loss": 0.0017, + "num_tokens": 3357139.0, + "reward": 0.8055419921875, + "reward_std": 0.023088578134775162, + "rewards//mean": 0.8055419921875, + "rewards//std": 0.025069937109947205, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0582, + "grad_norm": 1.1185773611068726, + "kl": 0.017564289271831512, + "learning_rate": 9.94210887422681e-07, + "loss": 0.0018, + "num_tokens": 3368739.0, + "reward": 0.8018798828125, + "reward_std": 0.023507263511419296, + "rewards//mean": 0.8018798828125, + "rewards//std": 0.02985045686364174, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0584, + "grad_norm": 1.2815731763839722, + "kl": 0.018888774560764432, + "learning_rate": 9.941626385973047e-07, + "loss": 0.0108, + "num_tokens": 3380215.0, + "reward": 0.8089599609375, + "reward_std": 0.028653264045715332, + "rewards//mean": 0.8089599609375, + "rewards//std": 0.030724111944437027, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0586, + "grad_norm": 1.263424038887024, + "kl": 0.019111713161692023, + "learning_rate": 9.941141907232763e-07, + "loss": 0.0038, + "num_tokens": 3391812.0, + "reward": 0.81414794921875, + "reward_std": 0.028991851955652237, + "rewards//mean": 0.81414794921875, + "rewards//std": 0.031656619161367416, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0588, + "grad_norm": 1.3187525272369385, + "kl": 0.018116427527274936, + "learning_rate": 9.94065543820111e-07, + "loss": 0.0047, + "num_tokens": 3403426.0, + "reward": 0.8018798828125, + "reward_std": 0.025517089292407036, + "rewards//mean": 0.8018798828125, + "rewards//std": 0.03060365840792656, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.059, + "grad_norm": 1.1258151531219482, + "kl": 0.025043359491974115, + "learning_rate": 9.94016697907404e-07, + "loss": 0.0016, + "num_tokens": 3414902.0, + "reward": 0.82110595703125, + "reward_std": 0.027923719957470894, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.029216613620519638, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0592, + "grad_norm": 1.1002271175384521, + "kl": 0.01989887119270861, + "learning_rate": 9.9396765300483e-07, + "loss": 0.0049, + "num_tokens": 3426551.0, + "reward": 0.78936767578125, + "reward_std": 0.027759995311498642, + "rewards//mean": 0.78936767578125, + "rewards//std": 0.03387780115008354, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.0594, + "grad_norm": 1.1918543577194214, + "kl": 0.021069107460789382, + "learning_rate": 9.939184091321444e-07, + "loss": 0.0022, + "num_tokens": 3438074.0, + "reward": 0.80853271484375, + "reward_std": 0.024475054815411568, + "rewards//mean": 0.80853271484375, + "rewards//std": 0.028868958353996277, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0596, + "grad_norm": 1.2204145193099976, + "kl": 0.021538736298680305, + "learning_rate": 9.938689663091827e-07, + "loss": 0.009, + "num_tokens": 3449711.0, + "reward": 0.79779052734375, + "reward_std": 0.025363672524690628, + "rewards//mean": 0.79779052734375, + "rewards//std": 0.029917510226368904, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0598, + "grad_norm": 1.397246241569519, + "kl": 0.023905338952317834, + "learning_rate": 9.938193245558604e-07, + "loss": 0.0179, + "num_tokens": 3461242.0, + "reward": 0.81219482421875, + "reward_std": 0.031003689393401146, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.03485044464468956, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.06, + "grad_norm": 1.3407435417175293, + "kl": 0.025259471964091063, + "learning_rate": 9.937694838921733e-07, + "loss": 0.001, + "num_tokens": 3472789.0, + "reward": 0.77490234375, + "reward_std": 0.02786066010594368, + "rewards//mean": 0.77490234375, + "rewards//std": 0.03486569970846176, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0602, + "grad_norm": 1.2577412128448486, + "kl": 0.023199884220957756, + "learning_rate": 9.93719444338197e-07, + "loss": 0.0096, + "num_tokens": 3484399.0, + "reward": 0.7840576171875, + "reward_std": 0.024786051362752914, + "rewards//mean": 0.7840576171875, + "rewards//std": 0.02931004948914051, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.203125, + "epoch": 0.0604, + "grad_norm": 1.1602857112884521, + "kl": 0.025301249930635095, + "learning_rate": 9.936692059140878e-07, + "loss": 0.0029, + "num_tokens": 3495924.0, + "reward": 0.8021240234375, + "reward_std": 0.02409994788467884, + "rewards//mean": 0.8021240234375, + "rewards//std": 0.028681481257081032, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0606, + "grad_norm": 1.1144319772720337, + "kl": 0.02766710170544684, + "learning_rate": 9.936187686400814e-07, + "loss": 0.0024, + "num_tokens": 3507629.0, + "reward": 0.8157958984375, + "reward_std": 0.023020194843411446, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.03060959465801716, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0608, + "grad_norm": 1.1618551015853882, + "kl": 0.022473751567304134, + "learning_rate": 9.93568132536494e-07, + "loss": 0.0022, + "num_tokens": 3519213.0, + "reward": 0.7811279296875, + "reward_std": 0.026142794638872147, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.03012104518711567, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.061, + "grad_norm": 1.202674150466919, + "kl": 0.02251893631182611, + "learning_rate": 9.935172976237217e-07, + "loss": 0.0023, + "num_tokens": 3530821.0, + "reward": 0.81500244140625, + "reward_std": 0.023249033838510513, + "rewards//mean": 0.81500244140625, + "rewards//std": 0.025117216631770134, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.40625, + "epoch": 0.0612, + "grad_norm": 1.248274803161621, + "kl": 0.02426713122986257, + "learning_rate": 9.93466263922241e-07, + "loss": -0.0034, + "num_tokens": 3542335.0, + "reward": 0.796142578125, + "reward_std": 0.024836760014295578, + "rewards//mean": 0.796142578125, + "rewards//std": 0.03267369419336319, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0614, + "grad_norm": 1.381235122680664, + "kl": 0.02584023028612137, + "learning_rate": 9.934150314526083e-07, + "loss": 0.0042, + "num_tokens": 3553861.0, + "reward": 0.79718017578125, + "reward_std": 0.029303215444087982, + "rewards//mean": 0.79718017578125, + "rewards//std": 0.02955556847155094, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0616, + "grad_norm": 1.3703118562698364, + "kl": 0.02646248647943139, + "learning_rate": 9.933636002354599e-07, + "loss": 0.0005, + "num_tokens": 3565425.0, + "reward": 0.7718505859375, + "reward_std": 0.02536950632929802, + "rewards//mean": 0.7718505859375, + "rewards//std": 0.034186773002147675, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0618, + "grad_norm": 1.2688044309616089, + "kl": 0.0304298282135278, + "learning_rate": 9.933119702915124e-07, + "loss": 0.0026, + "num_tokens": 3576926.0, + "reward": 0.73028564453125, + "reward_std": 0.023411057889461517, + "rewards//mean": 0.73028564453125, + "rewards//std": 0.026451662182807922, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.062, + "grad_norm": 1.2057536840438843, + "kl": 0.0254519023001194, + "learning_rate": 9.93260141641562e-07, + "loss": 0.0034, + "num_tokens": 3588516.0, + "reward": 0.78173828125, + "reward_std": 0.028781825676560402, + "rewards//mean": 0.78173828125, + "rewards//std": 0.03147780895233154, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0622, + "grad_norm": 1.3810088634490967, + "kl": 0.029732419876381755, + "learning_rate": 9.932081143064858e-07, + "loss": 0.003, + "num_tokens": 3600012.0, + "reward": 0.79498291015625, + "reward_std": 0.02406536415219307, + "rewards//mean": 0.79498291015625, + "rewards//std": 0.027125800028443336, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0624, + "grad_norm": 1.1323539018630981, + "kl": 0.027476518414914608, + "learning_rate": 9.931558883072402e-07, + "loss": 0.0015, + "num_tokens": 3611640.0, + "reward": 0.80511474609375, + "reward_std": 0.022934796288609505, + "rewards//mean": 0.80511474609375, + "rewards//std": 0.02804011106491089, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0626, + "grad_norm": 1.4383113384246826, + "kl": 0.025777461705729365, + "learning_rate": 9.931034636648616e-07, + "loss": 0.0026, + "num_tokens": 3623184.0, + "reward": 0.6922607421875, + "reward_std": 0.027233479544520378, + "rewards//mean": 0.6922607421875, + "rewards//std": 0.03055613674223423, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0628, + "grad_norm": 1.2763906717300415, + "kl": 0.029940301086753607, + "learning_rate": 9.930508404004666e-07, + "loss": 0.0034, + "num_tokens": 3634703.0, + "reward": 0.81060791015625, + "reward_std": 0.027314525097608566, + "rewards//mean": 0.81060791015625, + "rewards//std": 0.033320486545562744, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.063, + "grad_norm": 1.1659605503082275, + "kl": 0.02346780290827155, + "learning_rate": 9.929980185352525e-07, + "loss": 0.0025, + "num_tokens": 3646206.0, + "reward": 0.78179931640625, + "reward_std": 0.02480373904109001, + "rewards//mean": 0.78179931640625, + "rewards//std": 0.029437018558382988, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0632, + "grad_norm": 1.1683433055877686, + "kl": 0.027125434949994087, + "learning_rate": 9.929449980904951e-07, + "loss": 0.0062, + "num_tokens": 3657759.0, + "reward": 0.80218505859375, + "reward_std": 0.030723676085472107, + "rewards//mean": 0.80218505859375, + "rewards//std": 0.03729487210512161, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0634, + "grad_norm": 1.2210272550582886, + "kl": 0.024885480874218047, + "learning_rate": 9.928917790875516e-07, + "loss": 0.0056, + "num_tokens": 3669370.0, + "reward": 0.79052734375, + "reward_std": 0.019992616027593613, + "rewards//mean": 0.79052734375, + "rewards//std": 0.02374039590358734, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0636, + "grad_norm": 1.2803411483764648, + "kl": 0.032130842097103596, + "learning_rate": 9.928383615478586e-07, + "loss": 0.0018, + "num_tokens": 3680862.0, + "reward": 0.82244873046875, + "reward_std": 0.025192122906446457, + "rewards//mean": 0.82244873046875, + "rewards//std": 0.02827076055109501, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0638, + "grad_norm": 1.1926652193069458, + "kl": 0.027993900002911687, + "learning_rate": 9.927847454929322e-07, + "loss": 0.0035, + "num_tokens": 3692401.0, + "reward": 0.80584716796875, + "reward_std": 0.02786526456475258, + "rewards//mean": 0.80584716796875, + "rewards//std": 0.0284735057502985, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.064, + "grad_norm": 1.1263068914413452, + "kl": 0.030394022120162845, + "learning_rate": 9.927309309443695e-07, + "loss": 0.003, + "num_tokens": 3703929.0, + "reward": 0.79681396484375, + "reward_std": 0.02531307376921177, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.026725510135293007, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0642, + "grad_norm": 1.1440457105636597, + "kl": 0.03007757500745356, + "learning_rate": 9.926769179238464e-07, + "loss": 0.0056, + "num_tokens": 3715440.0, + "reward": 0.8018798828125, + "reward_std": 0.03379879146814346, + "rewards//mean": 0.8018798828125, + "rewards//std": 0.03723704069852829, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0644, + "grad_norm": 1.4551879167556763, + "kl": 0.03213723236694932, + "learning_rate": 9.926227064531199e-07, + "loss": 0.0084, + "num_tokens": 3726874.0, + "reward": 0.77099609375, + "reward_std": 0.029695551842451096, + "rewards//mean": 0.77099609375, + "rewards//std": 0.030133355408906937, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.453125, + "epoch": 0.0646, + "grad_norm": 1.2525047063827515, + "kl": 0.040605742018669844, + "learning_rate": 9.925682965540263e-07, + "loss": 0.0056, + "num_tokens": 3738375.0, + "reward": 0.80010986328125, + "reward_std": 0.01977054961025715, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.027050357311964035, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0648, + "grad_norm": 1.3429172039031982, + "kl": 0.034672586945816875, + "learning_rate": 9.925136882484815e-07, + "loss": 0.0106, + "num_tokens": 3749894.0, + "reward": 0.80419921875, + "reward_std": 0.027832452207803726, + "rewards//mean": 0.80419921875, + "rewards//std": 0.02808656543493271, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.065, + "grad_norm": 1.4187966585159302, + "kl": 0.02910194161813706, + "learning_rate": 9.92458881558482e-07, + "loss": 0.0065, + "num_tokens": 3761486.0, + "reward": 0.78460693359375, + "reward_std": 0.027457643300294876, + "rewards//mean": 0.78460693359375, + "rewards//std": 0.02987801842391491, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0652, + "grad_norm": 1.3828402757644653, + "kl": 0.029935470316559076, + "learning_rate": 9.92403876506104e-07, + "loss": 0.0045, + "num_tokens": 3772971.0, + "reward": 0.8255615234375, + "reward_std": 0.02778216078877449, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.033037446439266205, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "epoch": 0.0654, + "grad_norm": 1.2401894330978394, + "kl": 0.02992502530105412, + "learning_rate": 9.923486731135033e-07, + "loss": -0.0117, + "num_tokens": 3784523.0, + "reward": 0.794921875, + "reward_std": 0.02845122292637825, + "rewards//mean": 0.794921875, + "rewards//std": 0.03391135483980179, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0656, + "grad_norm": 1.1398768424987793, + "kl": 0.026519964216277003, + "learning_rate": 9.922932714029163e-07, + "loss": 0.0024, + "num_tokens": 3795987.0, + "reward": 0.79254150390625, + "reward_std": 0.023310715332627296, + "rewards//mean": 0.79254150390625, + "rewards//std": 0.02576223947107792, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.0658, + "grad_norm": 1.2348361015319824, + "kl": 0.03295831196010113, + "learning_rate": 9.92237671396658e-07, + "loss": 0.0074, + "num_tokens": 3807508.0, + "reward": 0.8155517578125, + "reward_std": 0.02153700962662697, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.023596813902258873, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.066, + "grad_norm": 1.3914519548416138, + "kl": 0.030958315823227167, + "learning_rate": 9.921818731171248e-07, + "loss": 0.0053, + "num_tokens": 3819043.0, + "reward": 0.79833984375, + "reward_std": 0.025814559310674667, + "rewards//mean": 0.79833984375, + "rewards//std": 0.031523942947387695, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0662, + "grad_norm": 1.3525701761245728, + "kl": 0.03262951271608472, + "learning_rate": 9.921258765867919e-07, + "loss": 0.0028, + "num_tokens": 3830720.0, + "reward": 0.76580810546875, + "reward_std": 0.025150394067168236, + "rewards//mean": 0.76580810546875, + "rewards//std": 0.030524104833602905, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0664, + "grad_norm": 1.226157307624817, + "kl": 0.03116460796445608, + "learning_rate": 9.920696818282147e-07, + "loss": -0.0049, + "num_tokens": 3842224.0, + "reward": 0.7908935546875, + "reward_std": 0.025506190955638885, + "rewards//mean": 0.7908935546875, + "rewards//std": 0.029272839426994324, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0666, + "grad_norm": 1.1195716857910156, + "kl": 0.031989158829674125, + "learning_rate": 9.920132888640284e-07, + "loss": 0.0073, + "num_tokens": 3853735.0, + "reward": 0.7935791015625, + "reward_std": 0.027827303856611252, + "rewards//mean": 0.7935791015625, + "rewards//std": 0.031698014587163925, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0668, + "grad_norm": 1.1506812572479248, + "kl": 0.02873358433134854, + "learning_rate": 9.919566977169485e-07, + "loss": 0.0035, + "num_tokens": 3865370.0, + "reward": 0.798583984375, + "reward_std": 0.021428434178233147, + "rewards//mean": 0.798583984375, + "rewards//std": 0.029878081753849983, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.067, + "grad_norm": 1.2587814331054688, + "kl": 0.033506108447909355, + "learning_rate": 9.918999084097694e-07, + "loss": -0.006, + "num_tokens": 3876955.0, + "reward": 0.8260498046875, + "reward_std": 0.022423313930630684, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.023354357108473778, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0672, + "grad_norm": 1.238105297088623, + "kl": 0.029267802834510803, + "learning_rate": 9.91842920965366e-07, + "loss": 0.0104, + "num_tokens": 3888496.0, + "reward": 0.77783203125, + "reward_std": 0.022726602852344513, + "rewards//mean": 0.77783203125, + "rewards//std": 0.027432184666395187, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0674, + "grad_norm": 1.158074975013733, + "kl": 0.031539147486910224, + "learning_rate": 9.91785735406693e-07, + "loss": 0.0033, + "num_tokens": 3900134.0, + "reward": 0.80023193359375, + "reward_std": 0.030601641163229942, + "rewards//mean": 0.80023193359375, + "rewards//std": 0.037724271416664124, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0676, + "grad_norm": 1.4569143056869507, + "kl": 0.031348762568086386, + "learning_rate": 9.917283517567843e-07, + "loss": 0.0052, + "num_tokens": 3911703.0, + "reward": 0.76611328125, + "reward_std": 0.030468005686998367, + "rewards//mean": 0.76611328125, + "rewards//std": 0.032192960381507874, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0678, + "grad_norm": 1.3769904375076294, + "kl": 0.03177173831500113, + "learning_rate": 9.916707700387545e-07, + "loss": 0.0002, + "num_tokens": 3923323.0, + "reward": 0.80963134765625, + "reward_std": 0.031262271106243134, + "rewards//mean": 0.80963134765625, + "rewards//std": 0.03489862382411957, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.068, + "grad_norm": 1.225389838218689, + "kl": 0.03257970581762493, + "learning_rate": 9.916129902757974e-07, + "loss": 0.0033, + "num_tokens": 3934947.0, + "reward": 0.81158447265625, + "reward_std": 0.025010230019688606, + "rewards//mean": 0.81158447265625, + "rewards//std": 0.029328828677535057, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0682, + "grad_norm": 1.203823208808899, + "kl": 0.03133397805504501, + "learning_rate": 9.915550124911866e-07, + "loss": -0.0015, + "num_tokens": 3946697.0, + "reward": 0.79888916015625, + "reward_std": 0.021743791177868843, + "rewards//mean": 0.79888916015625, + "rewards//std": 0.02500728704035282, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0684, + "grad_norm": 1.1600826978683472, + "kl": 0.03172787418588996, + "learning_rate": 9.914968367082755e-07, + "loss": 0.0019, + "num_tokens": 3958229.0, + "reward": 0.77716064453125, + "reward_std": 0.025884857401251793, + "rewards//mean": 0.77716064453125, + "rewards//std": 0.029540199786424637, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0686, + "grad_norm": 1.131107211112976, + "kl": 0.03289396036416292, + "learning_rate": 9.914384629504973e-07, + "loss": 0.0041, + "num_tokens": 3969811.0, + "reward": 0.8204345703125, + "reward_std": 0.02523455023765564, + "rewards//mean": 0.8204345703125, + "rewards//std": 0.025854693725705147, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0688, + "grad_norm": 1.1898224353790283, + "kl": 0.03414563275873661, + "learning_rate": 9.913798912413652e-07, + "loss": -0.0088, + "num_tokens": 3981361.0, + "reward": 0.8079833984375, + "reward_std": 0.022591372951865196, + "rewards//mean": 0.8079833984375, + "rewards//std": 0.02468782663345337, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.069, + "grad_norm": 1.2624619007110596, + "kl": 0.034018988721072674, + "learning_rate": 9.913211216044713e-07, + "loss": -0.0022, + "num_tokens": 3992887.0, + "reward": 0.7962646484375, + "reward_std": 0.02231900580227375, + "rewards//mean": 0.7962646484375, + "rewards//std": 0.02758818306028843, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.0692, + "grad_norm": 1.2249315977096558, + "kl": 0.036993842804804444, + "learning_rate": 9.912621540634886e-07, + "loss": 0.0069, + "num_tokens": 4004527.0, + "reward": 0.8157958984375, + "reward_std": 0.02640628069639206, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.02935752458870411, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0694, + "grad_norm": 1.4077168703079224, + "kl": 0.041965661104768515, + "learning_rate": 9.91202988642169e-07, + "loss": 0.0042, + "num_tokens": 4016031.0, + "reward": 0.77655029296875, + "reward_std": 0.023539628833532333, + "rewards//mean": 0.77655029296875, + "rewards//std": 0.02671361342072487, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0696, + "grad_norm": 1.217477560043335, + "kl": 0.03215849818661809, + "learning_rate": 9.911436253643443e-07, + "loss": 0.0032, + "num_tokens": 4027623.0, + "reward": 0.788330078125, + "reward_std": 0.02636794187128544, + "rewards//mean": 0.788330078125, + "rewards//std": 0.029731810092926025, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0698, + "grad_norm": 1.1424661874771118, + "kl": 0.035533767426386476, + "learning_rate": 9.91084064253926e-07, + "loss": 0.004, + "num_tokens": 4039206.0, + "reward": 0.7974853515625, + "reward_std": 0.025544654577970505, + "rewards//mean": 0.7974853515625, + "rewards//std": 0.028824683278799057, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.07, + "grad_norm": 1.4300824403762817, + "kl": 0.035149377305060625, + "learning_rate": 9.910243053349055e-07, + "loss": -0.0015, + "num_tokens": 4050727.0, + "reward": 0.805908203125, + "reward_std": 0.019575463607907295, + "rewards//mean": 0.805908203125, + "rewards//std": 0.024497317150235176, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0702, + "grad_norm": 1.2772653102874756, + "kl": 0.035782160237431526, + "learning_rate": 9.909643486313533e-07, + "loss": -0.0062, + "num_tokens": 4062278.0, + "reward": 0.78118896484375, + "reward_std": 0.021593401208519936, + "rewards//mean": 0.78118896484375, + "rewards//std": 0.025050833821296692, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0704, + "grad_norm": 1.2039093971252441, + "kl": 0.03318748972378671, + "learning_rate": 9.909041941674204e-07, + "loss": 0.0061, + "num_tokens": 4073847.0, + "reward": 0.8140869140625, + "reward_std": 0.02752489596605301, + "rewards//mean": 0.8140869140625, + "rewards//std": 0.03034336306154728, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0706, + "grad_norm": 1.1472655534744263, + "kl": 0.03346814797259867, + "learning_rate": 9.908438419673366e-07, + "loss": 0.0043, + "num_tokens": 4085349.0, + "reward": 0.8123779296875, + "reward_std": 0.01901896297931671, + "rewards//mean": 0.8123779296875, + "rewards//std": 0.020902609452605247, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0708, + "grad_norm": 1.1409374475479126, + "kl": 0.03465895843692124, + "learning_rate": 9.90783292055412e-07, + "loss": 0.0022, + "num_tokens": 4096942.0, + "reward": 0.79925537109375, + "reward_std": 0.019223207607865334, + "rewards//mean": 0.79925537109375, + "rewards//std": 0.022070953622460365, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.071, + "grad_norm": 1.3787992000579834, + "kl": 0.03627641638740897, + "learning_rate": 9.907225444560361e-07, + "loss": 0.0018, + "num_tokens": 4108491.0, + "reward": 0.79766845703125, + "reward_std": 0.024145551025867462, + "rewards//mean": 0.79766845703125, + "rewards//std": 0.026237910613417625, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0712, + "grad_norm": 1.2785499095916748, + "kl": 0.03738506883382797, + "learning_rate": 9.90661599193678e-07, + "loss": -0.0039, + "num_tokens": 4120143.0, + "reward": 0.79388427734375, + "reward_std": 0.02348959445953369, + "rewards//mean": 0.79388427734375, + "rewards//std": 0.028200527653098106, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0714, + "grad_norm": 1.241267442703247, + "kl": 0.035516455536708236, + "learning_rate": 9.906004562928863e-07, + "loss": 0.0044, + "num_tokens": 4131681.0, + "reward": 0.80291748046875, + "reward_std": 0.01918039843440056, + "rewards//mean": 0.80291748046875, + "rewards//std": 0.021417247131466866, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0716, + "grad_norm": 1.3162062168121338, + "kl": 0.03826586063951254, + "learning_rate": 9.905391157782897e-07, + "loss": 0.0046, + "num_tokens": 4143145.0, + "reward": 0.83905029296875, + "reward_std": 0.025836888700723648, + "rewards//mean": 0.83905029296875, + "rewards//std": 0.033161554485559464, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0718, + "grad_norm": 1.2055145502090454, + "kl": 0.03755038604140282, + "learning_rate": 9.904775776745956e-07, + "loss": 0.0063, + "num_tokens": 4154670.0, + "reward": 0.793212890625, + "reward_std": 0.03611326962709427, + "rewards//mean": 0.793212890625, + "rewards//std": 0.039999157190322876, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.072, + "grad_norm": 1.2966399192810059, + "kl": 0.03584053018130362, + "learning_rate": 9.904158420065922e-07, + "loss": 0.0052, + "num_tokens": 4166302.0, + "reward": 0.8045654296875, + "reward_std": 0.024638304486870766, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.029248006641864777, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0722, + "grad_norm": 1.2763437032699585, + "kl": 0.043002511374652386, + "learning_rate": 9.903539087991461e-07, + "loss": 0.0018, + "num_tokens": 4177727.0, + "reward": 0.79241943359375, + "reward_std": 0.029620736837387085, + "rewards//mean": 0.79241943359375, + "rewards//std": 0.036063358187675476, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0724, + "grad_norm": 1.3326338529586792, + "kl": 0.035948282573372126, + "learning_rate": 9.902917780772042e-07, + "loss": 0.0033, + "num_tokens": 4189240.0, + "reward": 0.82403564453125, + "reward_std": 0.023061469197273254, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.0274403914809227, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0726, + "grad_norm": 1.2857993841171265, + "kl": 0.03505432326346636, + "learning_rate": 9.902294498657929e-07, + "loss": 0.0048, + "num_tokens": 4200934.0, + "reward": 0.80694580078125, + "reward_std": 0.03454373776912689, + "rewards//mean": 0.80694580078125, + "rewards//std": 0.04150828346610069, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0728, + "grad_norm": 1.1578078269958496, + "kl": 0.03790907934308052, + "learning_rate": 9.901669241900176e-07, + "loss": 0.0027, + "num_tokens": 4212407.0, + "reward": 0.8111572265625, + "reward_std": 0.02655794471502304, + "rewards//mean": 0.8111572265625, + "rewards//std": 0.031774330884218216, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.073, + "grad_norm": 1.2454404830932617, + "kl": 0.03847545967437327, + "learning_rate": 9.90104201075064e-07, + "loss": 0.0031, + "num_tokens": 4223996.0, + "reward": 0.79736328125, + "reward_std": 0.02928919345140457, + "rewards//mean": 0.79736328125, + "rewards//std": 0.03760616108775139, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0732, + "grad_norm": 1.2818838357925415, + "kl": 0.03853247966617346, + "learning_rate": 9.900412805461966e-07, + "loss": -0.009, + "num_tokens": 4235663.0, + "reward": 0.78045654296875, + "reward_std": 0.026578780263662338, + "rewards//mean": 0.78045654296875, + "rewards//std": 0.03302982449531555, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0734, + "grad_norm": 1.1701369285583496, + "kl": 0.03944171965122223, + "learning_rate": 9.899781626287602e-07, + "loss": 0.0039, + "num_tokens": 4247391.0, + "reward": 0.80755615234375, + "reward_std": 0.03193872421979904, + "rewards//mean": 0.80755615234375, + "rewards//std": 0.0383506715297699, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0736, + "grad_norm": 1.344720482826233, + "kl": 0.042347042355686426, + "learning_rate": 9.899148473481784e-07, + "loss": 0.0042, + "num_tokens": 4258951.0, + "reward": 0.82110595703125, + "reward_std": 0.022355489432811737, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.02412174642086029, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0738, + "grad_norm": 1.3763811588287354, + "kl": 0.04303055093623698, + "learning_rate": 9.898513347299547e-07, + "loss": 0.0058, + "num_tokens": 4270499.0, + "reward": 0.81365966796875, + "reward_std": 0.030093904584646225, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.03360773250460625, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.074, + "grad_norm": 1.178135871887207, + "kl": 0.04423736780881882, + "learning_rate": 9.89787624799672e-07, + "loss": 0.0038, + "num_tokens": 4282037.0, + "reward": 0.81842041015625, + "reward_std": 0.029096785932779312, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.03384695574641228, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.90625, + "epoch": 0.0742, + "grad_norm": 1.2565642595291138, + "kl": 0.041873099748045206, + "learning_rate": 9.897237175829926e-07, + "loss": 0.0149, + "num_tokens": 4293471.0, + "reward": 0.82244873046875, + "reward_std": 0.024692248553037643, + "rewards//mean": 0.82244873046875, + "rewards//std": 0.028738096356391907, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0744, + "grad_norm": 1.5979794263839722, + "kl": 0.04749782243743539, + "learning_rate": 9.896596131056582e-07, + "loss": -0.0014, + "num_tokens": 4305017.0, + "reward": 0.8233642578125, + "reward_std": 0.022950297221541405, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.022696930915117264, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0746, + "grad_norm": 1.291281819343567, + "kl": 0.04409839352592826, + "learning_rate": 9.895953113934903e-07, + "loss": 0.0136, + "num_tokens": 4316548.0, + "reward": 0.81134033203125, + "reward_std": 0.028813570737838745, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.038347117602825165, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0748, + "grad_norm": 1.2957156896591187, + "kl": 0.046250255312770605, + "learning_rate": 9.895308124723896e-07, + "loss": 0.0046, + "num_tokens": 4328236.0, + "reward": 0.7904052734375, + "reward_std": 0.026066003367304802, + "rewards//mean": 0.7904052734375, + "rewards//std": 0.030955791473388672, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.075, + "grad_norm": 1.324583888053894, + "kl": 0.05067467666231096, + "learning_rate": 9.89466116368336e-07, + "loss": 0.0086, + "num_tokens": 4339771.0, + "reward": 0.7989501953125, + "reward_std": 0.023396246135234833, + "rewards//mean": 0.7989501953125, + "rewards//std": 0.029773274436593056, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0752, + "grad_norm": 1.2587722539901733, + "kl": 0.04225762444548309, + "learning_rate": 9.894012231073895e-07, + "loss": 0.0011, + "num_tokens": 4351322.0, + "reward": 0.7930908203125, + "reward_std": 0.021248944103717804, + "rewards//mean": 0.7930908203125, + "rewards//std": 0.032188963145017624, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0754, + "grad_norm": 1.2941175699234009, + "kl": 0.03811949072405696, + "learning_rate": 9.893361327156884e-07, + "loss": 0.0049, + "num_tokens": 4362943.0, + "reward": 0.827392578125, + "reward_std": 0.02312588505446911, + "rewards//mean": 0.827392578125, + "rewards//std": 0.02548583783209324, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0756, + "grad_norm": 1.2711883783340454, + "kl": 0.03777657821774483, + "learning_rate": 9.89270845219452e-07, + "loss": 0.0038, + "num_tokens": 4374503.0, + "reward": 0.803466796875, + "reward_std": 0.025391606613993645, + "rewards//mean": 0.803466796875, + "rewards//std": 0.032210823148489, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0758, + "grad_norm": 1.2160468101501465, + "kl": 0.048937785904854536, + "learning_rate": 9.892053606449774e-07, + "loss": 0.0162, + "num_tokens": 4386044.0, + "reward": 0.8319091796875, + "reward_std": 0.031720198690891266, + "rewards//mean": 0.8319091796875, + "rewards//std": 0.03744783625006676, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.076, + "grad_norm": 1.2322711944580078, + "kl": 0.04837035248056054, + "learning_rate": 9.891396790186422e-07, + "loss": 0.0077, + "num_tokens": 4397571.0, + "reward": 0.81060791015625, + "reward_std": 0.03173164278268814, + "rewards//mean": 0.81060791015625, + "rewards//std": 0.0382605716586113, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0762, + "grad_norm": 1.2089402675628662, + "kl": 0.04401159449480474, + "learning_rate": 9.890738003669027e-07, + "loss": 0.0044, + "num_tokens": 4409123.0, + "reward": 0.807373046875, + "reward_std": 0.021978460252285004, + "rewards//mean": 0.807373046875, + "rewards//std": 0.035962529480457306, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.984375, + "epoch": 0.0764, + "grad_norm": 1.355175495147705, + "kl": 0.04720056802034378, + "learning_rate": 9.89007724716295e-07, + "loss": 0.008, + "num_tokens": 4420762.0, + "reward": 0.8157958984375, + "reward_std": 0.027678146958351135, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.03619978204369545, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0766, + "grad_norm": 1.3728119134902954, + "kl": 0.04904782539233565, + "learning_rate": 9.889414520934343e-07, + "loss": 0.0035, + "num_tokens": 4432336.0, + "reward": 0.79449462890625, + "reward_std": 0.040972404181957245, + "rewards//mean": 0.79449462890625, + "rewards//std": 0.040172819048166275, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0768, + "grad_norm": 1.3392159938812256, + "kl": 0.05838443525135517, + "learning_rate": 9.88874982525015e-07, + "loss": 0.0057, + "num_tokens": 4443893.0, + "reward": 0.814208984375, + "reward_std": 0.026552587747573853, + "rewards//mean": 0.814208984375, + "rewards//std": 0.03254743292927742, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.077, + "grad_norm": 1.2945582866668701, + "kl": 0.04591795476153493, + "learning_rate": 9.888083160378112e-07, + "loss": 0.0046, + "num_tokens": 4455437.0, + "reward": 0.789794921875, + "reward_std": 0.01999097689986229, + "rewards//mean": 0.789794921875, + "rewards//std": 0.026345649734139442, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0772, + "grad_norm": 1.2085317373275757, + "kl": 0.04750163294374943, + "learning_rate": 9.887414526586763e-07, + "loss": 0.0055, + "num_tokens": 4466984.0, + "reward": 0.798095703125, + "reward_std": 0.02421397902071476, + "rewards//mean": 0.798095703125, + "rewards//std": 0.03376730903983116, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0774, + "grad_norm": 1.210067868232727, + "kl": 0.05140670109540224, + "learning_rate": 9.886743924145426e-07, + "loss": 0.0016, + "num_tokens": 4478519.0, + "reward": 0.81134033203125, + "reward_std": 0.03145097941160202, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.036469489336013794, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.0776, + "grad_norm": 1.3598098754882812, + "kl": 0.05385316349565983, + "learning_rate": 9.886071353324222e-07, + "loss": 0.012, + "num_tokens": 4490019.0, + "reward": 0.81549072265625, + "reward_std": 0.029430070891976357, + "rewards//mean": 0.81549072265625, + "rewards//std": 0.036595869809389114, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.0778, + "grad_norm": 1.4910199642181396, + "kl": 0.052869205828756094, + "learning_rate": 9.88539681439406e-07, + "loss": 0.0132, + "num_tokens": 4501557.0, + "reward": 0.7974853515625, + "reward_std": 0.0280414167791605, + "rewards//mean": 0.7974853515625, + "rewards//std": 0.03679043799638748, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.078, + "grad_norm": 1.1800705194473267, + "kl": 0.056939542293548584, + "learning_rate": 9.884720307626646e-07, + "loss": 0.0186, + "num_tokens": 4513106.0, + "reward": 0.80328369140625, + "reward_std": 0.02333611249923706, + "rewards//mean": 0.80328369140625, + "rewards//std": 0.024757858365774155, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.328125, + "epoch": 0.0782, + "grad_norm": 1.3258925676345825, + "kl": 0.04753875592723489, + "learning_rate": 9.884041833294475e-07, + "loss": -0.0147, + "num_tokens": 4524615.0, + "reward": 0.823974609375, + "reward_std": 0.038292597979307175, + "rewards//mean": 0.823974609375, + "rewards//std": 0.040671687573194504, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0784, + "grad_norm": 1.249348521232605, + "kl": 0.055820658802986145, + "learning_rate": 9.883361391670839e-07, + "loss": 0.015, + "num_tokens": 4536128.0, + "reward": 0.75, + "reward_std": 0.025190230458974838, + "rewards//mean": 0.75, + "rewards//std": 0.026784423738718033, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0786, + "grad_norm": 1.187734603881836, + "kl": 0.047443247865885496, + "learning_rate": 9.882678983029817e-07, + "loss": 0.0047, + "num_tokens": 4547768.0, + "reward": 0.78564453125, + "reward_std": 0.026222210377454758, + "rewards//mean": 0.78564453125, + "rewards//std": 0.02630539983510971, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0788, + "grad_norm": 1.4220222234725952, + "kl": 0.050050461664795876, + "learning_rate": 9.881994607646286e-07, + "loss": 0.0016, + "num_tokens": 4559294.0, + "reward": 0.772216796875, + "reward_std": 0.02526206150650978, + "rewards//mean": 0.772216796875, + "rewards//std": 0.03432222083210945, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.079, + "grad_norm": 1.3015559911727905, + "kl": 0.049576858757063746, + "learning_rate": 9.881308265795911e-07, + "loss": 0.005, + "num_tokens": 4570846.0, + "reward": 0.82275390625, + "reward_std": 0.030672501772642136, + "rewards//mean": 0.82275390625, + "rewards//std": 0.03324413299560547, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0792, + "grad_norm": 1.262276530265808, + "kl": 0.05481417058035731, + "learning_rate": 9.88061995775515e-07, + "loss": 0.006, + "num_tokens": 4582507.0, + "reward": 0.81207275390625, + "reward_std": 0.02057238109409809, + "rewards//mean": 0.81207275390625, + "rewards//std": 0.028680624440312386, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0794, + "grad_norm": 1.56681489944458, + "kl": 0.05259015364572406, + "learning_rate": 9.879929683801253e-07, + "loss": 0.0059, + "num_tokens": 4594002.0, + "reward": 0.8333740234375, + "reward_std": 0.03237863630056381, + "rewards//mean": 0.8333740234375, + "rewards//std": 0.041235558688640594, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0796, + "grad_norm": 1.1908644437789917, + "kl": 0.060262958984822035, + "learning_rate": 9.879237444212264e-07, + "loss": 0.0048, + "num_tokens": 4605591.0, + "reward": 0.79693603515625, + "reward_std": 0.02268817275762558, + "rewards//mean": 0.79693603515625, + "rewards//std": 0.023617573082447052, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0798, + "grad_norm": 1.2083261013031006, + "kl": 0.0529950219206512, + "learning_rate": 9.878543239267014e-07, + "loss": 0.0195, + "num_tokens": 4617144.0, + "reward": 0.798583984375, + "reward_std": 0.027390284463763237, + "rewards//mean": 0.798583984375, + "rewards//std": 0.02986186370253563, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.08, + "grad_norm": 1.146141767501831, + "kl": 0.04584077885374427, + "learning_rate": 9.877847069245133e-07, + "loss": 0.0012, + "num_tokens": 4628772.0, + "reward": 0.80224609375, + "reward_std": 0.026287127286195755, + "rewards//mean": 0.80224609375, + "rewards//std": 0.033120047301054, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0802, + "grad_norm": 1.351252555847168, + "kl": 0.050411889562383294, + "learning_rate": 9.877148934427035e-07, + "loss": 0.001, + "num_tokens": 4640324.0, + "reward": 0.80206298828125, + "reward_std": 0.020917750895023346, + "rewards//mean": 0.80206298828125, + "rewards//std": 0.027401749044656754, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0804, + "grad_norm": 1.2814686298370361, + "kl": 0.052924488205462694, + "learning_rate": 9.876448835093929e-07, + "loss": 0.007, + "num_tokens": 4651945.0, + "reward": 0.8076171875, + "reward_std": 0.026150967925786972, + "rewards//mean": 0.8076171875, + "rewards//std": 0.0312344953417778, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0806, + "grad_norm": 1.1983723640441895, + "kl": 0.0572818573564291, + "learning_rate": 9.875746771527815e-07, + "loss": 0.0029, + "num_tokens": 4663484.0, + "reward": 0.7974853515625, + "reward_std": 0.023921016603708267, + "rewards//mean": 0.7974853515625, + "rewards//std": 0.02772829495370388, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0808, + "grad_norm": 1.0655951499938965, + "kl": 0.06574817141517997, + "learning_rate": 9.875042744011486e-07, + "loss": 0.0052, + "num_tokens": 4674978.0, + "reward": 0.8021240234375, + "reward_std": 0.01946561224758625, + "rewards//mean": 0.8021240234375, + "rewards//std": 0.0209055058658123, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.081, + "grad_norm": 1.3169454336166382, + "kl": 0.060431577265262604, + "learning_rate": 9.874336752828522e-07, + "loss": 0.006, + "num_tokens": 4686498.0, + "reward": 0.82196044921875, + "reward_std": 0.022618073970079422, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.027803190052509308, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0812, + "grad_norm": 1.2333340644836426, + "kl": 0.04891548119485378, + "learning_rate": 9.873628798263295e-07, + "loss": 0.0042, + "num_tokens": 4698087.0, + "reward": 0.81927490234375, + "reward_std": 0.028938330709934235, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.03549046441912651, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0814, + "grad_norm": 1.1734318733215332, + "kl": 0.0535324621014297, + "learning_rate": 9.872918880600973e-07, + "loss": 0.0054, + "num_tokens": 4709767.0, + "reward": 0.81365966796875, + "reward_std": 0.024313898757100105, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.030303113162517548, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0816, + "grad_norm": 1.3884910345077515, + "kl": 0.05825360957533121, + "learning_rate": 9.87220700012751e-07, + "loss": 0.0047, + "num_tokens": 4721355.0, + "reward": 0.810791015625, + "reward_std": 0.030731182545423508, + "rewards//mean": 0.810791015625, + "rewards//std": 0.03358030319213867, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0818, + "grad_norm": 1.4319816827774048, + "kl": 0.04891437548212707, + "learning_rate": 9.871493157129647e-07, + "loss": 0.0049, + "num_tokens": 4732963.0, + "reward": 0.79425048828125, + "reward_std": 0.028765326365828514, + "rewards//mean": 0.79425048828125, + "rewards//std": 0.03264536336064339, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.082, + "grad_norm": 1.1647368669509888, + "kl": 0.04496137611567974, + "learning_rate": 9.870777351894926e-07, + "loss": 0.0045, + "num_tokens": 4744571.0, + "reward": 0.80889892578125, + "reward_std": 0.027914132922887802, + "rewards//mean": 0.80889892578125, + "rewards//std": 0.035830482840538025, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0822, + "grad_norm": 1.1405107975006104, + "kl": 0.054889919236302376, + "learning_rate": 9.870059584711668e-07, + "loss": 0.0088, + "num_tokens": 4756165.0, + "reward": 0.78594970703125, + "reward_std": 0.020538128912448883, + "rewards//mean": 0.78594970703125, + "rewards//std": 0.024470653384923935, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0824, + "grad_norm": 1.2859779596328735, + "kl": 0.06602783408015966, + "learning_rate": 9.869339855868991e-07, + "loss": 0.0063, + "num_tokens": 4767680.0, + "reward": 0.81390380859375, + "reward_std": 0.029978923499584198, + "rewards//mean": 0.81390380859375, + "rewards//std": 0.03428333252668381, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0826, + "grad_norm": 1.2792450189590454, + "kl": 0.052898318041116, + "learning_rate": 9.868618165656804e-07, + "loss": 0.0053, + "num_tokens": 4779268.0, + "reward": 0.8228759765625, + "reward_std": 0.018871277570724487, + "rewards//mean": 0.8228759765625, + "rewards//std": 0.024441329762339592, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0828, + "grad_norm": 1.302666187286377, + "kl": 0.058500540908426046, + "learning_rate": 9.8678945143658e-07, + "loss": 0.0059, + "num_tokens": 4790750.0, + "reward": 0.78082275390625, + "reward_std": 0.01989973708987236, + "rewards//mean": 0.78082275390625, + "rewards//std": 0.02306370809674263, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.083, + "grad_norm": 1.556481122970581, + "kl": 0.058490983210504055, + "learning_rate": 9.86716890228747e-07, + "loss": 0.0145, + "num_tokens": 4802333.0, + "reward": 0.7874755859375, + "reward_std": 0.03204387053847313, + "rewards//mean": 0.7874755859375, + "rewards//std": 0.03524720296263695, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0832, + "grad_norm": 1.2481883764266968, + "kl": 0.054078247398138046, + "learning_rate": 9.866441329714087e-07, + "loss": 0.0057, + "num_tokens": 4813907.0, + "reward": 0.82232666015625, + "reward_std": 0.02630779705941677, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.02842508628964424, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.21875, + "epoch": 0.0834, + "grad_norm": 1.4550602436065674, + "kl": 0.05495173716917634, + "learning_rate": 9.86571179693872e-07, + "loss": -0.0079, + "num_tokens": 4825481.0, + "reward": 0.81396484375, + "reward_std": 0.022040415555238724, + "rewards//mean": 0.81396484375, + "rewards//std": 0.02382187359035015, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0836, + "grad_norm": 1.2795768976211548, + "kl": 0.05718524754047394, + "learning_rate": 9.86498030425522e-07, + "loss": 0.0085, + "num_tokens": 4836991.0, + "reward": 0.815673828125, + "reward_std": 0.021860117092728615, + "rewards//mean": 0.815673828125, + "rewards//std": 0.025514332577586174, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0838, + "grad_norm": 1.2054413557052612, + "kl": 0.05543432803824544, + "learning_rate": 9.864246851958237e-07, + "loss": 0.0043, + "num_tokens": 4848527.0, + "reward": 0.81500244140625, + "reward_std": 0.02505394071340561, + "rewards//mean": 0.81500244140625, + "rewards//std": 0.032446760684251785, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.084, + "grad_norm": 1.3820056915283203, + "kl": 0.06666966481134295, + "learning_rate": 9.863511440343205e-07, + "loss": 0.0081, + "num_tokens": 4860027.0, + "reward": 0.8231201171875, + "reward_std": 0.02914276346564293, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.03419208526611328, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.875, + "epoch": 0.0842, + "grad_norm": 1.2565011978149414, + "kl": 0.056452089454978704, + "learning_rate": 9.862774069706345e-07, + "loss": 0.0189, + "num_tokens": 4871515.0, + "reward": 0.81060791015625, + "reward_std": 0.029188329353928566, + "rewards//mean": 0.81060791015625, + "rewards//std": 0.032859355211257935, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0844, + "grad_norm": 1.2319518327713013, + "kl": 0.055275293067097664, + "learning_rate": 9.862034740344671e-07, + "loss": 0.0021, + "num_tokens": 4883093.0, + "reward": 0.82177734375, + "reward_std": 0.02431487664580345, + "rewards//mean": 0.82177734375, + "rewards//std": 0.02667115069925785, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0846, + "grad_norm": 1.5798003673553467, + "kl": 0.05668703466653824, + "learning_rate": 9.861293452555986e-07, + "loss": 0.0092, + "num_tokens": 4894563.0, + "reward": 0.81317138671875, + "reward_std": 0.03155597299337387, + "rewards//mean": 0.81317138671875, + "rewards//std": 0.036268871277570724, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.0848, + "grad_norm": 1.2722678184509277, + "kl": 0.0675958339124918, + "learning_rate": 9.86055020663888e-07, + "loss": -0.0067, + "num_tokens": 4906079.0, + "reward": 0.81207275390625, + "reward_std": 0.021274980157613754, + "rewards//mean": 0.81207275390625, + "rewards//std": 0.02613559365272522, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.085, + "grad_norm": 1.2663617134094238, + "kl": 0.06123335612937808, + "learning_rate": 9.859805002892731e-07, + "loss": 0.0073, + "num_tokens": 4917580.0, + "reward": 0.80029296875, + "reward_std": 0.03437019884586334, + "rewards//mean": 0.80029296875, + "rewards//std": 0.036027293652296066, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0852, + "grad_norm": 1.204244613647461, + "kl": 0.06753659760579467, + "learning_rate": 9.859057841617708e-07, + "loss": 0.0124, + "num_tokens": 4929057.0, + "reward": 0.7890625, + "reward_std": 0.026421882212162018, + "rewards//mean": 0.7890625, + "rewards//std": 0.037018708884716034, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0854, + "grad_norm": 1.2038135528564453, + "kl": 0.0542770279571414, + "learning_rate": 9.858308723114768e-07, + "loss": 0.0019, + "num_tokens": 4940631.0, + "reward": 0.80279541015625, + "reward_std": 0.03720984235405922, + "rewards//mean": 0.80279541015625, + "rewards//std": 0.0445706769824028, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.4375, + "epoch": 0.0856, + "grad_norm": 1.268694519996643, + "kl": 0.055203264113515615, + "learning_rate": 9.857557647685655e-07, + "loss": -0.0002, + "num_tokens": 4952243.0, + "reward": 0.79400634765625, + "reward_std": 0.02562260441482067, + "rewards//mean": 0.79400634765625, + "rewards//std": 0.031521961092948914, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0858, + "grad_norm": 1.3694819211959839, + "kl": 0.050670324359089136, + "learning_rate": 9.856804615632901e-07, + "loss": 0.0063, + "num_tokens": 4963932.0, + "reward": 0.81719970703125, + "reward_std": 0.029478155076503754, + "rewards//mean": 0.81719970703125, + "rewards//std": 0.03378383815288544, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.086, + "grad_norm": 1.356811285018921, + "kl": 0.05481780553236604, + "learning_rate": 9.856049627259832e-07, + "loss": 0.0065, + "num_tokens": 4975471.0, + "reward": 0.8026123046875, + "reward_std": 0.03857314586639404, + "rewards//mean": 0.8026123046875, + "rewards//std": 0.04348684847354889, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0862, + "grad_norm": 1.3036189079284668, + "kl": 0.053566953632980585, + "learning_rate": 9.85529268287055e-07, + "loss": 0.0044, + "num_tokens": 4986986.0, + "reward": 0.76971435546875, + "reward_std": 0.03186199814081192, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.03664010390639305, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0864, + "grad_norm": 1.2640087604522705, + "kl": 0.06252979254350066, + "learning_rate": 9.854533782769959e-07, + "loss": 0.0121, + "num_tokens": 4998480.0, + "reward": 0.8184814453125, + "reward_std": 0.03319408744573593, + "rewards//mean": 0.8184814453125, + "rewards//std": 0.03502143546938896, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0866, + "grad_norm": 1.1871178150177002, + "kl": 0.053515492007136345, + "learning_rate": 9.853772927263739e-07, + "loss": 0.0096, + "num_tokens": 5010116.0, + "reward": 0.7803955078125, + "reward_std": 0.0248456709086895, + "rewards//mean": 0.7803955078125, + "rewards//std": 0.030060676857829094, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.171875, + "epoch": 0.0868, + "grad_norm": 1.2201967239379883, + "kl": 0.05811973661184311, + "learning_rate": 9.853010116658366e-07, + "loss": 0.0262, + "num_tokens": 5021711.0, + "reward": 0.80615234375, + "reward_std": 0.03446679189801216, + "rewards//mean": 0.80615234375, + "rewards//std": 0.03649485483765602, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.087, + "grad_norm": 1.187144160270691, + "kl": 0.0660905004478991, + "learning_rate": 9.852245351261097e-07, + "loss": 0.0103, + "num_tokens": 5033208.0, + "reward": 0.81304931640625, + "reward_std": 0.02430643141269684, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.035241350531578064, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0872, + "grad_norm": 1.4636203050613403, + "kl": 0.06494804518297315, + "learning_rate": 9.851478631379982e-07, + "loss": 0.0063, + "num_tokens": 5044709.0, + "reward": 0.830322265625, + "reward_std": 0.0226895809173584, + "rewards//mean": 0.830322265625, + "rewards//std": 0.023082159459590912, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.203125, + "epoch": 0.0874, + "grad_norm": 1.4351367950439453, + "kl": 0.06297813775017858, + "learning_rate": 9.850709957323854e-07, + "loss": -0.0088, + "num_tokens": 5056226.0, + "reward": 0.7908935546875, + "reward_std": 0.031248789280653, + "rewards//mean": 0.7908935546875, + "rewards//std": 0.03569953888654709, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0876, + "grad_norm": 1.2753044366836548, + "kl": 0.06140360003337264, + "learning_rate": 9.849939329402336e-07, + "loss": 0.0061, + "num_tokens": 5067746.0, + "reward": 0.796875, + "reward_std": 0.02211393043398857, + "rewards//mean": 0.796875, + "rewards//std": 0.027009544894099236, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0878, + "grad_norm": 1.6263372898101807, + "kl": 0.06558625213801861, + "learning_rate": 9.849166747925834e-07, + "loss": 0.0086, + "num_tokens": 5079331.0, + "reward": 0.8115234375, + "reward_std": 0.01769370771944523, + "rewards//mean": 0.8115234375, + "rewards//std": 0.019475366920232773, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.088, + "grad_norm": 1.1383663415908813, + "kl": 0.06161572318524122, + "learning_rate": 9.848392213205547e-07, + "loss": 0.0085, + "num_tokens": 5090820.0, + "reward": 0.82305908203125, + "reward_std": 0.03503207117319107, + "rewards//mean": 0.82305908203125, + "rewards//std": 0.03981589898467064, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0882, + "grad_norm": 1.4681365489959717, + "kl": 0.06311738025397062, + "learning_rate": 9.847615725553455e-07, + "loss": 0.0148, + "num_tokens": 5102359.0, + "reward": 0.8218994140625, + "reward_std": 0.02298644557595253, + "rewards//mean": 0.8218994140625, + "rewards//std": 0.026224421337246895, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0884, + "grad_norm": 1.303862452507019, + "kl": 0.07351981196552515, + "learning_rate": 9.84683728528233e-07, + "loss": 0.0074, + "num_tokens": 5113959.0, + "reward": 0.82647705078125, + "reward_std": 0.02516179531812668, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.032806333154439926, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0886, + "grad_norm": 1.1816575527191162, + "kl": 0.06019823020324111, + "learning_rate": 9.846056892705727e-07, + "loss": 0.0152, + "num_tokens": 5125622.0, + "reward": 0.793212890625, + "reward_std": 0.029127493500709534, + "rewards//mean": 0.793212890625, + "rewards//std": 0.03202985227108002, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.46875, + "epoch": 0.0888, + "grad_norm": 1.301532506942749, + "kl": 0.07345152320340276, + "learning_rate": 9.845274548137985e-07, + "loss": 0.0214, + "num_tokens": 5137108.0, + "reward": 0.80511474609375, + "reward_std": 0.028401944786310196, + "rewards//mean": 0.80511474609375, + "rewards//std": 0.0327199324965477, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.089, + "grad_norm": 1.2223082780838013, + "kl": 0.06702146865427494, + "learning_rate": 9.844490251894236e-07, + "loss": 0.0095, + "num_tokens": 5148675.0, + "reward": 0.83380126953125, + "reward_std": 0.0282619446516037, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.029214540496468544, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0892, + "grad_norm": 1.3344072103500366, + "kl": 0.06695383926853538, + "learning_rate": 9.843704004290392e-07, + "loss": 0.0065, + "num_tokens": 5160345.0, + "reward": 0.816162109375, + "reward_std": 0.021146001294255257, + "rewards//mean": 0.816162109375, + "rewards//std": 0.02986186370253563, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0894, + "grad_norm": 1.225375771522522, + "kl": 0.07325892196968198, + "learning_rate": 9.842915805643156e-07, + "loss": 0.0055, + "num_tokens": 5171910.0, + "reward": 0.78167724609375, + "reward_std": 0.020869063213467598, + "rewards//mean": 0.78167724609375, + "rewards//std": 0.02483355812728405, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0896, + "grad_norm": 1.2554978132247925, + "kl": 0.07537907268851995, + "learning_rate": 9.84212565627001e-07, + "loss": -0.0092, + "num_tokens": 5183480.0, + "reward": 0.80548095703125, + "reward_std": 0.031934142112731934, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.03587017580866814, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0898, + "grad_norm": 1.3209415674209595, + "kl": 0.07083390513435006, + "learning_rate": 9.841333556489232e-07, + "loss": 0.0071, + "num_tokens": 5195144.0, + "reward": 0.7708740234375, + "reward_std": 0.021352743729948997, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.028495095670223236, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.09, + "grad_norm": 1.3329912424087524, + "kl": 0.06673079170286655, + "learning_rate": 9.840539506619872e-07, + "loss": 0.0105, + "num_tokens": 5206651.0, + "reward": 0.78173828125, + "reward_std": 0.027670420706272125, + "rewards//mean": 0.78173828125, + "rewards//std": 0.030973635613918304, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0902, + "grad_norm": 1.3149265050888062, + "kl": 0.06228609220124781, + "learning_rate": 9.83974350698178e-07, + "loss": 0.0086, + "num_tokens": 5218268.0, + "reward": 0.812744140625, + "reward_std": 0.02244272641837597, + "rewards//mean": 0.812744140625, + "rewards//std": 0.03245801106095314, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0904, + "grad_norm": 1.1316109895706177, + "kl": 0.07316886773332953, + "learning_rate": 9.838945557895584e-07, + "loss": 0.0043, + "num_tokens": 5229912.0, + "reward": 0.81585693359375, + "reward_std": 0.021685611456632614, + "rewards//mean": 0.81585693359375, + "rewards//std": 0.02243286743760109, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0906, + "grad_norm": 1.2300893068313599, + "kl": 0.07194394990801811, + "learning_rate": 9.838145659682692e-07, + "loss": 0.0037, + "num_tokens": 5241579.0, + "reward": 0.810302734375, + "reward_std": 0.01935657486319542, + "rewards//mean": 0.810302734375, + "rewards//std": 0.021086113527417183, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0908, + "grad_norm": 1.1938008069992065, + "kl": 0.07558053079992533, + "learning_rate": 9.83734381266531e-07, + "loss": 0.0102, + "num_tokens": 5253019.0, + "reward": 0.81842041015625, + "reward_std": 0.021757088601589203, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.028365377336740494, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.091, + "grad_norm": 1.4447282552719116, + "kl": 0.07077213795855641, + "learning_rate": 9.836540017166419e-07, + "loss": 0.0071, + "num_tokens": 5264635.0, + "reward": 0.7982177734375, + "reward_std": 0.034284137189388275, + "rewards//mean": 0.7982177734375, + "rewards//std": 0.03975830599665642, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0912, + "grad_norm": 1.2949038743972778, + "kl": 0.06851950893178582, + "learning_rate": 9.835734273509785e-07, + "loss": 0.0067, + "num_tokens": 5276177.0, + "reward": 0.815185546875, + "reward_std": 0.026415590196847916, + "rewards//mean": 0.815185546875, + "rewards//std": 0.03810123726725578, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0914, + "grad_norm": 1.4060544967651367, + "kl": 0.07025480084121227, + "learning_rate": 9.834926582019966e-07, + "loss": 0.0129, + "num_tokens": 5287772.0, + "reward": 0.80108642578125, + "reward_std": 0.030475925654172897, + "rewards//mean": 0.80108642578125, + "rewards//std": 0.03150707110762596, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0916, + "grad_norm": 1.2969262599945068, + "kl": 0.0757476119324565, + "learning_rate": 9.834116943022297e-07, + "loss": 0.008, + "num_tokens": 5299382.0, + "reward": 0.79669189453125, + "reward_std": 0.02796255424618721, + "rewards//mean": 0.79669189453125, + "rewards//std": 0.032558999955654144, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.0918, + "grad_norm": 1.18767511844635, + "kl": 0.07633181475102901, + "learning_rate": 9.8333053568429e-07, + "loss": -0.0016, + "num_tokens": 5310906.0, + "reward": 0.84185791015625, + "reward_std": 0.037358567118644714, + "rewards//mean": 0.84185791015625, + "rewards//std": 0.04797300696372986, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.092, + "grad_norm": 1.1845643520355225, + "kl": 0.07049541501328349, + "learning_rate": 9.832491823808686e-07, + "loss": 0.0044, + "num_tokens": 5322572.0, + "reward": 0.83038330078125, + "reward_std": 0.020690465345978737, + "rewards//mean": 0.83038330078125, + "rewards//std": 0.024810995906591415, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0922, + "grad_norm": 1.5235270261764526, + "kl": 0.08809420745819807, + "learning_rate": 9.831676344247342e-07, + "loss": 0.0088, + "num_tokens": 5334148.0, + "reward": 0.8072509765625, + "reward_std": 0.02366703748703003, + "rewards//mean": 0.8072509765625, + "rewards//std": 0.027885079383850098, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0924, + "grad_norm": 1.171995997428894, + "kl": 0.06448721000924706, + "learning_rate": 9.830858918487346e-07, + "loss": 0.0064, + "num_tokens": 5345892.0, + "reward": 0.80560302734375, + "reward_std": 0.027092576026916504, + "rewards//mean": 0.80560302734375, + "rewards//std": 0.03237483277916908, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0926, + "grad_norm": 1.2524101734161377, + "kl": 0.07028303295373917, + "learning_rate": 9.830039546857952e-07, + "loss": 0.0084, + "num_tokens": 5357457.0, + "reward": 0.8253173828125, + "reward_std": 0.028225034475326538, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.036241576075553894, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0928, + "grad_norm": 1.3786554336547852, + "kl": 0.08576635736972094, + "learning_rate": 9.829218229689209e-07, + "loss": 0.0058, + "num_tokens": 5368964.0, + "reward": 0.82940673828125, + "reward_std": 0.025409279391169548, + "rewards//mean": 0.82940673828125, + "rewards//std": 0.029057634994387627, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.093, + "grad_norm": 1.2185924053192139, + "kl": 0.07155512180179358, + "learning_rate": 9.828394967311938e-07, + "loss": 0.0081, + "num_tokens": 5380567.0, + "reward": 0.80828857421875, + "reward_std": 0.031101834028959274, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.038684334605932236, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0932, + "grad_norm": 1.4732379913330078, + "kl": 0.08264558855444193, + "learning_rate": 9.827569760057754e-07, + "loss": -0.007, + "num_tokens": 5392110.0, + "reward": 0.8135986328125, + "reward_std": 0.021513434126973152, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.023973582312464714, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0934, + "grad_norm": 1.28080153465271, + "kl": 0.08075165189802647, + "learning_rate": 9.826742608259047e-07, + "loss": 0.0057, + "num_tokens": 5403699.0, + "reward": 0.8223876953125, + "reward_std": 0.02456752210855484, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.02711666002869606, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0936, + "grad_norm": 1.4338269233703613, + "kl": 0.08569623297080398, + "learning_rate": 9.825913512248995e-07, + "loss": 0.0068, + "num_tokens": 5415250.0, + "reward": 0.7772216796875, + "reward_std": 0.03051215410232544, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.03190934658050537, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0938, + "grad_norm": 1.5390422344207764, + "kl": 0.083765281829983, + "learning_rate": 9.825082472361556e-07, + "loss": 0.0084, + "num_tokens": 5426874.0, + "reward": 0.8128662109375, + "reward_std": 0.022787289693951607, + "rewards//mean": 0.8128662109375, + "rewards//std": 0.034292880445718765, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.094, + "grad_norm": 1.2325215339660645, + "kl": 0.08007821161299944, + "learning_rate": 9.824249488931475e-07, + "loss": 0.0077, + "num_tokens": 5438415.0, + "reward": 0.8294677734375, + "reward_std": 0.03150644525885582, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.037202876061201096, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0942, + "grad_norm": 1.5335402488708496, + "kl": 0.07616242486983538, + "learning_rate": 9.82341456229428e-07, + "loss": 0.0068, + "num_tokens": 5449956.0, + "reward": 0.80828857421875, + "reward_std": 0.02996237948536873, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.03438165783882141, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.265625, + "epoch": 0.0944, + "grad_norm": 1.2239545583724976, + "kl": 0.08514336496591568, + "learning_rate": 9.822577692786272e-07, + "loss": 0.0013, + "num_tokens": 5461517.0, + "reward": 0.83306884765625, + "reward_std": 0.02442818507552147, + "rewards//mean": 0.83306884765625, + "rewards//std": 0.029050862416625023, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0946, + "grad_norm": 1.394740104675293, + "kl": 0.08109706407412887, + "learning_rate": 9.821738880744547e-07, + "loss": 0.0081, + "num_tokens": 5473029.0, + "reward": 0.81524658203125, + "reward_std": 0.024296781048178673, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.02834562584757805, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0948, + "grad_norm": 1.252071738243103, + "kl": 0.07104279566556215, + "learning_rate": 9.820898126506979e-07, + "loss": -0.0005, + "num_tokens": 5484519.0, + "reward": 0.76904296875, + "reward_std": 0.032503265887498856, + "rewards//mean": 0.76904296875, + "rewards//std": 0.0392017625272274, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.095, + "grad_norm": 1.260479211807251, + "kl": 0.0731711215339601, + "learning_rate": 9.820055430412219e-07, + "loss": 0.0138, + "num_tokens": 5496053.0, + "reward": 0.821533203125, + "reward_std": 0.03276493772864342, + "rewards//mean": 0.821533203125, + "rewards//std": 0.03519326075911522, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0952, + "grad_norm": 1.3579024076461792, + "kl": 0.07893198449164629, + "learning_rate": 9.81921079279971e-07, + "loss": 0.0016, + "num_tokens": 5507686.0, + "reward": 0.78851318359375, + "reward_std": 0.02613705024123192, + "rewards//mean": 0.78851318359375, + "rewards//std": 0.028678512200713158, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0954, + "grad_norm": 1.424095630645752, + "kl": 0.08564876578748226, + "learning_rate": 9.81836421400967e-07, + "loss": -0.0044, + "num_tokens": 5519274.0, + "reward": 0.79583740234375, + "reward_std": 0.0254361592233181, + "rewards//mean": 0.79583740234375, + "rewards//std": 0.031119847670197487, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0956, + "grad_norm": 1.4588207006454468, + "kl": 0.08377953432500362, + "learning_rate": 9.817515694383102e-07, + "loss": 0.0123, + "num_tokens": 5530822.0, + "reward": 0.8182373046875, + "reward_std": 0.03187720105051994, + "rewards//mean": 0.8182373046875, + "rewards//std": 0.03638497740030289, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0958, + "grad_norm": 1.4523743391036987, + "kl": 0.0865940828807652, + "learning_rate": 9.816665234261786e-07, + "loss": 0.0014, + "num_tokens": 5542313.0, + "reward": 0.77923583984375, + "reward_std": 0.02401074394583702, + "rewards//mean": 0.77923583984375, + "rewards//std": 0.025911059230566025, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.096, + "grad_norm": 1.1478002071380615, + "kl": 0.08146358886733651, + "learning_rate": 9.81581283398829e-07, + "loss": 0.0105, + "num_tokens": 5553765.0, + "reward": 0.82794189453125, + "reward_std": 0.0307990200817585, + "rewards//mean": 0.82794189453125, + "rewards//std": 0.03344699740409851, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0962, + "grad_norm": 1.2867639064788818, + "kl": 0.08200537227094173, + "learning_rate": 9.814958493905962e-07, + "loss": 0.0141, + "num_tokens": 5565299.0, + "reward": 0.81884765625, + "reward_std": 0.02990609221160412, + "rewards//mean": 0.81884765625, + "rewards//std": 0.03190578520298004, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0964, + "grad_norm": 1.38454270362854, + "kl": 0.0822724737226963, + "learning_rate": 9.814102214358926e-07, + "loss": 0.0073, + "num_tokens": 5576858.0, + "reward": 0.8160400390625, + "reward_std": 0.024135317653417587, + "rewards//mean": 0.8160400390625, + "rewards//std": 0.02682705968618393, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0966, + "grad_norm": 1.2208223342895508, + "kl": 0.08534923894330859, + "learning_rate": 9.813243995692097e-07, + "loss": 0.0116, + "num_tokens": 5588370.0, + "reward": 0.83819580078125, + "reward_std": 0.02994173765182495, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.03988351672887802, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0968, + "grad_norm": 1.2231444120407104, + "kl": 0.0776977026835084, + "learning_rate": 9.81238383825116e-07, + "loss": 0.0078, + "num_tokens": 5599874.0, + "reward": 0.8133544921875, + "reward_std": 0.02791026420891285, + "rewards//mean": 0.8133544921875, + "rewards//std": 0.03150833025574684, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.097, + "grad_norm": 1.1778688430786133, + "kl": 0.0699493926949799, + "learning_rate": 9.81152174238259e-07, + "loss": 0.007, + "num_tokens": 5611482.0, + "reward": 0.81793212890625, + "reward_std": 0.026184963062405586, + "rewards//mean": 0.81793212890625, + "rewards//std": 0.027749236673116684, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0972, + "grad_norm": 1.1969714164733887, + "kl": 0.08459043595939875, + "learning_rate": 9.810657708433635e-07, + "loss": 0.0085, + "num_tokens": 5623026.0, + "reward": 0.81964111328125, + "reward_std": 0.02521602436900139, + "rewards//mean": 0.81964111328125, + "rewards//std": 0.030133794993162155, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0974, + "grad_norm": 1.5844535827636719, + "kl": 0.08831861382350326, + "learning_rate": 9.809791736752332e-07, + "loss": 0.0061, + "num_tokens": 5634553.0, + "reward": 0.78985595703125, + "reward_std": 0.025955840945243835, + "rewards//mean": 0.78985595703125, + "rewards//std": 0.03303715959191322, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0976, + "grad_norm": 1.7093342542648315, + "kl": 0.08042505849152803, + "learning_rate": 9.808923827687492e-07, + "loss": 0.006, + "num_tokens": 5646060.0, + "reward": 0.8336181640625, + "reward_std": 0.025936080142855644, + "rewards//mean": 0.8336181640625, + "rewards//std": 0.029460471123456955, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0978, + "grad_norm": 1.266553521156311, + "kl": 0.08229110389947891, + "learning_rate": 9.80805398158871e-07, + "loss": 0.0135, + "num_tokens": 5657604.0, + "reward": 0.80975341796875, + "reward_std": 0.021332455798983574, + "rewards//mean": 0.80975341796875, + "rewards//std": 0.028987742960453033, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.098, + "grad_norm": 1.327900767326355, + "kl": 0.08754283655434847, + "learning_rate": 9.80718219880636e-07, + "loss": 0.0102, + "num_tokens": 5669192.0, + "reward": 0.800048828125, + "reward_std": 0.02183220162987709, + "rewards//mean": 0.800048828125, + "rewards//std": 0.027699071913957596, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0982, + "grad_norm": 1.3510090112686157, + "kl": 0.07454448752105236, + "learning_rate": 9.806308479691594e-07, + "loss": 0.0145, + "num_tokens": 5680774.0, + "reward": 0.8223876953125, + "reward_std": 0.033111169934272766, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.037692807614803314, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "epoch": 0.0984, + "grad_norm": 1.2434260845184326, + "kl": 0.06987044122070074, + "learning_rate": 9.805432824596347e-07, + "loss": 0.0143, + "num_tokens": 5692214.0, + "reward": 0.8231201171875, + "reward_std": 0.03391135111451149, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.034163739532232285, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0986, + "grad_norm": 1.2728962898254395, + "kl": 0.08778978325426579, + "learning_rate": 9.804555233873332e-07, + "loss": -0.0021, + "num_tokens": 5703744.0, + "reward": 0.8095703125, + "reward_std": 0.023176230490207672, + "rewards//mean": 0.8095703125, + "rewards//std": 0.027735117822885513, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0988, + "grad_norm": 1.190981388092041, + "kl": 0.08150128275156021, + "learning_rate": 9.803675707876048e-07, + "loss": -0.0048, + "num_tokens": 5715206.0, + "reward": 0.76165771484375, + "reward_std": 0.02360847219824791, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.04232124239206314, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.099, + "grad_norm": 1.356329321861267, + "kl": 0.08091806340962648, + "learning_rate": 9.80279424695876e-07, + "loss": 0.0067, + "num_tokens": 5726826.0, + "reward": 0.81829833984375, + "reward_std": 0.037455759942531586, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.04320760443806648, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0992, + "grad_norm": 1.2281384468078613, + "kl": 0.08078571129590273, + "learning_rate": 9.801910851476524e-07, + "loss": 0.0081, + "num_tokens": 5738386.0, + "reward": 0.81683349609375, + "reward_std": 0.037616074085235596, + "rewards//mean": 0.81683349609375, + "rewards//std": 0.04088101536035538, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0994, + "grad_norm": 1.1897907257080078, + "kl": 0.07609264133498073, + "learning_rate": 9.80102552178517e-07, + "loss": 0.0057, + "num_tokens": 5749915.0, + "reward": 0.816162109375, + "reward_std": 0.029379744082689285, + "rewards//mean": 0.816162109375, + "rewards//std": 0.03279208391904831, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0996, + "grad_norm": 1.3332589864730835, + "kl": 0.08528816793113947, + "learning_rate": 9.800138258241309e-07, + "loss": 0.0087, + "num_tokens": 5761516.0, + "reward": 0.79705810546875, + "reward_std": 0.024435605853796005, + "rewards//mean": 0.79705810546875, + "rewards//std": 0.034035611897706985, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0998, + "grad_norm": 1.2685091495513916, + "kl": 0.0846557542681694, + "learning_rate": 9.799249061202334e-07, + "loss": 0.0116, + "num_tokens": 5773064.0, + "reward": 0.80670166015625, + "reward_std": 0.03253260627388954, + "rewards//mean": 0.80670166015625, + "rewards//std": 0.03634516894817352, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1, + "grad_norm": 1.25624418258667, + "kl": 0.08814771194010973, + "learning_rate": 9.798357931026412e-07, + "loss": 0.0076, + "num_tokens": 5784643.0, + "reward": 0.82073974609375, + "reward_std": 0.031649477779865265, + "rewards//mean": 0.82073974609375, + "rewards//std": 0.03304402902722359, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1002, + "grad_norm": 1.2751920223236084, + "kl": 0.08597521390765905, + "learning_rate": 9.797464868072486e-07, + "loss": 0.005, + "num_tokens": 5796261.0, + "reward": 0.78521728515625, + "reward_std": 0.025556009262800217, + "rewards//mean": 0.78521728515625, + "rewards//std": 0.03369275480508804, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1004, + "grad_norm": 1.1816672086715698, + "kl": 0.08120293822139502, + "learning_rate": 9.796569872700287e-07, + "loss": 0.0081, + "num_tokens": 5807789.0, + "reward": 0.80047607421875, + "reward_std": 0.028731008991599083, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.03066560998558998, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1006, + "grad_norm": 1.3965822458267212, + "kl": 0.08313928358256817, + "learning_rate": 9.795672945270316e-07, + "loss": 0.0083, + "num_tokens": 5819445.0, + "reward": 0.8184814453125, + "reward_std": 0.026940450072288513, + "rewards//mean": 0.8184814453125, + "rewards//std": 0.03399673104286194, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1008, + "grad_norm": 1.3802001476287842, + "kl": 0.08679562713950872, + "learning_rate": 9.794774086143857e-07, + "loss": 0.0087, + "num_tokens": 5831069.0, + "reward": 0.8037109375, + "reward_std": 0.02174842730164528, + "rewards//mean": 0.8037109375, + "rewards//std": 0.027134796604514122, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.101, + "grad_norm": 1.2436765432357788, + "kl": 0.07691568555310369, + "learning_rate": 9.79387329568297e-07, + "loss": 0.0122, + "num_tokens": 5842632.0, + "reward": 0.81463623046875, + "reward_std": 0.025821691378951073, + "rewards//mean": 0.81463623046875, + "rewards//std": 0.027934638783335686, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1012, + "grad_norm": 1.252874732017517, + "kl": 0.09949897043406963, + "learning_rate": 9.792970574250493e-07, + "loss": 0.0128, + "num_tokens": 5854156.0, + "reward": 0.8284912109375, + "reward_std": 0.02308451384305954, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.028019385412335396, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1014, + "grad_norm": 1.4068330526351929, + "kl": 0.08417150098830462, + "learning_rate": 9.79206592221004e-07, + "loss": 0.0086, + "num_tokens": 5865762.0, + "reward": 0.81622314453125, + "reward_std": 0.026028819382190704, + "rewards//mean": 0.81622314453125, + "rewards//std": 0.029367513954639435, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.40625, + "epoch": 0.1016, + "grad_norm": 1.3112162351608276, + "kl": 0.0980263683013618, + "learning_rate": 9.791159339926008e-07, + "loss": 0.0037, + "num_tokens": 5877380.0, + "reward": 0.78399658203125, + "reward_std": 0.02097933180630207, + "rewards//mean": 0.78399658203125, + "rewards//std": 0.024984879419207573, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1018, + "grad_norm": 1.2269960641860962, + "kl": 0.10290224757045507, + "learning_rate": 9.790250827763565e-07, + "loss": 0.0103, + "num_tokens": 5888940.0, + "reward": 0.8555908203125, + "reward_std": 0.032164156436920166, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.03313809260725975, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.102, + "grad_norm": 1.194116473197937, + "kl": 0.09419394005089998, + "learning_rate": 9.789340386088662e-07, + "loss": 0.0092, + "num_tokens": 5900441.0, + "reward": 0.8199462890625, + "reward_std": 0.02815665304660797, + "rewards//mean": 0.8199462890625, + "rewards//std": 0.03364402800798416, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1022, + "grad_norm": 1.255768060684204, + "kl": 0.09281587600708008, + "learning_rate": 9.788428015268026e-07, + "loss": 0.0093, + "num_tokens": 5912049.0, + "reward": 0.78173828125, + "reward_std": 0.023645499721169472, + "rewards//mean": 0.78173828125, + "rewards//std": 0.027835363522171974, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1024, + "grad_norm": 1.2338354587554932, + "kl": 0.09818124212324619, + "learning_rate": 9.787513715669157e-07, + "loss": 0.0023, + "num_tokens": 5923685.0, + "reward": 0.787353515625, + "reward_std": 0.01785670407116413, + "rewards//mean": 0.787353515625, + "rewards//std": 0.03079620935022831, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1026, + "grad_norm": 1.359536051750183, + "kl": 0.08856823947280645, + "learning_rate": 9.786597487660335e-07, + "loss": 0.0066, + "num_tokens": 5935206.0, + "reward": 0.79632568359375, + "reward_std": 0.021267715841531754, + "rewards//mean": 0.79632568359375, + "rewards//std": 0.030622635036706924, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1028, + "grad_norm": 1.2578295469284058, + "kl": 0.09269805997610092, + "learning_rate": 9.78567933161062e-07, + "loss": 0.0093, + "num_tokens": 5946798.0, + "reward": 0.82354736328125, + "reward_std": 0.03401347994804382, + "rewards//mean": 0.82354736328125, + "rewards//std": 0.03661448135972023, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.103, + "grad_norm": 1.4712307453155518, + "kl": 0.09900089353322983, + "learning_rate": 9.78475924788984e-07, + "loss": 0.0099, + "num_tokens": 5958390.0, + "reward": 0.80706787109375, + "reward_std": 0.025684040039777756, + "rewards//mean": 0.80706787109375, + "rewards//std": 0.03497748076915741, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1032, + "grad_norm": 1.3101959228515625, + "kl": 0.10261872783303261, + "learning_rate": 9.783837236868609e-07, + "loss": 0.0097, + "num_tokens": 5969896.0, + "reward": 0.8121337890625, + "reward_std": 0.01823873445391655, + "rewards//mean": 0.8121337890625, + "rewards//std": 0.020180417224764824, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1034, + "grad_norm": 1.2196444272994995, + "kl": 0.09777991566807032, + "learning_rate": 9.782913298918308e-07, + "loss": 0.0099, + "num_tokens": 5981447.0, + "reward": 0.79998779296875, + "reward_std": 0.03195427358150482, + "rewards//mean": 0.79998779296875, + "rewards//std": 0.04242805391550064, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1036, + "grad_norm": 1.2648110389709473, + "kl": 0.10987312998622656, + "learning_rate": 9.781987434411106e-07, + "loss": 0.0198, + "num_tokens": 5993047.0, + "reward": 0.795654296875, + "reward_std": 0.020303701981902122, + "rewards//mean": 0.795654296875, + "rewards//std": 0.03092178702354431, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.1038, + "grad_norm": 1.201332926750183, + "kl": 0.09599946439266205, + "learning_rate": 9.781059643719936e-07, + "loss": 0.0112, + "num_tokens": 6004565.0, + "reward": 0.79052734375, + "reward_std": 0.02446388453245163, + "rewards//mean": 0.79052734375, + "rewards//std": 0.026589294895529747, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.104, + "grad_norm": 1.2868168354034424, + "kl": 0.09307952970266342, + "learning_rate": 9.780129927218511e-07, + "loss": 0.0093, + "num_tokens": 6016037.0, + "reward": 0.8040771484375, + "reward_std": 0.04565904662013054, + "rewards//mean": 0.8040771484375, + "rewards//std": 0.05326150357723236, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1042, + "grad_norm": 1.126501202583313, + "kl": 0.09349644463509321, + "learning_rate": 9.779198285281326e-07, + "loss": 0.0098, + "num_tokens": 6027627.0, + "reward": 0.813720703125, + "reward_std": 0.029926437884569168, + "rewards//mean": 0.813720703125, + "rewards//std": 0.03504844009876251, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1044, + "grad_norm": 1.1983786821365356, + "kl": 0.09455719869583845, + "learning_rate": 9.77826471828364e-07, + "loss": 0.0095, + "num_tokens": 6039179.0, + "reward": 0.8348388671875, + "reward_std": 0.0341310128569603, + "rewards//mean": 0.8348388671875, + "rewards//std": 0.040244210511446, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1046, + "grad_norm": 1.3300468921661377, + "kl": 0.09820039663463831, + "learning_rate": 9.777329226601501e-07, + "loss": 0.0098, + "num_tokens": 6050675.0, + "reward": 0.812255859375, + "reward_std": 0.03599702566862106, + "rewards//mean": 0.812255859375, + "rewards//std": 0.037941984832286835, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.1048, + "grad_norm": 1.3399581909179688, + "kl": 0.09307860303670168, + "learning_rate": 9.776391810611718e-07, + "loss": -0.0036, + "num_tokens": 6062169.0, + "reward": 0.77752685546875, + "reward_std": 0.02888987958431244, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.033396270126104355, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.105, + "grad_norm": 1.3198696374893188, + "kl": 0.09150571748614311, + "learning_rate": 9.775452470691885e-07, + "loss": 0.0092, + "num_tokens": 6073705.0, + "reward": 0.79681396484375, + "reward_std": 0.022306114435195923, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.028505917638540268, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.1052, + "grad_norm": 1.1961342096328735, + "kl": 0.1122370669618249, + "learning_rate": 9.774511207220368e-07, + "loss": -0.0043, + "num_tokens": 6085215.0, + "reward": 0.8270263671875, + "reward_std": 0.03180377557873726, + "rewards//mean": 0.8270263671875, + "rewards//std": 0.0469961054623127, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1054, + "grad_norm": 1.1468545198440552, + "kl": 0.09769860655069351, + "learning_rate": 9.77356802057631e-07, + "loss": 0.0105, + "num_tokens": 6096834.0, + "reward": 0.78155517578125, + "reward_std": 0.027521200478076935, + "rewards//mean": 0.78155517578125, + "rewards//std": 0.03228915110230446, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1056, + "grad_norm": 1.1783431768417358, + "kl": 0.09257271513342857, + "learning_rate": 9.77262291113962e-07, + "loss": 0.0093, + "num_tokens": 6108407.0, + "reward": 0.82122802734375, + "reward_std": 0.024412354454398155, + "rewards//mean": 0.82122802734375, + "rewards//std": 0.02632603608071804, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1058, + "grad_norm": 1.6203062534332275, + "kl": 0.1146109439432621, + "learning_rate": 9.771675879290996e-07, + "loss": 0.0054, + "num_tokens": 6120021.0, + "reward": 0.81829833984375, + "reward_std": 0.02651885151863098, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.030401363968849182, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.106, + "grad_norm": 1.3683401346206665, + "kl": 0.09059350937604904, + "learning_rate": 9.770726925411897e-07, + "loss": 0.015, + "num_tokens": 6131615.0, + "reward": 0.8226318359375, + "reward_std": 0.027664324268698692, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.0347558967769146, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1062, + "grad_norm": 1.4481327533721924, + "kl": 0.10410920437425375, + "learning_rate": 9.769776049884563e-07, + "loss": 0.0104, + "num_tokens": 6143183.0, + "reward": 0.8001708984375, + "reward_std": 0.03135540708899498, + "rewards//mean": 0.8001708984375, + "rewards//std": 0.03838208690285683, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1064, + "grad_norm": 1.4059730768203735, + "kl": 0.09134924225509167, + "learning_rate": 9.768823253092008e-07, + "loss": 0.0114, + "num_tokens": 6154836.0, + "reward": 0.8104248046875, + "reward_std": 0.026328952983021736, + "rewards//mean": 0.8104248046875, + "rewards//std": 0.03291073814034462, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1066, + "grad_norm": 1.12079656124115, + "kl": 0.11107327323406935, + "learning_rate": 9.767868535418014e-07, + "loss": 0.0111, + "num_tokens": 6166460.0, + "reward": 0.81451416015625, + "reward_std": 0.024560201913118362, + "rewards//mean": 0.81451416015625, + "rewards//std": 0.033240433782339096, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.1068, + "grad_norm": 1.6003732681274414, + "kl": 0.10313827730715275, + "learning_rate": 9.766911897247146e-07, + "loss": -0.0243, + "num_tokens": 6177980.0, + "reward": 0.81597900390625, + "reward_std": 0.026936236768960953, + "rewards//mean": 0.81597900390625, + "rewards//std": 0.03286027908325195, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.107, + "grad_norm": 1.49257230758667, + "kl": 0.11361462064087391, + "learning_rate": 9.765953338964734e-07, + "loss": 0.0114, + "num_tokens": 6189564.0, + "reward": 0.82647705078125, + "reward_std": 0.03576479107141495, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.043307337909936905, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1072, + "grad_norm": 1.1492758989334106, + "kl": 0.11430125031620264, + "learning_rate": 9.76499286095689e-07, + "loss": 0.0114, + "num_tokens": 6201132.0, + "reward": 0.8201904296875, + "reward_std": 0.025126760825514793, + "rewards//mean": 0.8201904296875, + "rewards//std": 0.03667009621858597, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1074, + "grad_norm": 1.2170757055282593, + "kl": 0.112099370919168, + "learning_rate": 9.764030463610488e-07, + "loss": 0.0112, + "num_tokens": 6212740.0, + "reward": 0.8155517578125, + "reward_std": 0.023733610287308693, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.027695519849658012, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1076, + "grad_norm": 1.2131115198135376, + "kl": 0.11502694059163332, + "learning_rate": 9.763066147313189e-07, + "loss": 0.0078, + "num_tokens": 6224313.0, + "reward": 0.81744384765625, + "reward_std": 0.02053101919591427, + "rewards//mean": 0.81744384765625, + "rewards//std": 0.02306501939892769, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1078, + "grad_norm": 1.1164547204971313, + "kl": 0.11128564272075891, + "learning_rate": 9.762099912453412e-07, + "loss": 0.0111, + "num_tokens": 6235825.0, + "reward": 0.811767578125, + "reward_std": 0.024952014908194542, + "rewards//mean": 0.811767578125, + "rewards//std": 0.030733227729797363, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.108, + "grad_norm": 1.2678186893463135, + "kl": 0.12174430023878813, + "learning_rate": 9.76113175942036e-07, + "loss": -0.0011, + "num_tokens": 6247326.0, + "reward": 0.7957763671875, + "reward_std": 0.02578216791152954, + "rewards//mean": 0.7957763671875, + "rewards//std": 0.03105928935110569, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1082, + "grad_norm": 1.3169679641723633, + "kl": 0.1103695947676897, + "learning_rate": 9.760161688604007e-07, + "loss": 0.011, + "num_tokens": 6258854.0, + "reward": 0.78472900390625, + "reward_std": 0.025775330141186714, + "rewards//mean": 0.78472900390625, + "rewards//std": 0.03212983161211014, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1084, + "grad_norm": 1.2496285438537598, + "kl": 0.11027486063539982, + "learning_rate": 9.759189700395095e-07, + "loss": 0.011, + "num_tokens": 6270422.0, + "reward": 0.81463623046875, + "reward_std": 0.02733474038541317, + "rewards//mean": 0.81463623046875, + "rewards//std": 0.04740896448493004, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1086, + "grad_norm": 1.2534323930740356, + "kl": 0.10321422759443521, + "learning_rate": 9.758215795185138e-07, + "loss": 0.0103, + "num_tokens": 6282030.0, + "reward": 0.81219482421875, + "reward_std": 0.031105205416679382, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.03638055548071861, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1088, + "grad_norm": 1.247572660446167, + "kl": 0.11084875371307135, + "learning_rate": 9.757239973366428e-07, + "loss": 0.0111, + "num_tokens": 6293606.0, + "reward": 0.802734375, + "reward_std": 0.03025290183722973, + "rewards//mean": 0.802734375, + "rewards//std": 0.03516658395528793, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.109, + "grad_norm": 1.312682867050171, + "kl": 0.11360176838934422, + "learning_rate": 9.756262235332028e-07, + "loss": 0.0114, + "num_tokens": 6305166.0, + "reward": 0.8128662109375, + "reward_std": 0.01981412246823311, + "rewards//mean": 0.8128662109375, + "rewards//std": 0.02158384583890438, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1092, + "grad_norm": 1.196443796157837, + "kl": 0.11094903107732534, + "learning_rate": 9.755282581475767e-07, + "loss": 0.0111, + "num_tokens": 6316694.0, + "reward": 0.81512451171875, + "reward_std": 0.030984356999397278, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.03601757809519768, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.1094, + "grad_norm": 1.1314456462860107, + "kl": 0.09996502101421356, + "learning_rate": 9.754301012192253e-07, + "loss": 0.0008, + "num_tokens": 6328177.0, + "reward": 0.825927734375, + "reward_std": 0.029453981667757034, + "rewards//mean": 0.825927734375, + "rewards//std": 0.03124128095805645, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1096, + "grad_norm": 1.4698749780654907, + "kl": 0.11026521865278482, + "learning_rate": 9.753317527876856e-07, + "loss": 0.0132, + "num_tokens": 6339717.0, + "reward": 0.83306884765625, + "reward_std": 0.03581928834319115, + "rewards//mean": 0.83306884765625, + "rewards//std": 0.03683007135987282, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1098, + "grad_norm": 1.1762778759002686, + "kl": 0.14345601480454206, + "learning_rate": 9.75233212892573e-07, + "loss": 0.0099, + "num_tokens": 6351222.0, + "reward": 0.81591796875, + "reward_std": 0.028282009065151215, + "rewards//mean": 0.81591796875, + "rewards//std": 0.030879657715559006, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.11, + "grad_norm": 1.08161199092865, + "kl": 0.11797116510570049, + "learning_rate": 9.75134481573579e-07, + "loss": 0.0119, + "num_tokens": 6362697.0, + "reward": 0.83251953125, + "reward_std": 0.03080570697784424, + "rewards//mean": 0.83251953125, + "rewards//std": 0.038735613226890564, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1102, + "grad_norm": 1.3603509664535522, + "kl": 0.11399806011468172, + "learning_rate": 9.750355588704727e-07, + "loss": 0.0034, + "num_tokens": 6374435.0, + "reward": 0.78857421875, + "reward_std": 0.02743181586265564, + "rewards//mean": 0.78857421875, + "rewards//std": 0.03349815681576729, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1104, + "grad_norm": 1.3117700815200806, + "kl": 0.13058975618332624, + "learning_rate": 9.749364448231e-07, + "loss": 0.0131, + "num_tokens": 6386035.0, + "reward": 0.7891845703125, + "reward_std": 0.025663703680038452, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.04000578075647354, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1106, + "grad_norm": 1.201008677482605, + "kl": 0.13088107760995626, + "learning_rate": 9.748371394713841e-07, + "loss": 0.0131, + "num_tokens": 6397619.0, + "reward": 0.83544921875, + "reward_std": 0.031003683805465698, + "rewards//mean": 0.83544921875, + "rewards//std": 0.0357234887778759, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.1108, + "grad_norm": 1.1407558917999268, + "kl": 0.12917668279260397, + "learning_rate": 9.747376428553253e-07, + "loss": 0.0011, + "num_tokens": 6409124.0, + "reward": 0.810302734375, + "reward_std": 0.026500778272747993, + "rewards//mean": 0.810302734375, + "rewards//std": 0.030256683006882668, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.111, + "grad_norm": 1.1229151487350464, + "kl": 0.12356696277856827, + "learning_rate": 9.746379550150008e-07, + "loss": 0.0093, + "num_tokens": 6420696.0, + "reward": 0.8502197265625, + "reward_std": 0.03003159537911415, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.04210163280367851, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1112, + "grad_norm": 1.1989996433258057, + "kl": 0.13624413590878248, + "learning_rate": 9.745380759905647e-07, + "loss": 0.0136, + "num_tokens": 6432368.0, + "reward": 0.81732177734375, + "reward_std": 0.02304716967046261, + "rewards//mean": 0.81732177734375, + "rewards//std": 0.029624631628394127, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1114, + "grad_norm": 1.508568525314331, + "kl": 0.1207421999424696, + "learning_rate": 9.744380058222482e-07, + "loss": 0.0117, + "num_tokens": 6443941.0, + "reward": 0.814697265625, + "reward_std": 0.029115233570337296, + "rewards//mean": 0.814697265625, + "rewards//std": 0.03594231978058815, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1116, + "grad_norm": 1.5525394678115845, + "kl": 0.12516305223107338, + "learning_rate": 9.743377445503597e-07, + "loss": 0.0125, + "num_tokens": 6455565.0, + "reward": 0.799560546875, + "reward_std": 0.025408893823623657, + "rewards//mean": 0.799560546875, + "rewards//std": 0.027106888592243195, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1118, + "grad_norm": 1.1782374382019043, + "kl": 0.11256752163171768, + "learning_rate": 9.742372922152845e-07, + "loss": 0.0113, + "num_tokens": 6467133.0, + "reward": 0.83892822265625, + "reward_std": 0.02843494340777397, + "rewards//mean": 0.83892822265625, + "rewards//std": 0.0386369563639164, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.112, + "grad_norm": 1.2065207958221436, + "kl": 0.11853205598890781, + "learning_rate": 9.74136648857485e-07, + "loss": 0.0119, + "num_tokens": 6478685.0, + "reward": 0.82476806640625, + "reward_std": 0.025786232203245163, + "rewards//mean": 0.82476806640625, + "rewards//std": 0.03695235028862953, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1122, + "grad_norm": 1.3910183906555176, + "kl": 0.11939899530261755, + "learning_rate": 9.740358145174997e-07, + "loss": 0.0119, + "num_tokens": 6490317.0, + "reward": 0.8216552734375, + "reward_std": 0.024913553148508072, + "rewards//mean": 0.8216552734375, + "rewards//std": 0.03401987627148628, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1124, + "grad_norm": 1.37178373336792, + "kl": 0.14490484725683928, + "learning_rate": 9.73934789235945e-07, + "loss": 0.0145, + "num_tokens": 6501957.0, + "reward": 0.8167724609375, + "reward_std": 0.028217412531375885, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.0335286483168602, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1126, + "grad_norm": 1.3405133485794067, + "kl": 0.10781250894069672, + "learning_rate": 9.73833573053514e-07, + "loss": 0.0108, + "num_tokens": 6513573.0, + "reward": 0.82861328125, + "reward_std": 0.03741041570901871, + "rewards//mean": 0.82861328125, + "rewards//std": 0.04622199386358261, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1128, + "grad_norm": 1.211061716079712, + "kl": 0.12247586343437433, + "learning_rate": 9.737321660109766e-07, + "loss": 0.0122, + "num_tokens": 6525013.0, + "reward": 0.8353271484375, + "reward_std": 0.02499314770102501, + "rewards//mean": 0.8353271484375, + "rewards//std": 0.02750905603170395, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.113, + "grad_norm": 1.2218048572540283, + "kl": 0.1011108374223113, + "learning_rate": 9.73630568149179e-07, + "loss": 0.0101, + "num_tokens": 6536573.0, + "reward": 0.80743408203125, + "reward_std": 0.023550860583782196, + "rewards//mean": 0.80743408203125, + "rewards//std": 0.03435610979795456, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1132, + "grad_norm": 1.2329257726669312, + "kl": 0.1337531665340066, + "learning_rate": 9.735287795090454e-07, + "loss": 0.0051, + "num_tokens": 6548153.0, + "reward": 0.7958984375, + "reward_std": 0.01887517049908638, + "rewards//mean": 0.7958984375, + "rewards//std": 0.024240141734480858, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1134, + "grad_norm": 1.208362340927124, + "kl": 0.11809708923101425, + "learning_rate": 9.734268001315759e-07, + "loss": 0.0118, + "num_tokens": 6559657.0, + "reward": 0.8101806640625, + "reward_std": 0.03768424689769745, + "rewards//mean": 0.8101806640625, + "rewards//std": 0.04934544488787651, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1136, + "grad_norm": 1.2299914360046387, + "kl": 0.12272349651902914, + "learning_rate": 9.733246300578482e-07, + "loss": 0.0123, + "num_tokens": 6571169.0, + "reward": 0.8524169921875, + "reward_std": 0.03134138882160187, + "rewards//mean": 0.8524169921875, + "rewards//std": 0.036501284688711166, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1138, + "grad_norm": 1.187177062034607, + "kl": 0.12089210469275713, + "learning_rate": 9.73222269329016e-07, + "loss": 0.0121, + "num_tokens": 6582641.0, + "reward": 0.82867431640625, + "reward_std": 0.02502138540148735, + "rewards//mean": 0.82867431640625, + "rewards//std": 0.028150025755167007, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.114, + "grad_norm": 1.1883318424224854, + "kl": 0.10490340273827314, + "learning_rate": 9.731197179863103e-07, + "loss": 0.0105, + "num_tokens": 6594289.0, + "reward": 0.80029296875, + "reward_std": 0.023789796978235245, + "rewards//mean": 0.80029296875, + "rewards//std": 0.027040911838412285, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1142, + "grad_norm": 1.202166199684143, + "kl": 0.13180434796959162, + "learning_rate": 9.730169760710385e-07, + "loss": 0.0123, + "num_tokens": 6605911.0, + "reward": 0.83685302734375, + "reward_std": 0.029161013662815094, + "rewards//mean": 0.83685302734375, + "rewards//std": 0.032672710716724396, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1144, + "grad_norm": 1.2971137762069702, + "kl": 0.1200775159522891, + "learning_rate": 9.729140436245856e-07, + "loss": 0.012, + "num_tokens": 6617463.0, + "reward": 0.81524658203125, + "reward_std": 0.02985677495598793, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.03196781873703003, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1146, + "grad_norm": 1.3372288942337036, + "kl": 0.1181802423670888, + "learning_rate": 9.728109206884125e-07, + "loss": 0.0118, + "num_tokens": 6629015.0, + "reward": 0.83612060546875, + "reward_std": 0.0292508564889431, + "rewards//mean": 0.83612060546875, + "rewards//std": 0.036013372242450714, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1148, + "grad_norm": 1.2086893320083618, + "kl": 0.10278946440666914, + "learning_rate": 9.72707607304057e-07, + "loss": 0.0103, + "num_tokens": 6640638.0, + "reward": 0.832763671875, + "reward_std": 0.03184591978788376, + "rewards//mean": 0.832763671875, + "rewards//std": 0.040252674371004105, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.115, + "grad_norm": 1.2741526365280151, + "kl": 0.11063980683684349, + "learning_rate": 9.726041035131338e-07, + "loss": 0.0083, + "num_tokens": 6652169.0, + "reward": 0.80828857421875, + "reward_std": 0.02222496084868908, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.028110742568969727, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1152, + "grad_norm": 1.2789784669876099, + "kl": 0.11121342796832323, + "learning_rate": 9.72500409357334e-07, + "loss": 0.0111, + "num_tokens": 6663817.0, + "reward": 0.81365966796875, + "reward_std": 0.026583004742860794, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.030263124033808708, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1154, + "grad_norm": 1.5274078845977783, + "kl": 0.11804934497922659, + "learning_rate": 9.723965248784262e-07, + "loss": 0.0111, + "num_tokens": 6675423.0, + "reward": 0.82537841796875, + "reward_std": 0.032701294869184494, + "rewards//mean": 0.82537841796875, + "rewards//std": 0.039840225130319595, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1156, + "grad_norm": 1.3856934309005737, + "kl": 0.12319510523229837, + "learning_rate": 9.722924501182546e-07, + "loss": 0.0123, + "num_tokens": 6686991.0, + "reward": 0.82275390625, + "reward_std": 0.035941991955041885, + "rewards//mean": 0.82275390625, + "rewards//std": 0.04530106112360954, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1158, + "grad_norm": 1.3114652633666992, + "kl": 0.11609841790050268, + "learning_rate": 9.721881851187405e-07, + "loss": 0.0116, + "num_tokens": 6698743.0, + "reward": 0.8114013671875, + "reward_std": 0.023209843784570694, + "rewards//mean": 0.8114013671875, + "rewards//std": 0.0314621776342392, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.116, + "grad_norm": 1.2497955560684204, + "kl": 0.11556929629296064, + "learning_rate": 9.720837299218818e-07, + "loss": 0.0078, + "num_tokens": 6710242.0, + "reward": 0.8187255859375, + "reward_std": 0.025913972407579422, + "rewards//mean": 0.8187255859375, + "rewards//std": 0.030335379764437675, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1162, + "grad_norm": 1.2352428436279297, + "kl": 0.12888285797089338, + "learning_rate": 9.719790845697532e-07, + "loss": 0.0129, + "num_tokens": 6721746.0, + "reward": 0.81585693359375, + "reward_std": 0.03499312698841095, + "rewards//mean": 0.81585693359375, + "rewards//std": 0.04324052482843399, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1164, + "grad_norm": 1.1900992393493652, + "kl": 0.10719060245901346, + "learning_rate": 9.71874249104506e-07, + "loss": 0.0107, + "num_tokens": 6733322.0, + "reward": 0.8201904296875, + "reward_std": 0.02410704642534256, + "rewards//mean": 0.8201904296875, + "rewards//std": 0.029773274436593056, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1166, + "grad_norm": 1.2646002769470215, + "kl": 0.11360308341681957, + "learning_rate": 9.717692235683674e-07, + "loss": 0.0114, + "num_tokens": 6744954.0, + "reward": 0.818603515625, + "reward_std": 0.026904214173555374, + "rewards//mean": 0.818603515625, + "rewards//std": 0.03202228993177414, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1168, + "grad_norm": 1.176546335220337, + "kl": 0.12164522893726826, + "learning_rate": 9.716640080036423e-07, + "loss": 0.0122, + "num_tokens": 6756506.0, + "reward": 0.818359375, + "reward_std": 0.02788224071264267, + "rewards//mean": 0.818359375, + "rewards//std": 0.030385488644242287, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.117, + "grad_norm": 1.1335680484771729, + "kl": 0.14541488140821457, + "learning_rate": 9.715586024527109e-07, + "loss": 0.0145, + "num_tokens": 6768002.0, + "reward": 0.8453369140625, + "reward_std": 0.03004387952387333, + "rewards//mean": 0.8453369140625, + "rewards//std": 0.03385215625166893, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1172, + "grad_norm": 1.223003625869751, + "kl": 0.12065796460956335, + "learning_rate": 9.714530069580308e-07, + "loss": 0.0121, + "num_tokens": 6779530.0, + "reward": 0.8238525390625, + "reward_std": 0.027275700122117996, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.03796013444662094, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1174, + "grad_norm": 1.3055769205093384, + "kl": 0.10278373118489981, + "learning_rate": 9.71347221562136e-07, + "loss": 0.0023, + "num_tokens": 6791101.0, + "reward": 0.81903076171875, + "reward_std": 0.022604744881391525, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.029911944642663002, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1176, + "grad_norm": 1.4633756875991821, + "kl": 0.13203849829733372, + "learning_rate": 9.712412463076367e-07, + "loss": 0.0132, + "num_tokens": 6802677.0, + "reward": 0.82830810546875, + "reward_std": 0.026641560718417168, + "rewards//mean": 0.82830810546875, + "rewards//std": 0.030293120071291924, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.1178, + "grad_norm": 1.1674063205718994, + "kl": 0.12614594958722591, + "learning_rate": 9.711350812372196e-07, + "loss": 0.0076, + "num_tokens": 6814222.0, + "reward": 0.812744140625, + "reward_std": 0.023960445076227188, + "rewards//mean": 0.812744140625, + "rewards//std": 0.030344609171152115, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.118, + "grad_norm": 1.3106448650360107, + "kl": 0.1427105749025941, + "learning_rate": 9.710287263936483e-07, + "loss": 0.0143, + "num_tokens": 6825846.0, + "reward": 0.84454345703125, + "reward_std": 0.022686250507831573, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.03536013141274452, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1182, + "grad_norm": 1.1509559154510498, + "kl": 0.11517918296158314, + "learning_rate": 9.709221818197623e-07, + "loss": 0.0115, + "num_tokens": 6837398.0, + "reward": 0.81610107421875, + "reward_std": 0.02988544851541519, + "rewards//mean": 0.81610107421875, + "rewards//std": 0.03874064236879349, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1184, + "grad_norm": 1.4482303857803345, + "kl": 0.1256280280649662, + "learning_rate": 9.708154475584777e-07, + "loss": 0.0073, + "num_tokens": 6848921.0, + "reward": 0.82720947265625, + "reward_std": 0.023284487426280975, + "rewards//mean": 0.82720947265625, + "rewards//std": 0.026206159964203835, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1186, + "grad_norm": 1.1914942264556885, + "kl": 0.13399283960461617, + "learning_rate": 9.707085236527873e-07, + "loss": 0.0134, + "num_tokens": 6860385.0, + "reward": 0.8209228515625, + "reward_std": 0.020736508071422577, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.02750905603170395, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1188, + "grad_norm": 1.2347382307052612, + "kl": 0.129773017950356, + "learning_rate": 9.706014101457599e-07, + "loss": 0.013, + "num_tokens": 6871961.0, + "reward": 0.78192138671875, + "reward_std": 0.028897032141685486, + "rewards//mean": 0.78192138671875, + "rewards//std": 0.03247800096869469, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.119, + "grad_norm": 1.2105742692947388, + "kl": 0.1253968020901084, + "learning_rate": 9.704941070805405e-07, + "loss": -0.0118, + "num_tokens": 6883506.0, + "reward": 0.81439208984375, + "reward_std": 0.028076469898223877, + "rewards//mean": 0.81439208984375, + "rewards//std": 0.036544542759656906, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1192, + "grad_norm": 1.1256208419799805, + "kl": 0.14606273733079433, + "learning_rate": 9.70386614500351e-07, + "loss": 0.0146, + "num_tokens": 6895114.0, + "reward": 0.82080078125, + "reward_std": 0.02459372952580452, + "rewards//mean": 0.82080078125, + "rewards//std": 0.027948250994086266, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1194, + "grad_norm": 1.1593387126922607, + "kl": 0.12020003795623779, + "learning_rate": 9.702789324484896e-07, + "loss": 0.012, + "num_tokens": 6906754.0, + "reward": 0.8271484375, + "reward_std": 0.026726767420768738, + "rewards//mean": 0.8271484375, + "rewards//std": 0.03046509437263012, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1196, + "grad_norm": 1.404576301574707, + "kl": 0.12157627940177917, + "learning_rate": 9.701710609683305e-07, + "loss": 0.0122, + "num_tokens": 6918298.0, + "reward": 0.7894287109375, + "reward_std": 0.025310108438134193, + "rewards//mean": 0.7894287109375, + "rewards//std": 0.03832841292023659, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1198, + "grad_norm": 1.1148115396499634, + "kl": 0.11435713432729244, + "learning_rate": 9.700630001033243e-07, + "loss": 0.0114, + "num_tokens": 6929874.0, + "reward": 0.78790283203125, + "reward_std": 0.026443835347890854, + "rewards//mean": 0.78790283203125, + "rewards//std": 0.031084319576621056, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.12, + "grad_norm": 1.265589714050293, + "kl": 0.13790509942919016, + "learning_rate": 9.699547498969978e-07, + "loss": 0.0138, + "num_tokens": 6941538.0, + "reward": 0.80181884765625, + "reward_std": 0.02443511411547661, + "rewards//mean": 0.80181884765625, + "rewards//std": 0.03476651385426521, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1202, + "grad_norm": 1.3160189390182495, + "kl": 0.12224370520561934, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0122, + "num_tokens": 6953130.0, + "reward": 0.7861328125, + "reward_std": 0.018871434032917023, + "rewards//mean": 0.7861328125, + "rewards//std": 0.028056368231773376, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1204, + "grad_norm": 1.2451874017715454, + "kl": 0.13376090303063393, + "learning_rate": 9.69737681634873e-07, + "loss": 0.0134, + "num_tokens": 6964730.0, + "reward": 0.81591796875, + "reward_std": 0.021901370957493782, + "rewards//mean": 0.81591796875, + "rewards//std": 0.025108851492404938, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1206, + "grad_norm": 1.1979478597640991, + "kl": 0.12651814054697752, + "learning_rate": 9.696288636665097e-07, + "loss": 0.0127, + "num_tokens": 6976298.0, + "reward": 0.82763671875, + "reward_std": 0.027601445093750954, + "rewards//mean": 0.82763671875, + "rewards//std": 0.040639664977788925, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1208, + "grad_norm": 1.3017418384552002, + "kl": 0.1524014938622713, + "learning_rate": 9.695198565316964e-07, + "loss": 0.0152, + "num_tokens": 6987802.0, + "reward": 0.8096923828125, + "reward_std": 0.023282241076231003, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.028352364897727966, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.121, + "grad_norm": 1.26602041721344, + "kl": 0.14476882852613926, + "learning_rate": 9.69410660274341e-07, + "loss": 0.0145, + "num_tokens": 6999449.0, + "reward": 0.8126220703125, + "reward_std": 0.027829274535179138, + "rewards//mean": 0.8126220703125, + "rewards//std": 0.03893188014626503, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1212, + "grad_norm": 1.2432734966278076, + "kl": 0.13984739687293768, + "learning_rate": 9.693012749384277e-07, + "loss": 0.014, + "num_tokens": 7010961.0, + "reward": 0.8482666015625, + "reward_std": 0.026889599859714508, + "rewards//mean": 0.8482666015625, + "rewards//std": 0.03018731065094471, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1214, + "grad_norm": 1.3396881818771362, + "kl": 0.13147233333438635, + "learning_rate": 9.691917005680173e-07, + "loss": 0.0131, + "num_tokens": 7022433.0, + "reward": 0.82733154296875, + "reward_std": 0.03126010671257973, + "rewards//mean": 0.82733154296875, + "rewards//std": 0.036259688436985016, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1216, + "grad_norm": 1.1638662815093994, + "kl": 0.11859513446688652, + "learning_rate": 9.690819372072456e-07, + "loss": 0.0119, + "num_tokens": 7033977.0, + "reward": 0.783447265625, + "reward_std": 0.019310375675559044, + "rewards//mean": 0.783447265625, + "rewards//std": 0.023713599890470505, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1218, + "grad_norm": 1.3703309297561646, + "kl": 0.14265389181673527, + "learning_rate": 9.68971984900326e-07, + "loss": 0.0143, + "num_tokens": 7045561.0, + "reward": 0.83154296875, + "reward_std": 0.03574472665786743, + "rewards//mean": 0.83154296875, + "rewards//std": 0.03675936162471771, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.122, + "grad_norm": 1.405618667602539, + "kl": 0.12883269414305687, + "learning_rate": 9.688618436915468e-07, + "loss": 0.0032, + "num_tokens": 7057117.0, + "reward": 0.83099365234375, + "reward_std": 0.023905295878648758, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.034345973283052444, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1222, + "grad_norm": 1.2535886764526367, + "kl": 0.11675217561423779, + "learning_rate": 9.68751513625273e-07, + "loss": 0.0117, + "num_tokens": 7068685.0, + "reward": 0.828125, + "reward_std": 0.029207345098257065, + "rewards//mean": 0.828125, + "rewards//std": 0.0363718681037426, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1224, + "grad_norm": 1.2359966039657593, + "kl": 0.13727618847042322, + "learning_rate": 9.686409947459457e-07, + "loss": 0.0137, + "num_tokens": 7080453.0, + "reward": 0.82366943359375, + "reward_std": 0.027125798165798187, + "rewards//mean": 0.82366943359375, + "rewards//std": 0.03177165240049362, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1226, + "grad_norm": 1.211582899093628, + "kl": 0.11979338992387056, + "learning_rate": 9.685302870980817e-07, + "loss": 0.0082, + "num_tokens": 7092009.0, + "reward": 0.85845947265625, + "reward_std": 0.03233296796679497, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.03985883668065071, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1228, + "grad_norm": 1.3438102006912231, + "kl": 0.14547414425760508, + "learning_rate": 9.684193907262742e-07, + "loss": 0.0145, + "num_tokens": 7103569.0, + "reward": 0.8167724609375, + "reward_std": 0.024924103170633316, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.036276642233133316, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.123, + "grad_norm": 1.329016923904419, + "kl": 0.1411065962165594, + "learning_rate": 9.68308305675192e-07, + "loss": 0.0141, + "num_tokens": 7115049.0, + "reward": 0.83935546875, + "reward_std": 0.03134314343333244, + "rewards//mean": 0.83935546875, + "rewards//std": 0.03352706506848335, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1232, + "grad_norm": 1.3077123165130615, + "kl": 0.1379355899989605, + "learning_rate": 9.681970319895802e-07, + "loss": 0.0138, + "num_tokens": 7126697.0, + "reward": 0.77569580078125, + "reward_std": 0.021954728290438652, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.027523016557097435, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1234, + "grad_norm": 1.4507185220718384, + "kl": 0.14476235676556826, + "learning_rate": 9.6808556971426e-07, + "loss": 0.0145, + "num_tokens": 7138337.0, + "reward": 0.82415771484375, + "reward_std": 0.0437425896525383, + "rewards//mean": 0.82415771484375, + "rewards//std": 0.054862130433321, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1236, + "grad_norm": 1.169690728187561, + "kl": 0.13282020669430494, + "learning_rate": 9.679739188941283e-07, + "loss": 0.0104, + "num_tokens": 7149915.0, + "reward": 0.82281494140625, + "reward_std": 0.03284463286399841, + "rewards//mean": 0.82281494140625, + "rewards//std": 0.03579709306359291, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1238, + "grad_norm": 1.2285606861114502, + "kl": 0.12787367403507233, + "learning_rate": 9.678620795741582e-07, + "loss": 0.0128, + "num_tokens": 7161611.0, + "reward": 0.80615234375, + "reward_std": 0.024720409885048866, + "rewards//mean": 0.80615234375, + "rewards//std": 0.034340742975473404, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.124, + "grad_norm": 1.486730694770813, + "kl": 0.12794929649680853, + "learning_rate": 9.677500517993982e-07, + "loss": 0.0128, + "num_tokens": 7173235.0, + "reward": 0.8226318359375, + "reward_std": 0.030288420617580414, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.04129131883382797, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1242, + "grad_norm": 1.3796049356460571, + "kl": 0.12353674974292517, + "learning_rate": 9.676378356149732e-07, + "loss": 0.0124, + "num_tokens": 7184819.0, + "reward": 0.82220458984375, + "reward_std": 0.03624134138226509, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.04350925236940384, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1244, + "grad_norm": 1.1188644170761108, + "kl": 0.14867455046623945, + "learning_rate": 9.675254310660841e-07, + "loss": 0.0149, + "num_tokens": 7196371.0, + "reward": 0.8441162109375, + "reward_std": 0.022229300811886787, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.02868570387363434, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1246, + "grad_norm": 1.5610766410827637, + "kl": 0.13624810706824064, + "learning_rate": 9.674128381980071e-07, + "loss": 0.0136, + "num_tokens": 7207963.0, + "reward": 0.793212890625, + "reward_std": 0.020954517647624016, + "rewards//mean": 0.793212890625, + "rewards//std": 0.031310975551605225, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1248, + "grad_norm": 1.2414499521255493, + "kl": 0.14938489720225334, + "learning_rate": 9.67300057056095e-07, + "loss": 0.0149, + "num_tokens": 7219483.0, + "reward": 0.82470703125, + "reward_std": 0.025796670466661453, + "rewards//mean": 0.82470703125, + "rewards//std": 0.029671669006347656, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.125, + "grad_norm": 1.296709656715393, + "kl": 0.16825002804398537, + "learning_rate": 9.671870876857758e-07, + "loss": 0.0168, + "num_tokens": 7231035.0, + "reward": 0.8408203125, + "reward_std": 0.03482663631439209, + "rewards//mean": 0.8408203125, + "rewards//std": 0.04180014878511429, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1252, + "grad_norm": 1.291471004486084, + "kl": 0.13808605819940567, + "learning_rate": 9.670739301325534e-07, + "loss": 0.0111, + "num_tokens": 7242629.0, + "reward": 0.8125, + "reward_std": 0.0342724546790123, + "rewards//mean": 0.8125, + "rewards//std": 0.04191587492823601, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1254, + "grad_norm": 1.0764292478561401, + "kl": 0.15403421875089407, + "learning_rate": 9.669605844420078e-07, + "loss": 0.0154, + "num_tokens": 7254213.0, + "reward": 0.81280517578125, + "reward_std": 0.025021512061357498, + "rewards//mean": 0.81280517578125, + "rewards//std": 0.032624952495098114, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1256, + "grad_norm": 1.2726171016693115, + "kl": 0.149785284884274, + "learning_rate": 9.668470506597946e-07, + "loss": 0.015, + "num_tokens": 7265885.0, + "reward": 0.81658935546875, + "reward_std": 0.042385686188936234, + "rewards//mean": 0.81658935546875, + "rewards//std": 0.047960065305233, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1258, + "grad_norm": 1.2755024433135986, + "kl": 0.14496907405555248, + "learning_rate": 9.667333288316453e-07, + "loss": 0.0148, + "num_tokens": 7277422.0, + "reward": 0.82708740234375, + "reward_std": 0.019414519891142845, + "rewards//mean": 0.82708740234375, + "rewards//std": 0.025997377932071686, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.126, + "grad_norm": 1.1242860555648804, + "kl": 0.14610035717487335, + "learning_rate": 9.66619419003367e-07, + "loss": 0.0146, + "num_tokens": 7289046.0, + "reward": 0.8193359375, + "reward_std": 0.02854299545288086, + "rewards//mean": 0.8193359375, + "rewards//std": 0.03170398250222206, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1262, + "grad_norm": 1.1154667139053345, + "kl": 0.1466538393869996, + "learning_rate": 9.665053212208426e-07, + "loss": 0.0146, + "num_tokens": 7300636.0, + "reward": 0.8101806640625, + "reward_std": 0.027686744928359985, + "rewards//mean": 0.8101806640625, + "rewards//std": 0.03410165011882782, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1264, + "grad_norm": 1.1868302822113037, + "kl": 0.13812534511089325, + "learning_rate": 9.663910355300304e-07, + "loss": 0.0138, + "num_tokens": 7312212.0, + "reward": 0.81500244140625, + "reward_std": 0.02388088032603264, + "rewards//mean": 0.81500244140625, + "rewards//std": 0.03367028385400772, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1266, + "grad_norm": 1.1835402250289917, + "kl": 0.15147992223501205, + "learning_rate": 9.66276561976965e-07, + "loss": 0.019, + "num_tokens": 7323770.0, + "reward": 0.8419189453125, + "reward_std": 0.023815032094717026, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.02603207528591156, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.1268, + "grad_norm": 1.1192691326141357, + "kl": 0.15467112511396408, + "learning_rate": 9.661619006077561e-07, + "loss": 0.0016, + "num_tokens": 7335300.0, + "reward": 0.83819580078125, + "reward_std": 0.03290678188204765, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.03912324830889702, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.127, + "grad_norm": 1.4517428874969482, + "kl": 0.14558970741927624, + "learning_rate": 9.660470514685895e-07, + "loss": 0.0146, + "num_tokens": 7346828.0, + "reward": 0.802490234375, + "reward_std": 0.02557399310171604, + "rewards//mean": 0.802490234375, + "rewards//std": 0.028703903779387474, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1272, + "grad_norm": 1.245538353919983, + "kl": 0.13768978230655193, + "learning_rate": 9.659320146057262e-07, + "loss": 0.0138, + "num_tokens": 7358380.0, + "reward": 0.80035400390625, + "reward_std": 0.028152545914053917, + "rewards//mean": 0.80035400390625, + "rewards//std": 0.034274499863386154, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1274, + "grad_norm": 1.3385926485061646, + "kl": 0.131441293284297, + "learning_rate": 9.65816790065503e-07, + "loss": 0.0127, + "num_tokens": 7369985.0, + "reward": 0.83203125, + "reward_std": 0.026648562401533127, + "rewards//mean": 0.83203125, + "rewards//std": 0.04247267544269562, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1276, + "grad_norm": 1.0533852577209473, + "kl": 0.16539601050317287, + "learning_rate": 9.657013778943327e-07, + "loss": 0.0165, + "num_tokens": 7381521.0, + "reward": 0.84228515625, + "reward_std": 0.022608309984207153, + "rewards//mean": 0.84228515625, + "rewards//std": 0.03834598511457443, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1278, + "grad_norm": 1.1917743682861328, + "kl": 0.13295103702694178, + "learning_rate": 9.65585778138703e-07, + "loss": 0.0133, + "num_tokens": 7393089.0, + "reward": 0.806640625, + "reward_std": 0.026672042906284332, + "rewards//mean": 0.806640625, + "rewards//std": 0.03800656273961067, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.128, + "grad_norm": 1.1815310716629028, + "kl": 0.16485237702727318, + "learning_rate": 9.654699908451776e-07, + "loss": 0.0165, + "num_tokens": 7404672.0, + "reward": 0.8153076171875, + "reward_std": 0.0284893698990345, + "rewards//mean": 0.8153076171875, + "rewards//std": 0.04183761402964592, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1282, + "grad_norm": 1.2857344150543213, + "kl": 0.1408406412228942, + "learning_rate": 9.653540160603955e-07, + "loss": 0.0122, + "num_tokens": 7416232.0, + "reward": 0.78887939453125, + "reward_std": 0.024209661409258842, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.03553948178887367, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1284, + "grad_norm": 1.382019281387329, + "kl": 0.14308995753526688, + "learning_rate": 9.652378538310713e-07, + "loss": 0.0143, + "num_tokens": 7427792.0, + "reward": 0.81878662109375, + "reward_std": 0.027863318100571632, + "rewards//mean": 0.81878662109375, + "rewards//std": 0.04251608997583389, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1286, + "grad_norm": 1.3685214519500732, + "kl": 0.14664169866591692, + "learning_rate": 9.651215042039953e-07, + "loss": 0.0147, + "num_tokens": 7439368.0, + "reward": 0.85394287109375, + "reward_std": 0.028918026015162468, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.03701620176434517, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1288, + "grad_norm": 1.229391098022461, + "kl": 0.16724017821252346, + "learning_rate": 9.650049672260333e-07, + "loss": 0.0167, + "num_tokens": 7450944.0, + "reward": 0.84136962890625, + "reward_std": 0.018721679225564003, + "rewards//mean": 0.84136962890625, + "rewards//std": 0.021616345271468163, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.129, + "grad_norm": 1.0987398624420166, + "kl": 0.15099513344466686, + "learning_rate": 9.648882429441256e-07, + "loss": 0.0151, + "num_tokens": 7462480.0, + "reward": 0.85577392578125, + "reward_std": 0.02985389344394207, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.03486433997750282, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1292, + "grad_norm": 1.100364089012146, + "kl": 0.15432786010205746, + "learning_rate": 9.647713314052895e-07, + "loss": 0.0154, + "num_tokens": 7474192.0, + "reward": 0.83453369140625, + "reward_std": 0.02226937748491764, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.031252846121788025, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1294, + "grad_norm": 1.1075187921524048, + "kl": 0.1691021155565977, + "learning_rate": 9.646542326566168e-07, + "loss": 0.0167, + "num_tokens": 7485771.0, + "reward": 0.81903076171875, + "reward_std": 0.026576977223157883, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.036272209137678146, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1296, + "grad_norm": 1.2719308137893677, + "kl": 0.16073499340564013, + "learning_rate": 9.645369467452745e-07, + "loss": 0.0161, + "num_tokens": 7497419.0, + "reward": 0.79510498046875, + "reward_std": 0.018313627690076828, + "rewards//mean": 0.79510498046875, + "rewards//std": 0.021292487159371376, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1298, + "grad_norm": 1.4146422147750854, + "kl": 0.17992917820811272, + "learning_rate": 9.644194737185057e-07, + "loss": 0.0153, + "num_tokens": 7508955.0, + "reward": 0.841796875, + "reward_std": 0.030942000448703766, + "rewards//mean": 0.841796875, + "rewards//std": 0.03460421413183212, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.13, + "grad_norm": 1.1167027950286865, + "kl": 0.1580989509820938, + "learning_rate": 9.643018136236286e-07, + "loss": 0.0158, + "num_tokens": 7520595.0, + "reward": 0.79876708984375, + "reward_std": 0.021750640124082565, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.04469006881117821, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1302, + "grad_norm": 1.413482666015625, + "kl": 0.15827967785298824, + "learning_rate": 9.641839665080363e-07, + "loss": 0.0158, + "num_tokens": 7532107.0, + "reward": 0.80169677734375, + "reward_std": 0.0197550468146801, + "rewards//mean": 0.80169677734375, + "rewards//std": 0.022965045645833015, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1304, + "grad_norm": 1.150400161743164, + "kl": 0.165702604688704, + "learning_rate": 9.640659324191978e-07, + "loss": 0.0147, + "num_tokens": 7543672.0, + "reward": 0.81365966796875, + "reward_std": 0.024539198726415634, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.02798444777727127, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1306, + "grad_norm": 1.255098581314087, + "kl": 0.1565816579386592, + "learning_rate": 9.639477114046572e-07, + "loss": 0.0157, + "num_tokens": 7555280.0, + "reward": 0.86004638671875, + "reward_std": 0.0225874911993742, + "rewards//mean": 0.86004638671875, + "rewards//std": 0.02793138660490513, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1308, + "grad_norm": 1.046695590019226, + "kl": 0.16154744010418653, + "learning_rate": 9.63829303512034e-07, + "loss": 0.0162, + "num_tokens": 7566880.0, + "reward": 0.8233642578125, + "reward_std": 0.022659584879875183, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.028751065954566002, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.131, + "grad_norm": 1.0935418605804443, + "kl": 0.15650385804474354, + "learning_rate": 9.63710708789023e-07, + "loss": 0.0157, + "num_tokens": 7578440.0, + "reward": 0.8052978515625, + "reward_std": 0.019535548985004425, + "rewards//mean": 0.8052978515625, + "rewards//std": 0.025950536131858826, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1312, + "grad_norm": 1.2375555038452148, + "kl": 0.18075008317828178, + "learning_rate": 9.635919272833937e-07, + "loss": 0.0181, + "num_tokens": 7590000.0, + "reward": 0.8323974609375, + "reward_std": 0.029504738748073578, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.031340695917606354, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1314, + "grad_norm": 1.2551177740097046, + "kl": 0.16362422239035368, + "learning_rate": 9.634729590429916e-07, + "loss": 0.0164, + "num_tokens": 7601600.0, + "reward": 0.7640380859375, + "reward_std": 0.020554494112730026, + "rewards//mean": 0.7640380859375, + "rewards//std": 0.027283618226647377, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1316, + "grad_norm": 1.1948167085647583, + "kl": 0.16558035090565681, + "learning_rate": 9.63353804115737e-07, + "loss": 0.0166, + "num_tokens": 7613264.0, + "reward": 0.8492431640625, + "reward_std": 0.03233948349952698, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.041150301694869995, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1318, + "grad_norm": 1.1545871496200562, + "kl": 0.17060186248272657, + "learning_rate": 9.632344625496255e-07, + "loss": 0.0171, + "num_tokens": 7624824.0, + "reward": 0.840576171875, + "reward_std": 0.024097809568047523, + "rewards//mean": 0.840576171875, + "rewards//std": 0.027355927973985672, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.132, + "grad_norm": 1.0850441455841064, + "kl": 0.14781702496111393, + "learning_rate": 9.63114934392728e-07, + "loss": 0.0148, + "num_tokens": 7636400.0, + "reward": 0.8115234375, + "reward_std": 0.022033847868442535, + "rewards//mean": 0.8115234375, + "rewards//std": 0.0327264703810215, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1322, + "grad_norm": 1.173119068145752, + "kl": 0.16661114059388638, + "learning_rate": 9.6299521969319e-07, + "loss": 0.0167, + "num_tokens": 7648024.0, + "reward": 0.80047607421875, + "reward_std": 0.016896428540349007, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.021891212090849876, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1324, + "grad_norm": 1.296788215637207, + "kl": 0.16532842814922333, + "learning_rate": 9.628753184992333e-07, + "loss": 0.0135, + "num_tokens": 7659536.0, + "reward": 0.83087158203125, + "reward_std": 0.03235410898923874, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.04061688855290413, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1326, + "grad_norm": 1.1631090641021729, + "kl": 0.15770272724330425, + "learning_rate": 9.627552308591533e-07, + "loss": 0.0158, + "num_tokens": 7671088.0, + "reward": 0.8436279296875, + "reward_std": 0.025664018467068672, + "rewards//mean": 0.8436279296875, + "rewards//std": 0.030875490978360176, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1328, + "grad_norm": 1.1104793548583984, + "kl": 0.17702124826610088, + "learning_rate": 9.62634956821322e-07, + "loss": 0.0177, + "num_tokens": 7682656.0, + "reward": 0.80047607421875, + "reward_std": 0.017546044662594795, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.019970322027802467, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.133, + "grad_norm": 1.4134159088134766, + "kl": 0.14951214846223593, + "learning_rate": 9.625144964341852e-07, + "loss": 0.015, + "num_tokens": 7694168.0, + "reward": 0.82318115234375, + "reward_std": 0.029057979583740234, + "rewards//mean": 0.82318115234375, + "rewards//std": 0.038205139338970184, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1332, + "grad_norm": 1.2921435832977295, + "kl": 0.15267203841358423, + "learning_rate": 9.623938497462645e-07, + "loss": 0.0153, + "num_tokens": 7705768.0, + "reward": 0.81817626953125, + "reward_std": 0.023136954754590988, + "rewards//mean": 0.81817626953125, + "rewards//std": 0.027285490185022354, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1334, + "grad_norm": 1.323230266571045, + "kl": 0.16745659243315458, + "learning_rate": 9.622730168061567e-07, + "loss": 0.0149, + "num_tokens": 7717222.0, + "reward": 0.8326416015625, + "reward_std": 0.022858578711748123, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.031918831169605255, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1336, + "grad_norm": 1.1825178861618042, + "kl": 0.13435619696974754, + "learning_rate": 9.621519976625326e-07, + "loss": 0.0111, + "num_tokens": 7728839.0, + "reward": 0.7935791015625, + "reward_std": 0.030785975977778435, + "rewards//mean": 0.7935791015625, + "rewards//std": 0.03653278946876526, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1338, + "grad_norm": 1.3217600584030151, + "kl": 0.1689299065619707, + "learning_rate": 9.620307923641392e-07, + "loss": 0.0044, + "num_tokens": 7740465.0, + "reward": 0.815673828125, + "reward_std": 0.028295379132032394, + "rewards//mean": 0.815673828125, + "rewards//std": 0.031218014657497406, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.134, + "grad_norm": 1.1593748331069946, + "kl": 0.13407859299331903, + "learning_rate": 9.61909400959798e-07, + "loss": 0.0134, + "num_tokens": 7752057.0, + "reward": 0.80279541015625, + "reward_std": 0.030773909762501717, + "rewards//mean": 0.80279541015625, + "rewards//std": 0.03571326658129692, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1342, + "grad_norm": 1.212249994277954, + "kl": 0.15374301560223103, + "learning_rate": 9.617878234984054e-07, + "loss": 0.0154, + "num_tokens": 7763497.0, + "reward": 0.82281494140625, + "reward_std": 0.019249988719820976, + "rewards//mean": 0.82281494140625, + "rewards//std": 0.029946843162178993, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1344, + "grad_norm": 1.1040232181549072, + "kl": 0.1661779684945941, + "learning_rate": 9.616660600289327e-07, + "loss": 0.0166, + "num_tokens": 7775041.0, + "reward": 0.806396484375, + "reward_std": 0.026026234030723572, + "rewards//mean": 0.806396484375, + "rewards//std": 0.030717460438609123, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1346, + "grad_norm": 1.2002105712890625, + "kl": 0.1672296728938818, + "learning_rate": 9.615441106004262e-07, + "loss": 0.0167, + "num_tokens": 7786553.0, + "reward": 0.82696533203125, + "reward_std": 0.028828883543610573, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.03560246154665947, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1348, + "grad_norm": 1.442652702331543, + "kl": 0.17829453386366367, + "learning_rate": 9.614219752620072e-07, + "loss": 0.0178, + "num_tokens": 7798257.0, + "reward": 0.823486328125, + "reward_std": 0.03340047597885132, + "rewards//mean": 0.823486328125, + "rewards//std": 0.036543767899274826, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.135, + "grad_norm": 1.1492503881454468, + "kl": 0.1543736569583416, + "learning_rate": 9.612996540628717e-07, + "loss": 0.0154, + "num_tokens": 7809785.0, + "reward": 0.8116455078125, + "reward_std": 0.030009740963578224, + "rewards//mean": 0.8116455078125, + "rewards//std": 0.04296857491135597, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1352, + "grad_norm": 1.5224509239196777, + "kl": 0.1540409680455923, + "learning_rate": 9.611771470522907e-07, + "loss": 0.0154, + "num_tokens": 7821353.0, + "reward": 0.8074951171875, + "reward_std": 0.02628636546432972, + "rewards//mean": 0.8074951171875, + "rewards//std": 0.030965572223067284, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1354, + "grad_norm": 1.3093997240066528, + "kl": 0.174217676743865, + "learning_rate": 9.6105445427961e-07, + "loss": 0.0174, + "num_tokens": 7832873.0, + "reward": 0.82147216796875, + "reward_std": 0.02061517909169197, + "rewards//mean": 0.82147216796875, + "rewards//std": 0.022181103006005287, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1356, + "grad_norm": 1.2596698999404907, + "kl": 0.17036225646734238, + "learning_rate": 9.609315757942502e-07, + "loss": 0.017, + "num_tokens": 7844417.0, + "reward": 0.8431396484375, + "reward_std": 0.03238525241613388, + "rewards//mean": 0.8431396484375, + "rewards//std": 0.037052836269140244, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1358, + "grad_norm": 1.2148748636245728, + "kl": 0.17456241976469755, + "learning_rate": 9.608085116457068e-07, + "loss": 0.0175, + "num_tokens": 7855977.0, + "reward": 0.83978271484375, + "reward_std": 0.024061156436800957, + "rewards//mean": 0.83978271484375, + "rewards//std": 0.03043719381093979, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.136, + "grad_norm": 1.184975504875183, + "kl": 0.16744863986968994, + "learning_rate": 9.606852618835502e-07, + "loss": 0.0167, + "num_tokens": 7867569.0, + "reward": 0.83270263671875, + "reward_std": 0.025440191850066185, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.028293240815401077, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1362, + "grad_norm": 1.2261731624603271, + "kl": 0.16495999135077, + "learning_rate": 9.60561826557425e-07, + "loss": 0.0165, + "num_tokens": 7879041.0, + "reward": 0.83660888671875, + "reward_std": 0.03255604952573776, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.03766443207859993, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1364, + "grad_norm": 1.232954502105713, + "kl": 0.1781325088813901, + "learning_rate": 9.604382057170512e-07, + "loss": 0.0178, + "num_tokens": 7890617.0, + "reward": 0.8291015625, + "reward_std": 0.026488471776247025, + "rewards//mean": 0.8291015625, + "rewards//std": 0.033087119460105896, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1366, + "grad_norm": 1.2845312356948853, + "kl": 0.17245538905262947, + "learning_rate": 9.603143994122232e-07, + "loss": 0.0172, + "num_tokens": 7902161.0, + "reward": 0.76483154296875, + "reward_std": 0.01435815542936325, + "rewards//mean": 0.76483154296875, + "rewards//std": 0.027762871235609055, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1368, + "grad_norm": 1.386894941329956, + "kl": 0.16626290045678616, + "learning_rate": 9.601904076928102e-07, + "loss": 0.0166, + "num_tokens": 7913705.0, + "reward": 0.7999267578125, + "reward_std": 0.024868233129382133, + "rewards//mean": 0.7999267578125, + "rewards//std": 0.02858208492398262, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.137, + "grad_norm": 1.24199378490448, + "kl": 0.17786473967134953, + "learning_rate": 9.60066230608756e-07, + "loss": 0.0178, + "num_tokens": 7925297.0, + "reward": 0.8306884765625, + "reward_std": 0.02287285588681698, + "rewards//mean": 0.8306884765625, + "rewards//std": 0.026650426909327507, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1372, + "grad_norm": 1.2597159147262573, + "kl": 0.2000079806894064, + "learning_rate": 9.599418682100792e-07, + "loss": 0.02, + "num_tokens": 7936777.0, + "reward": 0.851806640625, + "reward_std": 0.023689093068242073, + "rewards//mean": 0.851806640625, + "rewards//std": 0.027026351541280746, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1374, + "grad_norm": 1.6263643503189087, + "kl": 0.180186465382576, + "learning_rate": 9.598173205468727e-07, + "loss": 0.0123, + "num_tokens": 7948306.0, + "reward": 0.82879638671875, + "reward_std": 0.027122747153043747, + "rewards//mean": 0.82879638671875, + "rewards//std": 0.036081403493881226, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1376, + "grad_norm": 1.2597159147262573, + "kl": 0.16254964284598827, + "learning_rate": 9.596925876693047e-07, + "loss": 0.0163, + "num_tokens": 7959954.0, + "reward": 0.81817626953125, + "reward_std": 0.0317290723323822, + "rewards//mean": 0.81817626953125, + "rewards//std": 0.03901863843202591, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1378, + "grad_norm": 1.2642208337783813, + "kl": 0.19745133072137833, + "learning_rate": 9.595676696276171e-07, + "loss": 0.0197, + "num_tokens": 7971466.0, + "reward": 0.7923583984375, + "reward_std": 0.028849199414253235, + "rewards//mean": 0.7923583984375, + "rewards//std": 0.033683598041534424, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.138, + "grad_norm": 1.1715487241744995, + "kl": 0.19801762141287327, + "learning_rate": 9.594425664721274e-07, + "loss": 0.0198, + "num_tokens": 7982970.0, + "reward": 0.85504150390625, + "reward_std": 0.03306799381971359, + "rewards//mean": 0.85504150390625, + "rewards//std": 0.04171382635831833, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1382, + "grad_norm": 1.326195478439331, + "kl": 0.16151238046586514, + "learning_rate": 9.593172782532267e-07, + "loss": 0.0162, + "num_tokens": 7994506.0, + "reward": 0.80511474609375, + "reward_std": 0.02267209067940712, + "rewards//mean": 0.80511474609375, + "rewards//std": 0.02494061179459095, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1384, + "grad_norm": 1.3001879453659058, + "kl": 0.15529984049499035, + "learning_rate": 9.591918050213813e-07, + "loss": 0.0155, + "num_tokens": 8006010.0, + "reward": 0.81317138671875, + "reward_std": 0.023817697539925575, + "rewards//mean": 0.81317138671875, + "rewards//std": 0.039522066712379456, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1386, + "grad_norm": 1.2440539598464966, + "kl": 0.17086596600711346, + "learning_rate": 9.590661468271318e-07, + "loss": 0.0171, + "num_tokens": 8017602.0, + "reward": 0.816650390625, + "reward_std": 0.024306312203407288, + "rewards//mean": 0.816650390625, + "rewards//std": 0.027942834421992302, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1388, + "grad_norm": 1.407833218574524, + "kl": 0.17386738024652004, + "learning_rate": 9.589403037210931e-07, + "loss": 0.0174, + "num_tokens": 8029130.0, + "reward": 0.7939453125, + "reward_std": 0.03092615120112896, + "rewards//mean": 0.7939453125, + "rewards//std": 0.04050534963607788, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.139, + "grad_norm": 1.3873745203018188, + "kl": 0.1803432647138834, + "learning_rate": 9.58814275753955e-07, + "loss": 0.018, + "num_tokens": 8040690.0, + "reward": 0.8226318359375, + "reward_std": 0.026437442749738693, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.03281308338046074, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1392, + "grad_norm": 1.2270647287368774, + "kl": 0.18770145252346992, + "learning_rate": 9.586880629764817e-07, + "loss": 0.0188, + "num_tokens": 8052226.0, + "reward": 0.81890869140625, + "reward_std": 0.02352830581367016, + "rewards//mean": 0.81890869140625, + "rewards//std": 0.03028462454676628, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1394, + "grad_norm": 1.2406091690063477, + "kl": 0.155813779681921, + "learning_rate": 9.585616654395112e-07, + "loss": 0.0156, + "num_tokens": 8063778.0, + "reward": 0.8057861328125, + "reward_std": 0.04226931184530258, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.050123415887355804, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1396, + "grad_norm": 1.170878291130066, + "kl": 0.18764514476060867, + "learning_rate": 9.584350831939569e-07, + "loss": 0.0151, + "num_tokens": 8075316.0, + "reward": 0.83001708984375, + "reward_std": 0.031098622828722, + "rewards//mean": 0.83001708984375, + "rewards//std": 0.037324897944927216, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1398, + "grad_norm": 1.4174830913543701, + "kl": 0.20181789249181747, + "learning_rate": 9.58308316290806e-07, + "loss": 0.0202, + "num_tokens": 8086876.0, + "reward": 0.8162841796875, + "reward_std": 0.02576325833797455, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.03155825659632683, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.14, + "grad_norm": 1.1614900827407837, + "kl": 0.17588164657354355, + "learning_rate": 9.581813647811197e-07, + "loss": 0.0176, + "num_tokens": 8098396.0, + "reward": 0.85107421875, + "reward_std": 0.022553205490112305, + "rewards//mean": 0.85107421875, + "rewards//std": 0.02770453691482544, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1402, + "grad_norm": 1.2569938898086548, + "kl": 0.13660258706659079, + "learning_rate": 9.580542287160346e-07, + "loss": 0.0137, + "num_tokens": 8109908.0, + "reward": 0.8443603515625, + "reward_std": 0.022768884897232056, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.02570672519505024, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1404, + "grad_norm": 1.1591744422912598, + "kl": 0.18572602048516273, + "learning_rate": 9.579269081467613e-07, + "loss": 0.0186, + "num_tokens": 8121396.0, + "reward": 0.836181640625, + "reward_std": 0.019829990342259407, + "rewards//mean": 0.836181640625, + "rewards//std": 0.027873404324054718, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1406, + "grad_norm": 1.3924274444580078, + "kl": 0.16767912358045578, + "learning_rate": 9.57799403124584e-07, + "loss": 0.0168, + "num_tokens": 8132948.0, + "reward": 0.85675048828125, + "reward_std": 0.033882979303598404, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.04334786534309387, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1408, + "grad_norm": 1.2855558395385742, + "kl": 0.1725648883730173, + "learning_rate": 9.576717137008617e-07, + "loss": 0.0173, + "num_tokens": 8144492.0, + "reward": 0.79974365234375, + "reward_std": 0.021472381427884102, + "rewards//mean": 0.79974365234375, + "rewards//std": 0.04177837073802948, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.141, + "grad_norm": 1.3780018091201782, + "kl": 0.16287734918296337, + "learning_rate": 9.575438399270278e-07, + "loss": 0.0163, + "num_tokens": 8156020.0, + "reward": 0.82257080078125, + "reward_std": 0.03323841094970703, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.042857591062784195, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1412, + "grad_norm": 1.257286548614502, + "kl": 0.19834582600742579, + "learning_rate": 9.5741578185459e-07, + "loss": 0.0198, + "num_tokens": 8167708.0, + "reward": 0.8389892578125, + "reward_std": 0.02776196226477623, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.03317826986312866, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1414, + "grad_norm": 1.2907512187957764, + "kl": 0.1812795028090477, + "learning_rate": 9.572875395351301e-07, + "loss": 0.0181, + "num_tokens": 8179372.0, + "reward": 0.81396484375, + "reward_std": 0.020484592765569687, + "rewards//mean": 0.81396484375, + "rewards//std": 0.026816053315997124, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1416, + "grad_norm": 1.3265764713287354, + "kl": 0.20112445391714573, + "learning_rate": 9.571591130203037e-07, + "loss": 0.0201, + "num_tokens": 8190932.0, + "reward": 0.853515625, + "reward_std": 0.025886092334985733, + "rewards//mean": 0.853515625, + "rewards//std": 0.029454562813043594, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1418, + "grad_norm": 1.241817831993103, + "kl": 0.20056075416505337, + "learning_rate": 9.570305023618415e-07, + "loss": 0.0201, + "num_tokens": 8202524.0, + "reward": 0.79925537109375, + "reward_std": 0.024715334177017212, + "rewards//mean": 0.79925537109375, + "rewards//std": 0.036128781735897064, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.142, + "grad_norm": 1.2905417680740356, + "kl": 0.1693292148411274, + "learning_rate": 9.569017076115475e-07, + "loss": 0.0169, + "num_tokens": 8213988.0, + "reward": 0.83428955078125, + "reward_std": 0.026783134788274765, + "rewards//mean": 0.83428955078125, + "rewards//std": 0.037453241646289825, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.1422, + "grad_norm": 1.2343063354492188, + "kl": 0.17518947832286358, + "learning_rate": 9.567727288213004e-07, + "loss": 0.0182, + "num_tokens": 8225554.0, + "reward": 0.8265380859375, + "reward_std": 0.037371762096881866, + "rewards//mean": 0.8265380859375, + "rewards//std": 0.041285451501607895, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1424, + "grad_norm": 1.241417646408081, + "kl": 0.18328845873475075, + "learning_rate": 9.566435660430527e-07, + "loss": 0.0183, + "num_tokens": 8237082.0, + "reward": 0.82208251953125, + "reward_std": 0.021123241633176804, + "rewards//mean": 0.82208251953125, + "rewards//std": 0.027899935841560364, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1426, + "grad_norm": 1.2042380571365356, + "kl": 0.17893170472234488, + "learning_rate": 9.565142193288312e-07, + "loss": 0.0179, + "num_tokens": 8248674.0, + "reward": 0.84600830078125, + "reward_std": 0.02578631415963173, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.036357246339321136, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1428, + "grad_norm": 1.2684085369110107, + "kl": 0.14973272755742073, + "learning_rate": 9.563846887307368e-07, + "loss": 0.015, + "num_tokens": 8260378.0, + "reward": 0.83740234375, + "reward_std": 0.03190933167934418, + "rewards//mean": 0.83740234375, + "rewards//std": 0.03965632617473602, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.143, + "grad_norm": 1.2363412380218506, + "kl": 0.1784159429371357, + "learning_rate": 9.562549743009442e-07, + "loss": 0.0178, + "num_tokens": 8272010.0, + "reward": 0.8409423828125, + "reward_std": 0.024695763364434242, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.02605532482266426, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.1432, + "grad_norm": 1.090523600578308, + "kl": 0.19104338064789772, + "learning_rate": 9.561250760917025e-07, + "loss": 0.0024, + "num_tokens": 8283442.0, + "reward": 0.85076904296875, + "reward_std": 0.02661699242889881, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.0297342911362648, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1434, + "grad_norm": 1.2694945335388184, + "kl": 0.16356963478028774, + "learning_rate": 9.55994994155335e-07, + "loss": 0.0164, + "num_tokens": 8295026.0, + "reward": 0.79937744140625, + "reward_std": 0.021362604573369026, + "rewards//mean": 0.79937744140625, + "rewards//std": 0.036327674984931946, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1436, + "grad_norm": 1.254648208618164, + "kl": 0.17165067326277494, + "learning_rate": 9.558647285442381e-07, + "loss": 0.0172, + "num_tokens": 8306602.0, + "reward": 0.84173583984375, + "reward_std": 0.038012079894542694, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.039987754076719284, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1438, + "grad_norm": 1.2779313325881958, + "kl": 0.1906194444745779, + "learning_rate": 9.55734279310883e-07, + "loss": 0.0191, + "num_tokens": 8318138.0, + "reward": 0.837646484375, + "reward_std": 0.023986198008060455, + "rewards//mean": 0.837646484375, + "rewards//std": 0.04530841112136841, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.144, + "grad_norm": 1.1905559301376343, + "kl": 0.17019027844071388, + "learning_rate": 9.55603646507815e-07, + "loss": 0.017, + "num_tokens": 8329722.0, + "reward": 0.83990478515625, + "reward_std": 0.03434247523546219, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.03580470383167267, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1442, + "grad_norm": 1.2171837091445923, + "kl": 0.16821262426674366, + "learning_rate": 9.554728301876524e-07, + "loss": 0.0168, + "num_tokens": 8341466.0, + "reward": 0.80462646484375, + "reward_std": 0.024228330701589584, + "rewards//mean": 0.80462646484375, + "rewards//std": 0.03087175264954567, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1444, + "grad_norm": 1.5284759998321533, + "kl": 0.18386178836226463, + "learning_rate": 9.553418304030885e-07, + "loss": 0.0184, + "num_tokens": 8353154.0, + "reward": 0.82196044921875, + "reward_std": 0.03375793248414993, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.04037241265177727, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1446, + "grad_norm": 1.2429957389831543, + "kl": 0.1795783657580614, + "learning_rate": 9.552106472068897e-07, + "loss": 0.018, + "num_tokens": 8364682.0, + "reward": 0.8236083984375, + "reward_std": 0.024629266932606697, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.028757382184267044, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1448, + "grad_norm": 1.1352893114089966, + "kl": 0.16375361569225788, + "learning_rate": 9.550792806518967e-07, + "loss": 0.0164, + "num_tokens": 8376178.0, + "reward": 0.8455810546875, + "reward_std": 0.026610061526298523, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.03073396533727646, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.145, + "grad_norm": 1.2181274890899658, + "kl": 0.1624374706298113, + "learning_rate": 9.549477307910236e-07, + "loss": 0.0162, + "num_tokens": 8387762.0, + "reward": 0.84930419921875, + "reward_std": 0.02219606190919876, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.03038393147289753, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1452, + "grad_norm": 1.3764952421188354, + "kl": 0.19610262662172318, + "learning_rate": 9.548159976772592e-07, + "loss": 0.0196, + "num_tokens": 8399386.0, + "reward": 0.8214111328125, + "reward_std": 0.016503429040312767, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.022728921845555305, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.1454, + "grad_norm": 1.3817435503005981, + "kl": 0.17917301505804062, + "learning_rate": 9.546840813636652e-07, + "loss": 0.0096, + "num_tokens": 8410911.0, + "reward": 0.826416015625, + "reward_std": 0.022155366837978363, + "rewards//mean": 0.826416015625, + "rewards//std": 0.025609083473682404, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1456, + "grad_norm": 1.356307029724121, + "kl": 0.1846072170883417, + "learning_rate": 9.545519819033777e-07, + "loss": 0.0185, + "num_tokens": 8422519.0, + "reward": 0.85198974609375, + "reward_std": 0.029504388570785522, + "rewards//mean": 0.85198974609375, + "rewards//std": 0.03308065980672836, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1458, + "grad_norm": 1.4418641328811646, + "kl": 0.17426180839538574, + "learning_rate": 9.544196993496062e-07, + "loss": 0.0174, + "num_tokens": 8434087.0, + "reward": 0.8358154296875, + "reward_std": 0.026010572910308838, + "rewards//mean": 0.8358154296875, + "rewards//std": 0.03230912983417511, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.146, + "grad_norm": 1.335216999053955, + "kl": 0.16472369898110628, + "learning_rate": 9.54287233755634e-07, + "loss": -0.0045, + "num_tokens": 8445570.0, + "reward": 0.7838134765625, + "reward_std": 0.025014303624629974, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.03474544361233711, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1462, + "grad_norm": 1.3251895904541016, + "kl": 0.17055476736277342, + "learning_rate": 9.541545851748185e-07, + "loss": 0.0171, + "num_tokens": 8457258.0, + "reward": 0.8050537109375, + "reward_std": 0.028060048818588257, + "rewards//mean": 0.8050537109375, + "rewards//std": 0.03385215625166893, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1464, + "grad_norm": 1.123653769493103, + "kl": 0.16580532118678093, + "learning_rate": 9.540217536605905e-07, + "loss": 0.0166, + "num_tokens": 8468890.0, + "reward": 0.79364013671875, + "reward_std": 0.015876561403274536, + "rewards//mean": 0.79364013671875, + "rewards//std": 0.025263242423534393, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1466, + "grad_norm": 1.3068742752075195, + "kl": 0.17570959776639938, + "learning_rate": 9.538887392664543e-07, + "loss": 0.0176, + "num_tokens": 8480458.0, + "reward": 0.848876953125, + "reward_std": 0.023471511900424957, + "rewards//mean": 0.848876953125, + "rewards//std": 0.03522765636444092, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1468, + "grad_norm": 1.412705898284912, + "kl": 0.1709743943065405, + "learning_rate": 9.537555420459881e-07, + "loss": 0.0171, + "num_tokens": 8492130.0, + "reward": 0.83489990234375, + "reward_std": 0.025170793756842613, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.03429657593369484, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.147, + "grad_norm": 1.3378205299377441, + "kl": 0.17610961105674505, + "learning_rate": 9.53622162052844e-07, + "loss": 0.0176, + "num_tokens": 8503786.0, + "reward": 0.83740234375, + "reward_std": 0.030726861208677292, + "rewards//mean": 0.83740234375, + "rewards//std": 0.0369827039539814, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1472, + "grad_norm": 1.5207569599151611, + "kl": 0.15441089775413275, + "learning_rate": 9.534885993407474e-07, + "loss": 0.0154, + "num_tokens": 8515490.0, + "reward": 0.835693359375, + "reward_std": 0.024892468005418777, + "rewards//mean": 0.835693359375, + "rewards//std": 0.02986997365951538, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1474, + "grad_norm": 1.3241339921951294, + "kl": 0.18604753352701664, + "learning_rate": 9.53354853963497e-07, + "loss": 0.0186, + "num_tokens": 8527026.0, + "reward": 0.8370361328125, + "reward_std": 0.024627283215522766, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.030165238305926323, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1476, + "grad_norm": 1.296309232711792, + "kl": 0.18079857900738716, + "learning_rate": 9.532209259749658e-07, + "loss": 0.0181, + "num_tokens": 8538642.0, + "reward": 0.82574462890625, + "reward_std": 0.023920666426420212, + "rewards//mean": 0.82574462890625, + "rewards//std": 0.03461553156375885, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1478, + "grad_norm": 1.3901861906051636, + "kl": 0.1848797108978033, + "learning_rate": 9.530868154290996e-07, + "loss": 0.0185, + "num_tokens": 8550138.0, + "reward": 0.79083251953125, + "reward_std": 0.02546101249754429, + "rewards//mean": 0.79083251953125, + "rewards//std": 0.029330378398299217, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.148, + "grad_norm": 1.22592031955719, + "kl": 0.18400348350405693, + "learning_rate": 9.529525223799184e-07, + "loss": 0.0184, + "num_tokens": 8561738.0, + "reward": 0.8138427734375, + "reward_std": 0.024427076801657677, + "rewards//mean": 0.8138427734375, + "rewards//std": 0.027971802279353142, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1482, + "grad_norm": 1.2646276950836182, + "kl": 0.17172999307513237, + "learning_rate": 9.528180468815154e-07, + "loss": 0.0172, + "num_tokens": 8573370.0, + "reward": 0.7904052734375, + "reward_std": 0.026446310803294182, + "rewards//mean": 0.7904052734375, + "rewards//std": 0.03296405076980591, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1484, + "grad_norm": 1.1366411447525024, + "kl": 0.18617643974721432, + "learning_rate": 9.526833889880572e-07, + "loss": 0.0186, + "num_tokens": 8584978.0, + "reward": 0.84368896484375, + "reward_std": 0.024517877027392387, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.03456564247608185, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1486, + "grad_norm": 1.2210453748703003, + "kl": 0.19094041921198368, + "learning_rate": 9.525485487537841e-07, + "loss": 0.0191, + "num_tokens": 8596746.0, + "reward": 0.842041015625, + "reward_std": 0.024002017453312874, + "rewards//mean": 0.842041015625, + "rewards//std": 0.025872545316815376, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1488, + "grad_norm": 2.274757146835327, + "kl": 0.1527025420218706, + "learning_rate": 9.524135262330098e-07, + "loss": 0.0153, + "num_tokens": 8608322.0, + "reward": 0.8310546875, + "reward_std": 0.023079730570316315, + "rewards//mean": 0.8310546875, + "rewards//std": 0.030081065371632576, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.149, + "grad_norm": 1.2644245624542236, + "kl": 0.19074799120426178, + "learning_rate": 9.522783214801211e-07, + "loss": 0.0233, + "num_tokens": 8619851.0, + "reward": 0.780517578125, + "reward_std": 0.024575456976890564, + "rewards//mean": 0.780517578125, + "rewards//std": 0.03063851036131382, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1492, + "grad_norm": 1.300154447555542, + "kl": 0.18553532846271992, + "learning_rate": 9.521429345495786e-07, + "loss": 0.0186, + "num_tokens": 8631379.0, + "reward": 0.8173828125, + "reward_std": 0.02072273939847946, + "rewards//mean": 0.8173828125, + "rewards//std": 0.02680250257253647, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1494, + "grad_norm": 1.2147470712661743, + "kl": 0.1892506703734398, + "learning_rate": 9.520073654959162e-07, + "loss": 0.0189, + "num_tokens": 8642907.0, + "reward": 0.82562255859375, + "reward_std": 0.027032852172851562, + "rewards//mean": 0.82562255859375, + "rewards//std": 0.031128602102398872, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1496, + "grad_norm": 1.2872682809829712, + "kl": 0.17386277485638857, + "learning_rate": 9.518716143737409e-07, + "loss": 0.0174, + "num_tokens": 8654459.0, + "reward": 0.81646728515625, + "reward_std": 0.020333830267190933, + "rewards//mean": 0.81646728515625, + "rewards//std": 0.02941747196018696, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1498, + "grad_norm": 1.3712259531021118, + "kl": 0.1707743788138032, + "learning_rate": 9.517356812377335e-07, + "loss": 0.0171, + "num_tokens": 8666123.0, + "reward": 0.83990478515625, + "reward_std": 0.03214757889509201, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.032432761043310165, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.15, + "grad_norm": 1.322297215461731, + "kl": 0.16831934917718172, + "learning_rate": 9.515995661426477e-07, + "loss": 0.0168, + "num_tokens": 8677579.0, + "reward": 0.82452392578125, + "reward_std": 0.033078353852033615, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.0341058112680912, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1502, + "grad_norm": 1.1649476289749146, + "kl": 0.17943279445171356, + "learning_rate": 9.514632691433106e-07, + "loss": 0.0179, + "num_tokens": 8689027.0, + "reward": 0.8360595703125, + "reward_std": 0.026994312182068825, + "rewards//mean": 0.8360595703125, + "rewards//std": 0.038481347262859344, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1504, + "grad_norm": 1.347505807876587, + "kl": 0.17317680642008781, + "learning_rate": 9.513267902946227e-07, + "loss": 0.0173, + "num_tokens": 8700627.0, + "reward": 0.79425048828125, + "reward_std": 0.01325594075024128, + "rewards//mean": 0.79425048828125, + "rewards//std": 0.017772428691387177, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1506, + "grad_norm": 1.1201480627059937, + "kl": 0.18075205758213997, + "learning_rate": 9.511901296515576e-07, + "loss": 0.0181, + "num_tokens": 8712331.0, + "reward": 0.81866455078125, + "reward_std": 0.024437861517071724, + "rewards//mean": 0.81866455078125, + "rewards//std": 0.031263984739780426, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1508, + "grad_norm": 1.4168167114257812, + "kl": 0.17249411903321743, + "learning_rate": 9.510532872691623e-07, + "loss": 0.0172, + "num_tokens": 8723947.0, + "reward": 0.77142333984375, + "reward_std": 0.019046833738684654, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.022734498605132103, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.151, + "grad_norm": 1.4444797039031982, + "kl": 0.16399505455046892, + "learning_rate": 9.509162632025569e-07, + "loss": 0.0164, + "num_tokens": 8735563.0, + "reward": 0.77020263671875, + "reward_std": 0.02936282381415367, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.041826531291007996, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1512, + "grad_norm": 1.252707839012146, + "kl": 0.16560514271259308, + "learning_rate": 9.507790575069345e-07, + "loss": 0.0166, + "num_tokens": 8747099.0, + "reward": 0.8271484375, + "reward_std": 0.02346424013376236, + "rewards//mean": 0.8271484375, + "rewards//std": 0.027595041319727898, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1514, + "grad_norm": 1.4619033336639404, + "kl": 0.16299812402576208, + "learning_rate": 9.506416702375617e-07, + "loss": 0.0163, + "num_tokens": 8758651.0, + "reward": 0.8160400390625, + "reward_std": 0.04155194014310837, + "rewards//mean": 0.8160400390625, + "rewards//std": 0.04832876846194267, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1516, + "grad_norm": 1.3096078634262085, + "kl": 0.1623738892376423, + "learning_rate": 9.505041014497779e-07, + "loss": 0.0162, + "num_tokens": 8770187.0, + "reward": 0.82391357421875, + "reward_std": 0.022546377032995224, + "rewards//mean": 0.82391357421875, + "rewards//std": 0.030035173520445824, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1518, + "grad_norm": 1.270916223526001, + "kl": 0.17929606512188911, + "learning_rate": 9.503663511989962e-07, + "loss": 0.0179, + "num_tokens": 8781819.0, + "reward": 0.83221435546875, + "reward_std": 0.023652032017707825, + "rewards//mean": 0.83221435546875, + "rewards//std": 0.03671274706721306, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.152, + "grad_norm": 1.5011277198791504, + "kl": 0.18146441876888275, + "learning_rate": 9.502284195407018e-07, + "loss": 0.0177, + "num_tokens": 8793401.0, + "reward": 0.83441162109375, + "reward_std": 0.03127669543027878, + "rewards//mean": 0.83441162109375, + "rewards//std": 0.0403352752327919, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1522, + "grad_norm": 1.2292985916137695, + "kl": 0.2007558885961771, + "learning_rate": 9.500903065304539e-07, + "loss": 0.0201, + "num_tokens": 8804969.0, + "reward": 0.8363037109375, + "reward_std": 0.01916363462805748, + "rewards//mean": 0.8363037109375, + "rewards//std": 0.025852352380752563, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1524, + "grad_norm": 1.304197907447815, + "kl": 0.16484358720481396, + "learning_rate": 9.499520122238845e-07, + "loss": 0.0165, + "num_tokens": 8816513.0, + "reward": 0.8189697265625, + "reward_std": 0.02555353380739689, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.02785031497478485, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1526, + "grad_norm": 1.3729115724563599, + "kl": 0.17573919519782066, + "learning_rate": 9.498135366766982e-07, + "loss": 0.0176, + "num_tokens": 8828081.0, + "reward": 0.7899169921875, + "reward_std": 0.02667072042822838, + "rewards//mean": 0.7899169921875, + "rewards//std": 0.03668000176548958, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1528, + "grad_norm": 1.256002426147461, + "kl": 0.16181069612503052, + "learning_rate": 9.496748799446732e-07, + "loss": 0.0147, + "num_tokens": 8839496.0, + "reward": 0.7803955078125, + "reward_std": 0.023356515914201736, + "rewards//mean": 0.7803955078125, + "rewards//std": 0.025031263008713722, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.153, + "grad_norm": 1.5748552083969116, + "kl": 0.13824183866381645, + "learning_rate": 9.495360420836602e-07, + "loss": 0.0138, + "num_tokens": 8851128.0, + "reward": 0.849853515625, + "reward_std": 0.02891279198229313, + "rewards//mean": 0.849853515625, + "rewards//std": 0.04545251652598381, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1532, + "grad_norm": 1.2162176370620728, + "kl": 0.1895651388913393, + "learning_rate": 9.493970231495834e-07, + "loss": 0.0099, + "num_tokens": 8862823.0, + "reward": 0.8292236328125, + "reward_std": 0.024165930226445198, + "rewards//mean": 0.8292236328125, + "rewards//std": 0.02986668050289154, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.453125, + "epoch": 0.1534, + "grad_norm": 1.4698997735977173, + "kl": 0.19412676990032196, + "learning_rate": 9.492578231984393e-07, + "loss": -0.0048, + "num_tokens": 8874356.0, + "reward": 0.82269287109375, + "reward_std": 0.02139069139957428, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.024395687505602837, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1536, + "grad_norm": 1.3765045404434204, + "kl": 0.1934224423021078, + "learning_rate": 9.491184422862979e-07, + "loss": 0.0193, + "num_tokens": 8885932.0, + "reward": 0.84686279296875, + "reward_std": 0.032382749021053314, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.03860834613442421, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1538, + "grad_norm": 1.3879393339157104, + "kl": 0.1794891282916069, + "learning_rate": 9.489788804693015e-07, + "loss": 0.0179, + "num_tokens": 8897580.0, + "reward": 0.8306884765625, + "reward_std": 0.02584928646683693, + "rewards//mean": 0.8306884765625, + "rewards//std": 0.03275028243660927, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.154, + "grad_norm": 1.2732008695602417, + "kl": 0.1790152583271265, + "learning_rate": 9.488391378036659e-07, + "loss": 0.0179, + "num_tokens": 8909124.0, + "reward": 0.84259033203125, + "reward_std": 0.03226698189973831, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.04510914161801338, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1542, + "grad_norm": 1.3645622730255127, + "kl": 0.19990921020507812, + "learning_rate": 9.486992143456791e-07, + "loss": 0.02, + "num_tokens": 8920732.0, + "reward": 0.81494140625, + "reward_std": 0.020696304738521576, + "rewards//mean": 0.81494140625, + "rewards//std": 0.02944222465157509, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1544, + "grad_norm": 1.3314356803894043, + "kl": 0.17549362778663635, + "learning_rate": 9.485591101517026e-07, + "loss": 0.0175, + "num_tokens": 8932276.0, + "reward": 0.82623291015625, + "reward_std": 0.03083675540983677, + "rewards//mean": 0.82623291015625, + "rewards//std": 0.03327684476971626, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1546, + "grad_norm": 1.4883568286895752, + "kl": 0.1700681447982788, + "learning_rate": 9.4841882527817e-07, + "loss": 0.017, + "num_tokens": 8943740.0, + "reward": 0.78997802734375, + "reward_std": 0.02163659781217575, + "rewards//mean": 0.78997802734375, + "rewards//std": 0.024732163175940514, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1548, + "grad_norm": 1.2712140083312988, + "kl": 0.18809767253696918, + "learning_rate": 9.482783597815882e-07, + "loss": 0.0188, + "num_tokens": 8955468.0, + "reward": 0.83526611328125, + "reward_std": 0.02711787074804306, + "rewards//mean": 0.83526611328125, + "rewards//std": 0.029834922403097153, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.155, + "grad_norm": 1.5599696636199951, + "kl": 0.1908977758139372, + "learning_rate": 9.481377137185369e-07, + "loss": 0.0191, + "num_tokens": 8967028.0, + "reward": 0.79681396484375, + "reward_std": 0.021781332790851593, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.02403435856103897, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1552, + "grad_norm": 1.671816349029541, + "kl": 0.16823633294552565, + "learning_rate": 9.479968871456679e-07, + "loss": 0.0204, + "num_tokens": 8978602.0, + "reward": 0.82330322265625, + "reward_std": 0.02421765774488449, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.0272127166390419, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1554, + "grad_norm": 1.3550877571105957, + "kl": 0.18189279548823833, + "learning_rate": 9.478558801197064e-07, + "loss": 0.0182, + "num_tokens": 8990162.0, + "reward": 0.8570556640625, + "reward_std": 0.03602011501789093, + "rewards//mean": 0.8570556640625, + "rewards//std": 0.03819547966122627, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1556, + "grad_norm": 1.1510846614837646, + "kl": 0.18019517324864864, + "learning_rate": 9.4771469269745e-07, + "loss": 0.0135, + "num_tokens": 9001705.0, + "reward": 0.83282470703125, + "reward_std": 0.025815792381763458, + "rewards//mean": 0.83282470703125, + "rewards//std": 0.030908506363630295, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1558, + "grad_norm": 1.3361252546310425, + "kl": 0.2106065135449171, + "learning_rate": 9.475733249357688e-07, + "loss": 0.0211, + "num_tokens": 9013289.0, + "reward": 0.804931640625, + "reward_std": 0.018057070672512054, + "rewards//mean": 0.804931640625, + "rewards//std": 0.020254168659448624, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.156, + "grad_norm": 1.173181414604187, + "kl": 0.17866794019937515, + "learning_rate": 9.474317768916059e-07, + "loss": 0.0179, + "num_tokens": 9024865.0, + "reward": 0.77288818359375, + "reward_std": 0.014918150380253792, + "rewards//mean": 0.77288818359375, + "rewards//std": 0.023977607488632202, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1562, + "grad_norm": 1.0811703205108643, + "kl": 0.19562941789627075, + "learning_rate": 9.472900486219768e-07, + "loss": 0.0196, + "num_tokens": 9036441.0, + "reward": 0.81781005859375, + "reward_std": 0.026281028985977173, + "rewards//mean": 0.81781005859375, + "rewards//std": 0.038822222501039505, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1564, + "grad_norm": 1.1803419589996338, + "kl": 0.17828824557363987, + "learning_rate": 9.471481401839696e-07, + "loss": 0.0178, + "num_tokens": 9047921.0, + "reward": 0.84521484375, + "reward_std": 0.027898795902729034, + "rewards//mean": 0.84521484375, + "rewards//std": 0.03330963850021362, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1566, + "grad_norm": 1.3803973197937012, + "kl": 0.19156293384730816, + "learning_rate": 9.470060516347449e-07, + "loss": 0.0192, + "num_tokens": 9059521.0, + "reward": 0.80560302734375, + "reward_std": 0.026911262422800064, + "rewards//mean": 0.80560302734375, + "rewards//std": 0.032842766493558884, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1568, + "grad_norm": 1.9258133172988892, + "kl": 0.24980668164789677, + "learning_rate": 9.468637830315362e-07, + "loss": 0.025, + "num_tokens": 9071241.0, + "reward": 0.8172607421875, + "reward_std": 0.03509318456053734, + "rewards//mean": 0.8172607421875, + "rewards//std": 0.044698070734739304, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.157, + "grad_norm": 1.3370437622070312, + "kl": 0.19191950373351574, + "learning_rate": 9.467213344316491e-07, + "loss": 0.0192, + "num_tokens": 9082793.0, + "reward": 0.80364990234375, + "reward_std": 0.029320277273654938, + "rewards//mean": 0.80364990234375, + "rewards//std": 0.03916926309466362, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1572, + "grad_norm": 1.3925843238830566, + "kl": 0.17249862290918827, + "learning_rate": 9.465787058924619e-07, + "loss": 0.0172, + "num_tokens": 9094361.0, + "reward": 0.839599609375, + "reward_std": 0.030935652554035187, + "rewards//mean": 0.839599609375, + "rewards//std": 0.031280018389225006, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1574, + "grad_norm": 1.3290650844573975, + "kl": 0.1533953296020627, + "learning_rate": 9.464358974714252e-07, + "loss": 0.0153, + "num_tokens": 9105817.0, + "reward": 0.83038330078125, + "reward_std": 0.022056907415390015, + "rewards//mean": 0.83038330078125, + "rewards//std": 0.026280570775270462, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1576, + "grad_norm": 1.4990400075912476, + "kl": 0.17519839480519295, + "learning_rate": 9.462929092260628e-07, + "loss": 0.0175, + "num_tokens": 9117449.0, + "reward": 0.77978515625, + "reward_std": 0.023663587868213654, + "rewards//mean": 0.77978515625, + "rewards//std": 0.030437255278229713, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1578, + "grad_norm": 1.2342989444732666, + "kl": 0.19596992805600166, + "learning_rate": 9.461497412139696e-07, + "loss": 0.0196, + "num_tokens": 9129033.0, + "reward": 0.81463623046875, + "reward_std": 0.020205602049827576, + "rewards//mean": 0.81463623046875, + "rewards//std": 0.029214540496468544, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.158, + "grad_norm": 1.1298415660858154, + "kl": 0.16865910589694977, + "learning_rate": 9.460063934928141e-07, + "loss": 0.0099, + "num_tokens": 9140678.0, + "reward": 0.8265380859375, + "reward_std": 0.03245843946933746, + "rewards//mean": 0.8265380859375, + "rewards//std": 0.041775334626436234, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1582, + "grad_norm": 1.4504992961883545, + "kl": 0.16007444355636835, + "learning_rate": 9.458628661203366e-07, + "loss": 0.016, + "num_tokens": 9152318.0, + "reward": 0.8155517578125, + "reward_std": 0.03135843575000763, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.03532785177230835, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1584, + "grad_norm": 1.4048022031784058, + "kl": 0.20269559137523174, + "learning_rate": 9.4571915915435e-07, + "loss": 0.0203, + "num_tokens": 9163846.0, + "reward": 0.78515625, + "reward_std": 0.028918344527482986, + "rewards//mean": 0.78515625, + "rewards//std": 0.03113352507352829, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1586, + "grad_norm": 1.203253149986267, + "kl": 0.18948723003268242, + "learning_rate": 9.455752726527392e-07, + "loss": 0.0031, + "num_tokens": 9175461.0, + "reward": 0.8409423828125, + "reward_std": 0.034914735704660416, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.041987862437963486, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1588, + "grad_norm": 1.3904287815093994, + "kl": 0.19710040278732777, + "learning_rate": 9.454312066734622e-07, + "loss": 0.017, + "num_tokens": 9186954.0, + "reward": 0.82769775390625, + "reward_std": 0.031004343181848526, + "rewards//mean": 0.82769775390625, + "rewards//std": 0.03586975112557411, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.159, + "grad_norm": 1.2679919004440308, + "kl": 0.15744167752563953, + "learning_rate": 9.452869612745483e-07, + "loss": 0.0157, + "num_tokens": 9198482.0, + "reward": 0.82421875, + "reward_std": 0.024487320333719254, + "rewards//mean": 0.82421875, + "rewards//std": 0.030353587120771408, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1592, + "grad_norm": 1.4385877847671509, + "kl": 0.1916065514087677, + "learning_rate": 9.451425365140994e-07, + "loss": 0.0186, + "num_tokens": 9210145.0, + "reward": 0.77886962890625, + "reward_std": 0.027607262134552002, + "rewards//mean": 0.77886962890625, + "rewards//std": 0.04041925445199013, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1594, + "grad_norm": 1.2831510305404663, + "kl": 0.20297333225607872, + "learning_rate": 9.449979324502903e-07, + "loss": 0.0203, + "num_tokens": 9221649.0, + "reward": 0.80487060546875, + "reward_std": 0.021879777312278748, + "rewards//mean": 0.80487060546875, + "rewards//std": 0.02952379733324051, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1596, + "grad_norm": 1.3586130142211914, + "kl": 0.20058048143982887, + "learning_rate": 9.448531491413672e-07, + "loss": 0.0201, + "num_tokens": 9233121.0, + "reward": 0.85235595703125, + "reward_std": 0.026314621791243553, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.03280171751976013, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1598, + "grad_norm": 1.3591395616531372, + "kl": 0.1786019429564476, + "learning_rate": 9.447081866456487e-07, + "loss": 0.0179, + "num_tokens": 9244689.0, + "reward": 0.825439453125, + "reward_std": 0.028772998601198196, + "rewards//mean": 0.825439453125, + "rewards//std": 0.035814058035612106, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.16, + "grad_norm": 1.253978967666626, + "kl": 0.20082730427384377, + "learning_rate": 9.445630450215259e-07, + "loss": 0.0201, + "num_tokens": 9256209.0, + "reward": 0.82159423828125, + "reward_std": 0.026922937482595444, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.030433712527155876, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1602, + "grad_norm": 1.3288284540176392, + "kl": 0.2113108839839697, + "learning_rate": 9.444177243274617e-07, + "loss": 0.0211, + "num_tokens": 9267809.0, + "reward": 0.827392578125, + "reward_std": 0.023141957819461823, + "rewards//mean": 0.827392578125, + "rewards//std": 0.02646489255130291, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1604, + "grad_norm": 1.440895676612854, + "kl": 0.20162693783640862, + "learning_rate": 9.442722246219913e-07, + "loss": 0.0202, + "num_tokens": 9279417.0, + "reward": 0.83221435546875, + "reward_std": 0.032214052975177765, + "rewards//mean": 0.83221435546875, + "rewards//std": 0.03879765048623085, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1606, + "grad_norm": 1.1488749980926514, + "kl": 0.17655073013156652, + "learning_rate": 9.441265459637219e-07, + "loss": 0.0177, + "num_tokens": 9290985.0, + "reward": 0.832763671875, + "reward_std": 0.021728865802288055, + "rewards//mean": 0.832763671875, + "rewards//std": 0.023082159459590912, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1608, + "grad_norm": 1.4027955532073975, + "kl": 0.20192556455731392, + "learning_rate": 9.43980688411333e-07, + "loss": 0.0202, + "num_tokens": 9302513.0, + "reward": 0.84210205078125, + "reward_std": 0.03821011632680893, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.04755115881562233, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.161, + "grad_norm": 1.1659181118011475, + "kl": 0.18879262823611498, + "learning_rate": 9.438346520235758e-07, + "loss": 0.0189, + "num_tokens": 9314193.0, + "reward": 0.81292724609375, + "reward_std": 0.021953755989670753, + "rewards//mean": 0.81292724609375, + "rewards//std": 0.026228101924061775, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1612, + "grad_norm": 1.589997410774231, + "kl": 0.23570151440799236, + "learning_rate": 9.436884368592739e-07, + "loss": 0.0236, + "num_tokens": 9325777.0, + "reward": 0.82891845703125, + "reward_std": 0.025986898690462112, + "rewards//mean": 0.82891845703125, + "rewards//std": 0.0374039001762867, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1614, + "grad_norm": 1.2216014862060547, + "kl": 0.1839399505406618, + "learning_rate": 9.435420429773227e-07, + "loss": 0.0165, + "num_tokens": 9337361.0, + "reward": 0.8370361328125, + "reward_std": 0.038737837225198746, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.047737542539834976, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1616, + "grad_norm": 1.321151852607727, + "kl": 0.19852513261139393, + "learning_rate": 9.433954704366896e-07, + "loss": 0.0199, + "num_tokens": 9348905.0, + "reward": 0.84271240234375, + "reward_std": 0.027416184544563293, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.03505788743495941, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1618, + "grad_norm": 1.2991150617599487, + "kl": 0.20047416538000107, + "learning_rate": 9.43248719296414e-07, + "loss": 0.02, + "num_tokens": 9360545.0, + "reward": 0.8446044921875, + "reward_std": 0.03275500237941742, + "rewards//mean": 0.8446044921875, + "rewards//std": 0.03899714723229408, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.162, + "grad_norm": 1.2282274961471558, + "kl": 0.1995092649012804, + "learning_rate": 9.431017896156073e-07, + "loss": 0.02, + "num_tokens": 9372073.0, + "reward": 0.836669921875, + "reward_std": 0.040674738585948944, + "rewards//mean": 0.836669921875, + "rewards//std": 0.05108262598514557, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1622, + "grad_norm": 1.303344964981079, + "kl": 0.2110609356313944, + "learning_rate": 9.429546814534528e-07, + "loss": 0.0211, + "num_tokens": 9383681.0, + "reward": 0.8291015625, + "reward_std": 0.028057757765054703, + "rewards//mean": 0.8291015625, + "rewards//std": 0.03388991951942444, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1624, + "grad_norm": 1.2771289348602295, + "kl": 0.18383082561194897, + "learning_rate": 9.428073948692054e-07, + "loss": 0.0184, + "num_tokens": 9395257.0, + "reward": 0.81939697265625, + "reward_std": 0.029788529500365257, + "rewards//mean": 0.81939697265625, + "rewards//std": 0.037472233176231384, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1626, + "grad_norm": 1.3700964450836182, + "kl": 0.2048053052276373, + "learning_rate": 9.426599299221924e-07, + "loss": 0.0205, + "num_tokens": 9406857.0, + "reward": 0.802734375, + "reward_std": 0.01909080520272255, + "rewards//mean": 0.802734375, + "rewards//std": 0.025372734293341637, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1628, + "grad_norm": 1.2927051782608032, + "kl": 0.18815220147371292, + "learning_rate": 9.425122866718127e-07, + "loss": 0.0188, + "num_tokens": 9418441.0, + "reward": 0.83154296875, + "reward_std": 0.027286551892757416, + "rewards//mean": 0.83154296875, + "rewards//std": 0.04035858437418938, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.163, + "grad_norm": 1.3853554725646973, + "kl": 0.17085476219654083, + "learning_rate": 9.423644651775368e-07, + "loss": 0.0171, + "num_tokens": 9429969.0, + "reward": 0.79656982421875, + "reward_std": 0.02473006770014763, + "rewards//mean": 0.79656982421875, + "rewards//std": 0.0293912161141634, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1632, + "grad_norm": 1.3091953992843628, + "kl": 0.19516406580805779, + "learning_rate": 9.422164654989071e-07, + "loss": 0.0195, + "num_tokens": 9441561.0, + "reward": 0.8343505859375, + "reward_std": 0.028405632823705673, + "rewards//mean": 0.8343505859375, + "rewards//std": 0.03250904008746147, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1634, + "grad_norm": 1.2616578340530396, + "kl": 0.2186471363529563, + "learning_rate": 9.420682876955381e-07, + "loss": 0.0219, + "num_tokens": 9453145.0, + "reward": 0.81622314453125, + "reward_std": 0.025131845846772194, + "rewards//mean": 0.81622314453125, + "rewards//std": 0.033053647726774216, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1636, + "grad_norm": 1.2948408126831055, + "kl": 0.22272520698606968, + "learning_rate": 9.419199318271156e-07, + "loss": 0.0223, + "num_tokens": 9464697.0, + "reward": 0.8409423828125, + "reward_std": 0.026716060936450958, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.03562823310494423, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1638, + "grad_norm": 1.3377302885055542, + "kl": 0.17857146076858044, + "learning_rate": 9.417713979533974e-07, + "loss": 0.0179, + "num_tokens": 9476297.0, + "reward": 0.8109130859375, + "reward_std": 0.030736980959773064, + "rewards//mean": 0.8109130859375, + "rewards//std": 0.0405275784432888, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.164, + "grad_norm": 1.2774569988250732, + "kl": 0.202437624335289, + "learning_rate": 9.41622686134213e-07, + "loss": 0.0202, + "num_tokens": 9487889.0, + "reward": 0.8310546875, + "reward_std": 0.025606922805309296, + "rewards//mean": 0.8310546875, + "rewards//std": 0.027027472853660583, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1642, + "grad_norm": 1.1363234519958496, + "kl": 0.18414502777159214, + "learning_rate": 9.414737964294634e-07, + "loss": 0.0184, + "num_tokens": 9499489.0, + "reward": 0.78924560546875, + "reward_std": 0.019017454236745834, + "rewards//mean": 0.78924560546875, + "rewards//std": 0.020176947116851807, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1644, + "grad_norm": 1.396458387374878, + "kl": 0.20549481362104416, + "learning_rate": 9.413247288991215e-07, + "loss": 0.0212, + "num_tokens": 9511130.0, + "reward": 0.8135986328125, + "reward_std": 0.02763247862458229, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.030690591782331467, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1646, + "grad_norm": 1.5152183771133423, + "kl": 0.2166987583041191, + "learning_rate": 9.411754836032314e-07, + "loss": 0.0217, + "num_tokens": 9522690.0, + "reward": 0.828125, + "reward_std": 0.021165717393159866, + "rewards//mean": 0.828125, + "rewards//std": 0.024616902694106102, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1648, + "grad_norm": 1.2884459495544434, + "kl": 0.21427873708307743, + "learning_rate": 9.410260606019094e-07, + "loss": 0.0214, + "num_tokens": 9534386.0, + "reward": 0.86151123046875, + "reward_std": 0.034659095108509064, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.038095228374004364, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.165, + "grad_norm": 1.2330397367477417, + "kl": 0.19154553674161434, + "learning_rate": 9.408764599553428e-07, + "loss": 0.0206, + "num_tokens": 9545967.0, + "reward": 0.83795166015625, + "reward_std": 0.03395526856184006, + "rewards//mean": 0.83795166015625, + "rewards//std": 0.03617149218916893, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1652, + "grad_norm": 1.2450987100601196, + "kl": 0.23155339807271957, + "learning_rate": 9.40726681723791e-07, + "loss": 0.0232, + "num_tokens": 9557671.0, + "reward": 0.8221435546875, + "reward_std": 0.020683759823441505, + "rewards//mean": 0.8221435546875, + "rewards//std": 0.0316578708589077, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1654, + "grad_norm": 1.382367730140686, + "kl": 0.19356225430965424, + "learning_rate": 9.405767259675844e-07, + "loss": 0.0194, + "num_tokens": 9569207.0, + "reward": 0.84552001953125, + "reward_std": 0.031045135110616684, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.03799336776137352, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1656, + "grad_norm": 1.6347821950912476, + "kl": 0.2131052352488041, + "learning_rate": 9.404265927471253e-07, + "loss": 0.0213, + "num_tokens": 9580751.0, + "reward": 0.79217529296875, + "reward_std": 0.032313503324985504, + "rewards//mean": 0.79217529296875, + "rewards//std": 0.03793754801154137, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1658, + "grad_norm": 1.4837064743041992, + "kl": 0.2008308358490467, + "learning_rate": 9.402762821228874e-07, + "loss": 0.0201, + "num_tokens": 9592375.0, + "reward": 0.83929443359375, + "reward_std": 0.03470452129840851, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.03978661447763443, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.166, + "grad_norm": 1.5616933107376099, + "kl": 0.20950329862535, + "learning_rate": 9.401257941554156e-07, + "loss": 0.021, + "num_tokens": 9603911.0, + "reward": 0.84490966796875, + "reward_std": 0.03284776583313942, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.044274553656578064, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1662, + "grad_norm": 1.4034616947174072, + "kl": 0.21213417686522007, + "learning_rate": 9.399751289053266e-07, + "loss": 0.0212, + "num_tokens": 9615495.0, + "reward": 0.8465576171875, + "reward_std": 0.0331294909119606, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.03933725878596306, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1664, + "grad_norm": 1.3618806600570679, + "kl": 0.17845665849745274, + "learning_rate": 9.398242864333083e-07, + "loss": 0.0185, + "num_tokens": 9627121.0, + "reward": 0.83843994140625, + "reward_std": 0.02868497185409069, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.03379235044121742, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1666, + "grad_norm": 1.5605050325393677, + "kl": 0.1785293910652399, + "learning_rate": 9.396732668001199e-07, + "loss": 0.0179, + "num_tokens": 9638785.0, + "reward": 0.82659912109375, + "reward_std": 0.020667292177677155, + "rewards//mean": 0.82659912109375, + "rewards//std": 0.02671757899224758, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1668, + "grad_norm": 1.3000471591949463, + "kl": 0.2153011690825224, + "learning_rate": 9.395220700665922e-07, + "loss": 0.0215, + "num_tokens": 9650465.0, + "reward": 0.85546875, + "reward_std": 0.033757373690605164, + "rewards//mean": 0.85546875, + "rewards//std": 0.04080916568636894, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.167, + "grad_norm": 1.3008617162704468, + "kl": 0.20071235857903957, + "learning_rate": 9.393706962936274e-07, + "loss": 0.0069, + "num_tokens": 9662045.0, + "reward": 0.8121337890625, + "reward_std": 0.020532922819256783, + "rewards//mean": 0.8121337890625, + "rewards//std": 0.025293558835983276, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1672, + "grad_norm": 1.2494481801986694, + "kl": 0.16858448460698128, + "learning_rate": 9.392191455421987e-07, + "loss": 0.0169, + "num_tokens": 9673525.0, + "reward": 0.81878662109375, + "reward_std": 0.018561791628599167, + "rewards//mean": 0.81878662109375, + "rewards//std": 0.03405117616057396, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1674, + "grad_norm": 1.3018120527267456, + "kl": 0.2102839332073927, + "learning_rate": 9.390674178733507e-07, + "loss": 0.021, + "num_tokens": 9685109.0, + "reward": 0.82757568359375, + "reward_std": 0.031177718192338943, + "rewards//mean": 0.82757568359375, + "rewards//std": 0.033130042254924774, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.1676, + "grad_norm": 1.4380300045013428, + "kl": 0.21796843316406012, + "learning_rate": 9.389155133481992e-07, + "loss": 0.012, + "num_tokens": 9696594.0, + "reward": 0.85906982421875, + "reward_std": 0.03473721072077751, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.03910157456994057, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1678, + "grad_norm": 1.3134464025497437, + "kl": 0.21699927560985088, + "learning_rate": 9.387634320279314e-07, + "loss": 0.0217, + "num_tokens": 9708290.0, + "reward": 0.8397216796875, + "reward_std": 0.023894421756267548, + "rewards//mean": 0.8397216796875, + "rewards//std": 0.028763698413968086, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.168, + "grad_norm": 1.1642162799835205, + "kl": 0.1650447305291891, + "learning_rate": 9.386111739738056e-07, + "loss": 0.0165, + "num_tokens": 9719834.0, + "reward": 0.85052490234375, + "reward_std": 0.03135347738862038, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.04387649521231651, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1682, + "grad_norm": 1.4073882102966309, + "kl": 0.19880811870098114, + "learning_rate": 9.384587392471514e-07, + "loss": 0.0199, + "num_tokens": 9731370.0, + "reward": 0.81231689453125, + "reward_std": 0.02056782692670822, + "rewards//mean": 0.81231689453125, + "rewards//std": 0.028283609077334404, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1684, + "grad_norm": 1.2902801036834717, + "kl": 0.19633889570832253, + "learning_rate": 9.383061279093696e-07, + "loss": 0.0196, + "num_tokens": 9742954.0, + "reward": 0.82269287109375, + "reward_std": 0.030749669298529625, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.03664795309305191, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1686, + "grad_norm": 1.3936102390289307, + "kl": 0.18943569250404835, + "learning_rate": 9.381533400219317e-07, + "loss": 0.0189, + "num_tokens": 9754514.0, + "reward": 0.83880615234375, + "reward_std": 0.032567478716373444, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.046580810099840164, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1688, + "grad_norm": 1.4527981281280518, + "kl": 0.18527862429618835, + "learning_rate": 9.38000375646381e-07, + "loss": 0.0185, + "num_tokens": 9766090.0, + "reward": 0.8370361328125, + "reward_std": 0.03027765080332756, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.03364942595362663, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.169, + "grad_norm": 1.3724623918533325, + "kl": 0.18950714450329542, + "learning_rate": 9.378472348443314e-07, + "loss": 0.019, + "num_tokens": 9777650.0, + "reward": 0.81561279296875, + "reward_std": 0.023939888924360275, + "rewards//mean": 0.81561279296875, + "rewards//std": 0.031041434034705162, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1692, + "grad_norm": 1.2238541841506958, + "kl": 0.19523492082953453, + "learning_rate": 9.376939176774677e-07, + "loss": 0.0195, + "num_tokens": 9789282.0, + "reward": 0.836181640625, + "reward_std": 0.026551909744739532, + "rewards//mean": 0.836181640625, + "rewards//std": 0.037941984832286835, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1694, + "grad_norm": 1.5132311582565308, + "kl": 0.2114522736519575, + "learning_rate": 9.375404242075466e-07, + "loss": 0.0203, + "num_tokens": 9800882.0, + "reward": 0.77752685546875, + "reward_std": 0.030909229069948196, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.03394297510385513, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1696, + "grad_norm": 1.3832175731658936, + "kl": 0.19432266987860203, + "learning_rate": 9.373867544963948e-07, + "loss": 0.0194, + "num_tokens": 9812442.0, + "reward": 0.83135986328125, + "reward_std": 0.03154782950878143, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.0377286821603775, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1698, + "grad_norm": 1.5525991916656494, + "kl": 0.21941151469945908, + "learning_rate": 9.372329086059107e-07, + "loss": 0.0219, + "num_tokens": 9824034.0, + "reward": 0.7952880859375, + "reward_std": 0.018947824835777283, + "rewards//mean": 0.7952880859375, + "rewards//std": 0.021187452599406242, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.17, + "grad_norm": 1.315627098083496, + "kl": 0.21704397536814213, + "learning_rate": 9.370788865980632e-07, + "loss": 0.0217, + "num_tokens": 9835554.0, + "reward": 0.8505859375, + "reward_std": 0.026237601414322853, + "rewards//mean": 0.8505859375, + "rewards//std": 0.02810811810195446, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1702, + "grad_norm": 1.3297349214553833, + "kl": 0.1934987735003233, + "learning_rate": 9.369246885348925e-07, + "loss": 0.0193, + "num_tokens": 9847130.0, + "reward": 0.81829833984375, + "reward_std": 0.030703112483024597, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.037051353603601456, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1704, + "grad_norm": 1.4149894714355469, + "kl": 0.20298204012215137, + "learning_rate": 9.367703144785095e-07, + "loss": 0.0203, + "num_tokens": 9858778.0, + "reward": 0.830322265625, + "reward_std": 0.01958218775689602, + "rewards//mean": 0.830322265625, + "rewards//std": 0.02645573951303959, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1706, + "grad_norm": 1.3075114488601685, + "kl": 0.21162652224302292, + "learning_rate": 9.366157644910959e-07, + "loss": 0.0212, + "num_tokens": 9870362.0, + "reward": 0.840576171875, + "reward_std": 0.0191647931933403, + "rewards//mean": 0.840576171875, + "rewards//std": 0.029147686436772346, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1708, + "grad_norm": 1.1829180717468262, + "kl": 0.2044578641653061, + "learning_rate": 9.364610386349047e-07, + "loss": 0.0204, + "num_tokens": 9881954.0, + "reward": 0.82159423828125, + "reward_std": 0.027132539078593254, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.029643533751368523, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.171, + "grad_norm": 1.2216097116470337, + "kl": 0.23206710256636143, + "learning_rate": 9.363061369722594e-07, + "loss": 0.0232, + "num_tokens": 9893554.0, + "reward": 0.80194091796875, + "reward_std": 0.020441655069589615, + "rewards//mean": 0.80194091796875, + "rewards//std": 0.02296438440680504, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1712, + "grad_norm": 1.3271172046661377, + "kl": 0.18786464631557465, + "learning_rate": 9.361510595655544e-07, + "loss": 0.0185, + "num_tokens": 9905113.0, + "reward": 0.807373046875, + "reward_std": 0.031616728752851486, + "rewards//mean": 0.807373046875, + "rewards//std": 0.0364043191075325, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1714, + "grad_norm": 1.6254206895828247, + "kl": 0.1954382136464119, + "learning_rate": 9.359958064772546e-07, + "loss": 0.0195, + "num_tokens": 9916721.0, + "reward": 0.81195068359375, + "reward_std": 0.028249869123101234, + "rewards//mean": 0.81195068359375, + "rewards//std": 0.03321036323904991, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1716, + "grad_norm": 1.458098292350769, + "kl": 0.20321644097566605, + "learning_rate": 9.35840377769896e-07, + "loss": 0.0203, + "num_tokens": 9928225.0, + "reward": 0.82196044921875, + "reward_std": 0.03398178517818451, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.048693664371967316, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1718, + "grad_norm": 1.189704418182373, + "kl": 0.20781412906944752, + "learning_rate": 9.356847735060856e-07, + "loss": 0.0208, + "num_tokens": 9939817.0, + "reward": 0.851806640625, + "reward_std": 0.022624783217906952, + "rewards//mean": 0.851806640625, + "rewards//std": 0.03326871246099472, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.172, + "grad_norm": 1.3017792701721191, + "kl": 0.2185022346675396, + "learning_rate": 9.355289937485004e-07, + "loss": 0.0219, + "num_tokens": 9951425.0, + "reward": 0.8502197265625, + "reward_std": 0.02418399602174759, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.03674432635307312, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1722, + "grad_norm": 1.3798669576644897, + "kl": 0.20723901502788067, + "learning_rate": 9.353730385598886e-07, + "loss": 0.0207, + "num_tokens": 9962929.0, + "reward": 0.8140869140625, + "reward_std": 0.02418222278356552, + "rewards//mean": 0.8140869140625, + "rewards//std": 0.029476908966898918, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1724, + "grad_norm": 1.279707670211792, + "kl": 0.22935540229082108, + "learning_rate": 9.35216908003069e-07, + "loss": 0.0229, + "num_tokens": 9974489.0, + "reward": 0.81842041015625, + "reward_std": 0.028273917734622955, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.03018549270927906, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1726, + "grad_norm": 1.4835363626480103, + "kl": 0.2051690388470888, + "learning_rate": 9.350606021409308e-07, + "loss": 0.0205, + "num_tokens": 9986089.0, + "reward": 0.832763671875, + "reward_std": 0.018340151757001877, + "rewards//mean": 0.832763671875, + "rewards//std": 0.024238893762230873, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1728, + "grad_norm": 1.2629104852676392, + "kl": 0.20622162707149982, + "learning_rate": 9.349041210364341e-07, + "loss": 0.0206, + "num_tokens": 9997745.0, + "reward": 0.8443603515625, + "reward_std": 0.029976854100823402, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.03828098997473717, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.173, + "grad_norm": 1.235669732093811, + "kl": 0.19178683497011662, + "learning_rate": 9.347474647526095e-07, + "loss": 0.0192, + "num_tokens": 10009281.0, + "reward": 0.85888671875, + "reward_std": 0.03273060545325279, + "rewards//mean": 0.85888671875, + "rewards//std": 0.03470555692911148, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1732, + "grad_norm": 1.387406349182129, + "kl": 0.2223089076578617, + "learning_rate": 9.34590633352558e-07, + "loss": 0.0222, + "num_tokens": 10020913.0, + "reward": 0.83172607421875, + "reward_std": 0.02781064249575138, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.036453716456890106, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1734, + "grad_norm": 1.382559061050415, + "kl": 0.21379771828651428, + "learning_rate": 9.344336268994515e-07, + "loss": 0.0214, + "num_tokens": 10032489.0, + "reward": 0.84002685546875, + "reward_std": 0.031474411487579346, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.03724328801035881, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1736, + "grad_norm": 1.3722155094146729, + "kl": 0.18677115626633167, + "learning_rate": 9.342764454565319e-07, + "loss": 0.0187, + "num_tokens": 10044057.0, + "reward": 0.85546875, + "reward_std": 0.031831733882427216, + "rewards//mean": 0.85546875, + "rewards//std": 0.03924189880490303, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1738, + "grad_norm": 1.3306219577789307, + "kl": 0.23422635532915592, + "learning_rate": 9.341190890871121e-07, + "loss": 0.0234, + "num_tokens": 10055649.0, + "reward": 0.8323974609375, + "reward_std": 0.022451017051935196, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.03444087877869606, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.174, + "grad_norm": 1.241603970527649, + "kl": 0.20044328086078167, + "learning_rate": 9.339615578545752e-07, + "loss": 0.02, + "num_tokens": 10067233.0, + "reward": 0.837646484375, + "reward_std": 0.026573751121759415, + "rewards//mean": 0.837646484375, + "rewards//std": 0.03809488192200661, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1742, + "grad_norm": 1.3087713718414307, + "kl": 0.20635857246816158, + "learning_rate": 9.338038518223745e-07, + "loss": 0.0206, + "num_tokens": 10078809.0, + "reward": 0.83880615234375, + "reward_std": 0.027794383466243744, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.035735297948122025, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1744, + "grad_norm": 1.3721296787261963, + "kl": 0.2434449251741171, + "learning_rate": 9.336459710540343e-07, + "loss": 0.0243, + "num_tokens": 10090537.0, + "reward": 0.82501220703125, + "reward_std": 0.028863223269581795, + "rewards//mean": 0.82501220703125, + "rewards//std": 0.03781605139374733, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1746, + "grad_norm": 1.3388464450836182, + "kl": 0.2306215465068817, + "learning_rate": 9.334879156131488e-07, + "loss": 0.0231, + "num_tokens": 10102137.0, + "reward": 0.8333740234375, + "reward_std": 0.023839851841330528, + "rewards//mean": 0.8333740234375, + "rewards//std": 0.02987276203930378, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1748, + "grad_norm": 1.5618817806243896, + "kl": 0.23544391430914402, + "learning_rate": 9.333296855633827e-07, + "loss": 0.0235, + "num_tokens": 10113689.0, + "reward": 0.8369140625, + "reward_std": 0.026631250977516174, + "rewards//mean": 0.8369140625, + "rewards//std": 0.02866273932158947, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.175, + "grad_norm": 1.5715306997299194, + "kl": 0.2069468442350626, + "learning_rate": 9.331712809684711e-07, + "loss": 0.0207, + "num_tokens": 10125225.0, + "reward": 0.835205078125, + "reward_std": 0.029446696862578392, + "rewards//mean": 0.835205078125, + "rewards//std": 0.032976217567920685, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1752, + "grad_norm": 1.2985596656799316, + "kl": 0.2570105120539665, + "learning_rate": 9.330127018922193e-07, + "loss": 0.0257, + "num_tokens": 10136825.0, + "reward": 0.83203125, + "reward_std": 0.02960948646068573, + "rewards//mean": 0.83203125, + "rewards//std": 0.043963395059108734, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1754, + "grad_norm": 1.2525960206985474, + "kl": 0.23863573744893074, + "learning_rate": 9.32853948398503e-07, + "loss": 0.0239, + "num_tokens": 10148345.0, + "reward": 0.83203125, + "reward_std": 0.02629118785262108, + "rewards//mean": 0.83203125, + "rewards//std": 0.029123786836862564, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1756, + "grad_norm": 1.3539098501205444, + "kl": 0.21196098811924458, + "learning_rate": 9.32695020551268e-07, + "loss": 0.0212, + "num_tokens": 10159777.0, + "reward": 0.83282470703125, + "reward_std": 0.023515939712524414, + "rewards//mean": 0.83282470703125, + "rewards//std": 0.026885326951742172, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1758, + "grad_norm": 1.1708202362060547, + "kl": 0.2280040830373764, + "learning_rate": 9.325359184145305e-07, + "loss": 0.0228, + "num_tokens": 10171345.0, + "reward": 0.835205078125, + "reward_std": 0.014545939862728119, + "rewards//mean": 0.835205078125, + "rewards//std": 0.02176438830792904, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.176, + "grad_norm": 1.333824872970581, + "kl": 0.216582290828228, + "learning_rate": 9.323766420523767e-07, + "loss": 0.0217, + "num_tokens": 10182945.0, + "reward": 0.83880615234375, + "reward_std": 0.023271074518561363, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.03040335513651371, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.1762, + "grad_norm": 1.4554129838943481, + "kl": 0.20934331603348255, + "learning_rate": 9.322171915289633e-07, + "loss": 0.0145, + "num_tokens": 10194506.0, + "reward": 0.81494140625, + "reward_std": 0.03118913248181343, + "rewards//mean": 0.81494140625, + "rewards//std": 0.039644110947847366, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1764, + "grad_norm": 1.3626899719238281, + "kl": 0.21591765992343426, + "learning_rate": 9.320575669085169e-07, + "loss": 0.0216, + "num_tokens": 10205994.0, + "reward": 0.81268310546875, + "reward_std": 0.01955730840563774, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.02399338409304619, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1766, + "grad_norm": 1.4751920700073242, + "kl": 0.2083579022437334, + "learning_rate": 9.31897768255334e-07, + "loss": 0.0208, + "num_tokens": 10217570.0, + "reward": 0.8095703125, + "reward_std": 0.02480398863554001, + "rewards//mean": 0.8095703125, + "rewards//std": 0.039579909294843674, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1768, + "grad_norm": 1.2111749649047852, + "kl": 0.19959122128784657, + "learning_rate": 9.317377956337818e-07, + "loss": 0.02, + "num_tokens": 10229066.0, + "reward": 0.8223876953125, + "reward_std": 0.01600227877497673, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.022882914170622826, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.177, + "grad_norm": 1.4769270420074463, + "kl": 0.23066280782222748, + "learning_rate": 9.315776491082972e-07, + "loss": 0.0231, + "num_tokens": 10240690.0, + "reward": 0.8170166015625, + "reward_std": 0.03036269173026085, + "rewards//mean": 0.8170166015625, + "rewards//std": 0.0356673002243042, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1772, + "grad_norm": 1.4759421348571777, + "kl": 0.20866534672677517, + "learning_rate": 9.314173287433872e-07, + "loss": 0.0209, + "num_tokens": 10252266.0, + "reward": 0.8035888671875, + "reward_std": 0.02413165755569935, + "rewards//mean": 0.8035888671875, + "rewards//std": 0.03658082336187363, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1774, + "grad_norm": 1.4411066770553589, + "kl": 0.2635006923228502, + "learning_rate": 9.312568346036287e-07, + "loss": 0.0264, + "num_tokens": 10263738.0, + "reward": 0.8623046875, + "reward_std": 0.023539962247014046, + "rewards//mean": 0.8623046875, + "rewards//std": 0.033138323575258255, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1776, + "grad_norm": 1.4998944997787476, + "kl": 0.20943459682166576, + "learning_rate": 9.310961667536688e-07, + "loss": 0.0209, + "num_tokens": 10275370.0, + "reward": 0.83941650390625, + "reward_std": 0.027500297874212265, + "rewards//mean": 0.83941650390625, + "rewards//std": 0.0334012545645237, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1778, + "grad_norm": 1.5260571241378784, + "kl": 0.18918770737946033, + "learning_rate": 9.309353252582245e-07, + "loss": 0.0189, + "num_tokens": 10286946.0, + "reward": 0.82440185546875, + "reward_std": 0.021143317222595215, + "rewards//mean": 0.82440185546875, + "rewards//std": 0.027817342430353165, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.178, + "grad_norm": 2.149465560913086, + "kl": 0.23479856364428997, + "learning_rate": 9.307743101820827e-07, + "loss": 0.0235, + "num_tokens": 10298522.0, + "reward": 0.85369873046875, + "reward_std": 0.02191685326397419, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.03168051689863205, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1782, + "grad_norm": 1.4006909132003784, + "kl": 0.21995011530816555, + "learning_rate": 9.306131215901003e-07, + "loss": 0.022, + "num_tokens": 10310074.0, + "reward": 0.8365478515625, + "reward_std": 0.026711855083703995, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.029142232611775398, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1784, + "grad_norm": 1.3469215631484985, + "kl": 0.22437157668173313, + "learning_rate": 9.304517595472039e-07, + "loss": 0.0224, + "num_tokens": 10321666.0, + "reward": 0.84320068359375, + "reward_std": 0.03722769767045975, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.04081431031227112, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1786, + "grad_norm": 1.371563196182251, + "kl": 0.264225734397769, + "learning_rate": 9.302902241183903e-07, + "loss": 0.0264, + "num_tokens": 10333170.0, + "reward": 0.81939697265625, + "reward_std": 0.01781880296766758, + "rewards//mean": 0.81939697265625, + "rewards//std": 0.025890018790960312, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1788, + "grad_norm": 1.3066415786743164, + "kl": 0.2567511536180973, + "learning_rate": 9.301285153687259e-07, + "loss": 0.0257, + "num_tokens": 10344882.0, + "reward": 0.85113525390625, + "reward_std": 0.0328216478228569, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.03935971483588219, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.179, + "grad_norm": 1.5192610025405884, + "kl": 0.23922481946647167, + "learning_rate": 9.29966633363347e-07, + "loss": 0.0239, + "num_tokens": 10356538.0, + "reward": 0.8370361328125, + "reward_std": 0.02499016746878624, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.031323302537202835, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1792, + "grad_norm": 1.353825569152832, + "kl": 0.20837223157286644, + "learning_rate": 9.298045781674595e-07, + "loss": 0.0208, + "num_tokens": 10368146.0, + "reward": 0.8343505859375, + "reward_std": 0.03159391134977341, + "rewards//mean": 0.8343505859375, + "rewards//std": 0.045109350234270096, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1794, + "grad_norm": 1.577677845954895, + "kl": 0.2534589134156704, + "learning_rate": 9.296423498463395e-07, + "loss": 0.0253, + "num_tokens": 10379738.0, + "reward": 0.8184814453125, + "reward_std": 0.016925301402807236, + "rewards//mean": 0.8184814453125, + "rewards//std": 0.025543684139847755, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1796, + "grad_norm": 1.3585562705993652, + "kl": 0.24171092547476292, + "learning_rate": 9.294799484653322e-07, + "loss": 0.0242, + "num_tokens": 10391258.0, + "reward": 0.8438720703125, + "reward_std": 0.019304184243083, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.025694947689771652, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1798, + "grad_norm": 1.365598440170288, + "kl": 0.20991803146898746, + "learning_rate": 9.29317374089853e-07, + "loss": 0.021, + "num_tokens": 10402850.0, + "reward": 0.83392333984375, + "reward_std": 0.028618749231100082, + "rewards//mean": 0.83392333984375, + "rewards//std": 0.0352628193795681, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.18, + "grad_norm": 1.4758015871047974, + "kl": 0.24983062036335468, + "learning_rate": 9.291546267853869e-07, + "loss": 0.025, + "num_tokens": 10414378.0, + "reward": 0.80535888671875, + "reward_std": 0.02032564952969551, + "rewards//mean": 0.80535888671875, + "rewards//std": 0.027014518156647682, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1802, + "grad_norm": 1.431591510772705, + "kl": 0.22699766606092453, + "learning_rate": 9.289917066174885e-07, + "loss": 0.0227, + "num_tokens": 10426074.0, + "reward": 0.8541259765625, + "reward_std": 0.020868558436632156, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.026275169104337692, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1804, + "grad_norm": 1.5241913795471191, + "kl": 0.224903741851449, + "learning_rate": 9.288286136517819e-07, + "loss": 0.0225, + "num_tokens": 10437634.0, + "reward": 0.84674072265625, + "reward_std": 0.02979208528995514, + "rewards//mean": 0.84674072265625, + "rewards//std": 0.03806223347783089, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1806, + "grad_norm": 1.3658450841903687, + "kl": 0.23717258125543594, + "learning_rate": 9.28665347953961e-07, + "loss": 0.0237, + "num_tokens": 10449274.0, + "reward": 0.81878662109375, + "reward_std": 0.03554638475179672, + "rewards//mean": 0.81878662109375, + "rewards//std": 0.04256732761859894, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1808, + "grad_norm": 1.3473302125930786, + "kl": 0.23083961568772793, + "learning_rate": 9.285019095897893e-07, + "loss": 0.0231, + "num_tokens": 10460858.0, + "reward": 0.8486328125, + "reward_std": 0.02642989531159401, + "rewards//mean": 0.8486328125, + "rewards//std": 0.039260413497686386, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.181, + "grad_norm": 1.239672064781189, + "kl": 0.2134381365031004, + "learning_rate": 9.283382986250996e-07, + "loss": 0.0213, + "num_tokens": 10472274.0, + "reward": 0.8397216796875, + "reward_std": 0.026190834119915962, + "rewards//mean": 0.8397216796875, + "rewards//std": 0.033588189631700516, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1812, + "grad_norm": 1.544689655303955, + "kl": 0.22864597290754318, + "learning_rate": 9.281745151257945e-07, + "loss": 0.0229, + "num_tokens": 10483778.0, + "reward": 0.83990478515625, + "reward_std": 0.02726859226822853, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.029376275837421417, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1814, + "grad_norm": 1.4144614934921265, + "kl": 0.22864620946347713, + "learning_rate": 9.280105591578458e-07, + "loss": 0.0229, + "num_tokens": 10495330.0, + "reward": 0.8265380859375, + "reward_std": 0.02843199297785759, + "rewards//mean": 0.8265380859375, + "rewards//std": 0.030454905703663826, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1816, + "grad_norm": 1.4661659002304077, + "kl": 0.2105198372155428, + "learning_rate": 9.278464307872951e-07, + "loss": 0.0211, + "num_tokens": 10506930.0, + "reward": 0.7884521484375, + "reward_std": 0.026415321975946426, + "rewards//mean": 0.7884521484375, + "rewards//std": 0.03366921469569206, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1818, + "grad_norm": 1.6999280452728271, + "kl": 0.253387538716197, + "learning_rate": 9.276821300802533e-07, + "loss": 0.0253, + "num_tokens": 10518426.0, + "reward": 0.847412109375, + "reward_std": 0.024253420531749725, + "rewards//mean": 0.847412109375, + "rewards//std": 0.032240886241197586, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.182, + "grad_norm": 1.4599846601486206, + "kl": 0.20741555467247963, + "learning_rate": 9.275176571029006e-07, + "loss": 0.0207, + "num_tokens": 10529922.0, + "reward": 0.81231689453125, + "reward_std": 0.017622938379645348, + "rewards//mean": 0.81231689453125, + "rewards//std": 0.03010866604745388, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1822, + "grad_norm": 1.4206233024597168, + "kl": 0.26763366535305977, + "learning_rate": 9.273530119214867e-07, + "loss": 0.0268, + "num_tokens": 10541498.0, + "reward": 0.819091796875, + "reward_std": 0.02835727483034134, + "rewards//mean": 0.819091796875, + "rewards//std": 0.03829779475927353, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1824, + "grad_norm": 1.2443784475326538, + "kl": 0.24584674276411533, + "learning_rate": 9.271881946023308e-07, + "loss": 0.0246, + "num_tokens": 10553098.0, + "reward": 0.854248046875, + "reward_std": 0.014156204648315907, + "rewards//mean": 0.854248046875, + "rewards//std": 0.015990804880857468, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1826, + "grad_norm": 1.5235764980316162, + "kl": 0.21083076484501362, + "learning_rate": 9.270232052118212e-07, + "loss": 0.0211, + "num_tokens": 10564754.0, + "reward": 0.811279296875, + "reward_std": 0.02116667851805687, + "rewards//mean": 0.811279296875, + "rewards//std": 0.034826599061489105, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1828, + "grad_norm": 1.3781168460845947, + "kl": 0.24342180974781513, + "learning_rate": 9.268580438164155e-07, + "loss": 0.0243, + "num_tokens": 10576306.0, + "reward": 0.8267822265625, + "reward_std": 0.01846298761665821, + "rewards//mean": 0.8267822265625, + "rewards//std": 0.030743815004825592, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.183, + "grad_norm": 1.4158135652542114, + "kl": 0.2642615344375372, + "learning_rate": 9.266927104826408e-07, + "loss": 0.0264, + "num_tokens": 10587906.0, + "reward": 0.85235595703125, + "reward_std": 0.022961223497986794, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.028469251468777657, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1832, + "grad_norm": 1.3171461820602417, + "kl": 0.2255435287952423, + "learning_rate": 9.265272052770935e-07, + "loss": 0.0226, + "num_tokens": 10599546.0, + "reward": 0.85369873046875, + "reward_std": 0.019306007772684097, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.02329164370894432, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1834, + "grad_norm": 1.4169479608535767, + "kl": 0.2041472364217043, + "learning_rate": 9.263615282664388e-07, + "loss": 0.0204, + "num_tokens": 10611146.0, + "reward": 0.83935546875, + "reward_std": 0.036017969250679016, + "rewards//mean": 0.83935546875, + "rewards//std": 0.04276534542441368, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1836, + "grad_norm": 1.4358748197555542, + "kl": 0.23849374800920486, + "learning_rate": 9.261956795174115e-07, + "loss": 0.0238, + "num_tokens": 10623034.0, + "reward": 0.83831787109375, + "reward_std": 0.026731174439191818, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.03713379055261612, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1838, + "grad_norm": 1.3678029775619507, + "kl": 0.25326537899672985, + "learning_rate": 9.260296590968156e-07, + "loss": 0.0253, + "num_tokens": 10634618.0, + "reward": 0.84991455078125, + "reward_std": 0.021243005990982056, + "rewards//mean": 0.84991455078125, + "rewards//std": 0.028218237683176994, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.184, + "grad_norm": 1.5360409021377563, + "kl": 0.20387842319905758, + "learning_rate": 9.258634670715237e-07, + "loss": 0.0204, + "num_tokens": 10646186.0, + "reward": 0.8411865234375, + "reward_std": 0.028553113341331482, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.03371953219175339, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1842, + "grad_norm": 1.4992399215698242, + "kl": 0.24092578887939453, + "learning_rate": 9.256971035084784e-07, + "loss": 0.0241, + "num_tokens": 10657746.0, + "reward": 0.83251953125, + "reward_std": 0.02355344593524933, + "rewards//mean": 0.83251953125, + "rewards//std": 0.029177792370319366, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1844, + "grad_norm": 1.4638704061508179, + "kl": 0.20854973047971725, + "learning_rate": 9.255305684746907e-07, + "loss": 0.0209, + "num_tokens": 10669202.0, + "reward": 0.845703125, + "reward_std": 0.019243042916059494, + "rewards//mean": 0.845703125, + "rewards//std": 0.02923169918358326, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1846, + "grad_norm": 1.3434497117996216, + "kl": 0.25090443529188633, + "learning_rate": 9.253638620372408e-07, + "loss": 0.0251, + "num_tokens": 10680770.0, + "reward": 0.86053466796875, + "reward_std": 0.027412422001361847, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.030935920774936676, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1848, + "grad_norm": 1.3991310596466064, + "kl": 0.2381984293460846, + "learning_rate": 9.251969842632783e-07, + "loss": 0.0238, + "num_tokens": 10692314.0, + "reward": 0.85516357421875, + "reward_std": 0.019572610035538673, + "rewards//mean": 0.85516357421875, + "rewards//std": 0.023554034531116486, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.185, + "grad_norm": 1.3182166814804077, + "kl": 0.24672965705394745, + "learning_rate": 9.250299352200212e-07, + "loss": 0.0208, + "num_tokens": 10703896.0, + "reward": 0.84503173828125, + "reward_std": 0.017325744032859802, + "rewards//mean": 0.84503173828125, + "rewards//std": 0.024330446496605873, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1852, + "grad_norm": 1.4422526359558105, + "kl": 0.2612649854272604, + "learning_rate": 9.248627149747572e-07, + "loss": 0.0261, + "num_tokens": 10715480.0, + "reward": 0.843505859375, + "reward_std": 0.030928023159503937, + "rewards//mean": 0.843505859375, + "rewards//std": 0.036636438220739365, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1854, + "grad_norm": 1.5051567554473877, + "kl": 0.20294679515063763, + "learning_rate": 9.246953235948422e-07, + "loss": 0.0203, + "num_tokens": 10727032.0, + "reward": 0.80145263671875, + "reward_std": 0.02881399169564247, + "rewards//mean": 0.80145263671875, + "rewards//std": 0.030214063823223114, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1856, + "grad_norm": 1.3458820581436157, + "kl": 0.2173499148339033, + "learning_rate": 9.245277611477018e-07, + "loss": 0.0217, + "num_tokens": 10738608.0, + "reward": 0.79522705078125, + "reward_std": 0.019501596689224243, + "rewards//mean": 0.79522705078125, + "rewards//std": 0.02036067098379135, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1858, + "grad_norm": 1.4168897867202759, + "kl": 0.26542726531624794, + "learning_rate": 9.2436002770083e-07, + "loss": 0.0265, + "num_tokens": 10750288.0, + "reward": 0.845703125, + "reward_std": 0.027310919016599655, + "rewards//mean": 0.845703125, + "rewards//std": 0.029057180508971214, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.186, + "grad_norm": 1.2301853895187378, + "kl": 0.22849436476826668, + "learning_rate": 9.241921233217897e-07, + "loss": 0.0228, + "num_tokens": 10761896.0, + "reward": 0.84039306640625, + "reward_std": 0.027086947113275528, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.03365229442715645, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1862, + "grad_norm": 1.7055004835128784, + "kl": 0.26082620956003666, + "learning_rate": 9.240240480782129e-07, + "loss": 0.0261, + "num_tokens": 10773448.0, + "reward": 0.80010986328125, + "reward_std": 0.024139974266290665, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.0307936891913414, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1864, + "grad_norm": 1.3342863321304321, + "kl": 0.28830727748572826, + "learning_rate": 9.238558020378003e-07, + "loss": 0.0288, + "num_tokens": 10785032.0, + "reward": 0.8203125, + "reward_std": 0.0202197078615427, + "rewards//mean": 0.8203125, + "rewards//std": 0.03553655371069908, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1866, + "grad_norm": 1.3602207899093628, + "kl": 0.25212294422090054, + "learning_rate": 9.236873852683212e-07, + "loss": 0.0252, + "num_tokens": 10796617.0, + "reward": 0.84942626953125, + "reward_std": 0.021889977157115936, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.02488045021891594, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1868, + "grad_norm": 1.5897393226623535, + "kl": 0.24220798909664154, + "learning_rate": 9.235187978376141e-07, + "loss": 0.0242, + "num_tokens": 10808225.0, + "reward": 0.8284912109375, + "reward_std": 0.02973666973412037, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.03753020986914635, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.187, + "grad_norm": 1.3895819187164307, + "kl": 0.254599591717124, + "learning_rate": 9.233500398135858e-07, + "loss": 0.0255, + "num_tokens": 10819769.0, + "reward": 0.84027099609375, + "reward_std": 0.02176508679986, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02366495504975319, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1872, + "grad_norm": 1.457568883895874, + "kl": 0.29907106049358845, + "learning_rate": 9.23181111264212e-07, + "loss": 0.0299, + "num_tokens": 10831409.0, + "reward": 0.8319091796875, + "reward_std": 0.02907077595591545, + "rewards//mean": 0.8319091796875, + "rewards//std": 0.030985118821263313, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1874, + "grad_norm": 1.4835996627807617, + "kl": 0.27078401669859886, + "learning_rate": 9.230120122575375e-07, + "loss": 0.0271, + "num_tokens": 10843009.0, + "reward": 0.853515625, + "reward_std": 0.02482033707201481, + "rewards//mean": 0.853515625, + "rewards//std": 0.03294774517416954, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1876, + "grad_norm": 1.429699182510376, + "kl": 0.247146749868989, + "learning_rate": 9.228427428616748e-07, + "loss": 0.0247, + "num_tokens": 10854577.0, + "reward": 0.82635498046875, + "reward_std": 0.013353677466511726, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.017146440222859383, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1878, + "grad_norm": 1.5705890655517578, + "kl": 0.2272945325821638, + "learning_rate": 9.22673303144806e-07, + "loss": 0.0227, + "num_tokens": 10866105.0, + "reward": 0.84344482421875, + "reward_std": 0.026172209531068802, + "rewards//mean": 0.84344482421875, + "rewards//std": 0.0390767902135849, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.188, + "grad_norm": 1.5166245698928833, + "kl": 0.2395645622164011, + "learning_rate": 9.22503693175181e-07, + "loss": 0.024, + "num_tokens": 10877609.0, + "reward": 0.8604736328125, + "reward_std": 0.02679598331451416, + "rewards//mean": 0.8604736328125, + "rewards//std": 0.04327748715877533, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1882, + "grad_norm": 1.5572260618209839, + "kl": 0.25281283631920815, + "learning_rate": 9.223339130211192e-07, + "loss": 0.0253, + "num_tokens": 10889121.0, + "reward": 0.85076904296875, + "reward_std": 0.033230073750019073, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.039476845413446426, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1884, + "grad_norm": 1.536991834640503, + "kl": 0.27264239825308323, + "learning_rate": 9.221639627510075e-07, + "loss": 0.0273, + "num_tokens": 10900689.0, + "reward": 0.86053466796875, + "reward_std": 0.027147211134433746, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.03154068440198898, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1886, + "grad_norm": 1.5237945318222046, + "kl": 0.24652805365622044, + "learning_rate": 9.219938424333023e-07, + "loss": 0.0247, + "num_tokens": 10912369.0, + "reward": 0.8154296875, + "reward_std": 0.023159164935350418, + "rewards//mean": 0.8154296875, + "rewards//std": 0.024958888068795204, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1888, + "grad_norm": 1.4588371515274048, + "kl": 0.22090341709554195, + "learning_rate": 9.218235521365276e-07, + "loss": 0.0221, + "num_tokens": 10923961.0, + "reward": 0.832275390625, + "reward_std": 0.031039055436849594, + "rewards//mean": 0.832275390625, + "rewards//std": 0.03518638014793396, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.189, + "grad_norm": 1.4246803522109985, + "kl": 0.26155198737978935, + "learning_rate": 9.216530919292767e-07, + "loss": 0.0252, + "num_tokens": 10935542.0, + "reward": 0.855712890625, + "reward_std": 0.026804089546203613, + "rewards//mean": 0.855712890625, + "rewards//std": 0.02982940338551998, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1892, + "grad_norm": 1.5320826768875122, + "kl": 0.2740780934691429, + "learning_rate": 9.214824618802107e-07, + "loss": 0.0274, + "num_tokens": 10947118.0, + "reward": 0.84765625, + "reward_std": 0.03615986183285713, + "rewards//mean": 0.84765625, + "rewards//std": 0.04331961274147034, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1894, + "grad_norm": 1.3538939952850342, + "kl": 0.23413689993321896, + "learning_rate": 9.213116620580596e-07, + "loss": 0.0234, + "num_tokens": 10958590.0, + "reward": 0.80804443359375, + "reward_std": 0.017927754670381546, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.020491844043135643, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1896, + "grad_norm": 1.4479038715362549, + "kl": 0.2578558959066868, + "learning_rate": 9.211406925316212e-07, + "loss": 0.0258, + "num_tokens": 10970150.0, + "reward": 0.83087158203125, + "reward_std": 0.026697751134634018, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.03929543495178223, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1898, + "grad_norm": 1.3976179361343384, + "kl": 0.25469505228102207, + "learning_rate": 9.209695533697623e-07, + "loss": 0.0255, + "num_tokens": 10981630.0, + "reward": 0.8448486328125, + "reward_std": 0.030591590330004692, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.03742519021034241, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.19, + "grad_norm": 1.5344494581222534, + "kl": 0.23759310506284237, + "learning_rate": 9.207982446414177e-07, + "loss": 0.0238, + "num_tokens": 10993142.0, + "reward": 0.82586669921875, + "reward_std": 0.023701660335063934, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.030103638768196106, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1902, + "grad_norm": 1.2842615842819214, + "kl": 0.22950215078890324, + "learning_rate": 9.206267664155906e-07, + "loss": 0.023, + "num_tokens": 11004742.0, + "reward": 0.8494873046875, + "reward_std": 0.023130960762500763, + "rewards//mean": 0.8494873046875, + "rewards//std": 0.0389256589114666, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1904, + "grad_norm": 1.5740995407104492, + "kl": 0.23778882436454296, + "learning_rate": 9.20455118761352e-07, + "loss": 0.0238, + "num_tokens": 11016294.0, + "reward": 0.8399658203125, + "reward_std": 0.031984828412532806, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.036218177527189255, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1906, + "grad_norm": 1.550370454788208, + "kl": 0.25749493204057217, + "learning_rate": 9.202833017478421e-07, + "loss": 0.0257, + "num_tokens": 11027926.0, + "reward": 0.82257080078125, + "reward_std": 0.02662583626806736, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.03304174169898033, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1908, + "grad_norm": 1.5513206720352173, + "kl": 0.2526854109019041, + "learning_rate": 9.201113154442683e-07, + "loss": 0.0253, + "num_tokens": 11039526.0, + "reward": 0.8245849609375, + "reward_std": 0.02574532851576805, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.028183143585920334, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.191, + "grad_norm": 1.476563572883606, + "kl": 0.2479921504855156, + "learning_rate": 9.199391599199071e-07, + "loss": 0.0248, + "num_tokens": 11051062.0, + "reward": 0.82403564453125, + "reward_std": 0.02448546327650547, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.03434024378657341, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1912, + "grad_norm": 1.3150067329406738, + "kl": 0.22992484085261822, + "learning_rate": 9.197668352441023e-07, + "loss": 0.023, + "num_tokens": 11062694.0, + "reward": 0.8507080078125, + "reward_std": 0.02680184878408909, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.03297323361039162, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1914, + "grad_norm": 1.6360177993774414, + "kl": 0.25898547656834126, + "learning_rate": 9.195943414862665e-07, + "loss": 0.0259, + "num_tokens": 11074254.0, + "reward": 0.8546142578125, + "reward_std": 0.028100337833166122, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.030895095318555832, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1916, + "grad_norm": 1.5123229026794434, + "kl": 0.220817219465971, + "learning_rate": 9.194216787158804e-07, + "loss": 0.0221, + "num_tokens": 11085894.0, + "reward": 0.83837890625, + "reward_std": 0.020631397143006325, + "rewards//mean": 0.83837890625, + "rewards//std": 0.029793858528137207, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1918, + "grad_norm": 1.701706886291504, + "kl": 0.28780185617506504, + "learning_rate": 9.192488470024919e-07, + "loss": 0.0215, + "num_tokens": 11097478.0, + "reward": 0.8111572265625, + "reward_std": 0.020230982452630997, + "rewards//mean": 0.8111572265625, + "rewards//std": 0.02557448111474514, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.192, + "grad_norm": 1.4054036140441895, + "kl": 0.2378303837031126, + "learning_rate": 9.190758464157182e-07, + "loss": 0.0238, + "num_tokens": 11109014.0, + "reward": 0.85308837890625, + "reward_std": 0.02944711409509182, + "rewards//mean": 0.85308837890625, + "rewards//std": 0.03284921869635582, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1922, + "grad_norm": 1.6353435516357422, + "kl": 0.2735195513814688, + "learning_rate": 9.189026770252436e-07, + "loss": 0.0274, + "num_tokens": 11120630.0, + "reward": 0.808349609375, + "reward_std": 0.027158590033650398, + "rewards//mean": 0.808349609375, + "rewards//std": 0.03260691091418266, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1924, + "grad_norm": 1.555352807044983, + "kl": 0.24235810711979866, + "learning_rate": 9.187293389008208e-07, + "loss": 0.0242, + "num_tokens": 11132206.0, + "reward": 0.81658935546875, + "reward_std": 0.027483977377414703, + "rewards//mean": 0.81658935546875, + "rewards//std": 0.031191272661089897, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1926, + "grad_norm": 1.4186598062515259, + "kl": 0.27901776880025864, + "learning_rate": 9.185558321122704e-07, + "loss": 0.0279, + "num_tokens": 11143854.0, + "reward": 0.81591796875, + "reward_std": 0.023226400837302208, + "rewards//mean": 0.81591796875, + "rewards//std": 0.031191818416118622, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1928, + "grad_norm": 1.8651043176651, + "kl": 0.27556006237864494, + "learning_rate": 9.183821567294808e-07, + "loss": 0.0276, + "num_tokens": 11155422.0, + "reward": 0.8402099609375, + "reward_std": 0.023136194795370102, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.025245634838938713, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.193, + "grad_norm": 1.4046661853790283, + "kl": 0.2784784957766533, + "learning_rate": 9.182083128224086e-07, + "loss": 0.0278, + "num_tokens": 11167038.0, + "reward": 0.8450927734375, + "reward_std": 0.033003006130456924, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.037963323295116425, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1932, + "grad_norm": 1.601759910583496, + "kl": 0.2553866747766733, + "learning_rate": 9.180343004610779e-07, + "loss": 0.0255, + "num_tokens": 11178622.0, + "reward": 0.81341552734375, + "reward_std": 0.02547498233616352, + "rewards//mean": 0.81341552734375, + "rewards//std": 0.03265046328306198, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1934, + "grad_norm": 1.4256598949432373, + "kl": 0.28865284100174904, + "learning_rate": 9.178601197155811e-07, + "loss": 0.0289, + "num_tokens": 11190246.0, + "reward": 0.81524658203125, + "reward_std": 0.019989553838968277, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.024653080850839615, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1936, + "grad_norm": 1.454540491104126, + "kl": 0.2480900175869465, + "learning_rate": 9.176857706560779e-07, + "loss": 0.0248, + "num_tokens": 11201806.0, + "reward": 0.837890625, + "reward_std": 0.018478848040103912, + "rewards//mean": 0.837890625, + "rewards//std": 0.034799642860889435, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1938, + "grad_norm": 1.3939435482025146, + "kl": 0.21548667177557945, + "learning_rate": 9.175112533527963e-07, + "loss": 0.0215, + "num_tokens": 11213398.0, + "reward": 0.762939453125, + "reward_std": 0.01924007758498192, + "rewards//mean": 0.762939453125, + "rewards//std": 0.023498134687542915, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.194, + "grad_norm": 1.4839928150177002, + "kl": 0.27215964905917645, + "learning_rate": 9.173365678760317e-07, + "loss": 0.0272, + "num_tokens": 11225070.0, + "reward": 0.8189697265625, + "reward_std": 0.03288666158914566, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.045224640518426895, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1942, + "grad_norm": 1.4451377391815186, + "kl": 0.26093851774930954, + "learning_rate": 9.171617142961476e-07, + "loss": 0.0261, + "num_tokens": 11236638.0, + "reward": 0.820556640625, + "reward_std": 0.016359731554985046, + "rewards//mean": 0.820556640625, + "rewards//std": 0.025872545316815376, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1944, + "grad_norm": 1.5053588151931763, + "kl": 0.2702757865190506, + "learning_rate": 9.169866926835747e-07, + "loss": 0.027, + "num_tokens": 11248182.0, + "reward": 0.8214111328125, + "reward_std": 0.022168396040797234, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.025243235751986504, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1946, + "grad_norm": 1.71407949924469, + "kl": 0.2731085419654846, + "learning_rate": 9.16811503108812e-07, + "loss": 0.0273, + "num_tokens": 11259902.0, + "reward": 0.8070068359375, + "reward_std": 0.028648465871810913, + "rewards//mean": 0.8070068359375, + "rewards//std": 0.039240166544914246, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1948, + "grad_norm": 1.3914638757705688, + "kl": 0.27063815109431744, + "learning_rate": 9.166361456424257e-07, + "loss": 0.0271, + "num_tokens": 11271406.0, + "reward": 0.869873046875, + "reward_std": 0.02171408012509346, + "rewards//mean": 0.869873046875, + "rewards//std": 0.033652350306510925, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.195, + "grad_norm": 1.84746253490448, + "kl": 0.25935916788876057, + "learning_rate": 9.164606203550497e-07, + "loss": 0.0259, + "num_tokens": 11283118.0, + "reward": 0.7935791015625, + "reward_std": 0.017970670014619827, + "rewards//mean": 0.7935791015625, + "rewards//std": 0.021790454164147377, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1952, + "grad_norm": 1.437387228012085, + "kl": 0.26467661559581757, + "learning_rate": 9.162849273173856e-07, + "loss": 0.0265, + "num_tokens": 11294830.0, + "reward": 0.8388671875, + "reward_std": 0.028289856389164925, + "rewards//mean": 0.8388671875, + "rewards//std": 0.03691387549042702, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.1954, + "grad_norm": 1.4219521284103394, + "kl": 0.2682777550071478, + "learning_rate": 9.161090666002027e-07, + "loss": 0.0063, + "num_tokens": 11306336.0, + "reward": 0.83367919921875, + "reward_std": 0.024590516462922096, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.029080549255013466, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1956, + "grad_norm": 1.3217118978500366, + "kl": 0.32029956951737404, + "learning_rate": 9.159330382743373e-07, + "loss": 0.032, + "num_tokens": 11317960.0, + "reward": 0.8416748046875, + "reward_std": 0.027541209012269974, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.0317571759223938, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1958, + "grad_norm": 1.396797776222229, + "kl": 0.3137296624481678, + "learning_rate": 9.157568424106941e-07, + "loss": 0.0314, + "num_tokens": 11329560.0, + "reward": 0.8348388671875, + "reward_std": 0.02847890742123127, + "rewards//mean": 0.8348388671875, + "rewards//std": 0.030855873599648476, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.196, + "grad_norm": 1.4196707010269165, + "kl": 0.29022287391126156, + "learning_rate": 9.155804790802443e-07, + "loss": 0.029, + "num_tokens": 11341184.0, + "reward": 0.80999755859375, + "reward_std": 0.02224970981478691, + "rewards//mean": 0.80999755859375, + "rewards//std": 0.024432888254523277, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1962, + "grad_norm": 1.6202120780944824, + "kl": 0.2663966715335846, + "learning_rate": 9.154039483540272e-07, + "loss": 0.0266, + "num_tokens": 11352744.0, + "reward": 0.842041015625, + "reward_std": 0.030963459983468056, + "rewards//mean": 0.842041015625, + "rewards//std": 0.03731115534901619, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1964, + "grad_norm": 1.4408611059188843, + "kl": 0.30217561312019825, + "learning_rate": 9.152272503031495e-07, + "loss": 0.0302, + "num_tokens": 11364344.0, + "reward": 0.79205322265625, + "reward_std": 0.01769936829805374, + "rewards//mean": 0.79205322265625, + "rewards//std": 0.020419321954250336, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1966, + "grad_norm": 1.6943492889404297, + "kl": 0.2804794553667307, + "learning_rate": 9.150503849987851e-07, + "loss": 0.028, + "num_tokens": 11375872.0, + "reward": 0.8526611328125, + "reward_std": 0.02992212027311325, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.04183761402964592, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1968, + "grad_norm": 1.5078070163726807, + "kl": 0.2745595909655094, + "learning_rate": 9.14873352512175e-07, + "loss": 0.0275, + "num_tokens": 11387456.0, + "reward": 0.8321533203125, + "reward_std": 0.024885959923267365, + "rewards//mean": 0.8321533203125, + "rewards//std": 0.02853544056415558, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.197, + "grad_norm": 1.670241355895996, + "kl": 0.326170040294528, + "learning_rate": 9.146961529146284e-07, + "loss": 0.0326, + "num_tokens": 11399008.0, + "reward": 0.82269287109375, + "reward_std": 0.036040764302015305, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.042618509382009506, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1972, + "grad_norm": 1.4633352756500244, + "kl": 0.23471900634467602, + "learning_rate": 9.145187862775208e-07, + "loss": 0.0235, + "num_tokens": 11410536.0, + "reward": 0.82177734375, + "reward_std": 0.031136395409703255, + "rewards//mean": 0.82177734375, + "rewards//std": 0.04439375922083855, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1974, + "grad_norm": 1.360782265663147, + "kl": 0.2552671805024147, + "learning_rate": 9.143412526722958e-07, + "loss": 0.0255, + "num_tokens": 11422016.0, + "reward": 0.81805419921875, + "reward_std": 0.0263263750821352, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.03637722507119179, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1976, + "grad_norm": 1.5993300676345825, + "kl": 0.2882204204797745, + "learning_rate": 9.141635521704636e-07, + "loss": 0.0288, + "num_tokens": 11433656.0, + "reward": 0.80401611328125, + "reward_std": 0.03170066699385643, + "rewards//mean": 0.80401611328125, + "rewards//std": 0.03326638042926788, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1978, + "grad_norm": 1.6832809448242188, + "kl": 0.27016060426831245, + "learning_rate": 9.139856848436023e-07, + "loss": 0.027, + "num_tokens": 11445280.0, + "reward": 0.84600830078125, + "reward_std": 0.033481139689683914, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.039277710020542145, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.198, + "grad_norm": 1.459313988685608, + "kl": 0.2850813753902912, + "learning_rate": 9.138076507633565e-07, + "loss": 0.0285, + "num_tokens": 11456984.0, + "reward": 0.85003662109375, + "reward_std": 0.029661191627383232, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.03589337691664696, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1982, + "grad_norm": 1.527965784072876, + "kl": 0.3036021087318659, + "learning_rate": 9.136294500014385e-07, + "loss": 0.0304, + "num_tokens": 11468584.0, + "reward": 0.79351806640625, + "reward_std": 0.024049388244748116, + "rewards//mean": 0.79351806640625, + "rewards//std": 0.025122037157416344, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1984, + "grad_norm": 1.4633113145828247, + "kl": 0.27564706839621067, + "learning_rate": 9.134510826296276e-07, + "loss": 0.0276, + "num_tokens": 11480232.0, + "reward": 0.80792236328125, + "reward_std": 0.018862998113036156, + "rewards//mean": 0.80792236328125, + "rewards//std": 0.020734917372465134, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1986, + "grad_norm": 1.4859224557876587, + "kl": 0.2999421786516905, + "learning_rate": 9.1327254871977e-07, + "loss": 0.03, + "num_tokens": 11491768.0, + "reward": 0.86273193359375, + "reward_std": 0.035364773124456406, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.04479765519499779, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1988, + "grad_norm": 1.441278100013733, + "kl": 0.3155974745750427, + "learning_rate": 9.130938483437791e-07, + "loss": 0.0316, + "num_tokens": 11503376.0, + "reward": 0.813232421875, + "reward_std": 0.02608354389667511, + "rewards//mean": 0.813232421875, + "rewards//std": 0.03289532661437988, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.199, + "grad_norm": 1.5946621894836426, + "kl": 0.3427728917449713, + "learning_rate": 9.129149815736357e-07, + "loss": 0.0343, + "num_tokens": 11514992.0, + "reward": 0.79266357421875, + "reward_std": 0.018606171011924744, + "rewards//mean": 0.79266357421875, + "rewards//std": 0.02379954233765602, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1992, + "grad_norm": 1.6787025928497314, + "kl": 0.2702176570892334, + "learning_rate": 9.12735948481387e-07, + "loss": 0.027, + "num_tokens": 11526592.0, + "reward": 0.845458984375, + "reward_std": 0.029404642060399055, + "rewards//mean": 0.845458984375, + "rewards//std": 0.03138051927089691, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1994, + "grad_norm": 1.4939308166503906, + "kl": 0.30255262926220894, + "learning_rate": 9.125567491391475e-07, + "loss": 0.0303, + "num_tokens": 11538248.0, + "reward": 0.80999755859375, + "reward_std": 0.03607134893536568, + "rewards//mean": 0.80999755859375, + "rewards//std": 0.0379897803068161, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1996, + "grad_norm": 1.5180847644805908, + "kl": 0.2539266850799322, + "learning_rate": 9.123773836190989e-07, + "loss": 0.0254, + "num_tokens": 11549976.0, + "reward": 0.8385009765625, + "reward_std": 0.021763810887932777, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.025631241500377655, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1998, + "grad_norm": 1.4948091506958008, + "kl": 0.3010984696447849, + "learning_rate": 9.121978519934895e-07, + "loss": 0.0301, + "num_tokens": 11561512.0, + "reward": 0.828125, + "reward_std": 0.027346894145011902, + "rewards//mean": 0.828125, + "rewards//std": 0.0322868674993515, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2, + "grad_norm": 1.8128142356872559, + "kl": 0.35912125185132027, + "learning_rate": 9.120181543346346e-07, + "loss": 0.0359, + "num_tokens": 11573120.0, + "reward": 0.82733154296875, + "reward_std": 0.02956298179924488, + "rewards//mean": 0.82733154296875, + "rewards//std": 0.03194887563586235, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2002, + "grad_norm": 1.612549901008606, + "kl": 0.2567143440246582, + "learning_rate": 9.118382907149163e-07, + "loss": 0.0288, + "num_tokens": 11584625.0, + "reward": 0.856201171875, + "reward_std": 0.023928582668304443, + "rewards//mean": 0.856201171875, + "rewards//std": 0.02747078612446785, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2004, + "grad_norm": 1.489304542541504, + "kl": 0.279543848708272, + "learning_rate": 9.116582612067838e-07, + "loss": 0.028, + "num_tokens": 11596249.0, + "reward": 0.85235595703125, + "reward_std": 0.023730233311653137, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.028049286454916, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.2006, + "grad_norm": 1.6198344230651855, + "kl": 0.30059848353266716, + "learning_rate": 9.11478065882753e-07, + "loss": 0.0304, + "num_tokens": 11607926.0, + "reward": 0.8358154296875, + "reward_std": 0.03173686936497688, + "rewards//mean": 0.8358154296875, + "rewards//std": 0.04620348662137985, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2008, + "grad_norm": 1.4700788259506226, + "kl": 0.2810402326285839, + "learning_rate": 9.112977048154064e-07, + "loss": 0.0281, + "num_tokens": 11619510.0, + "reward": 0.8291015625, + "reward_std": 0.02051437646150589, + "rewards//mean": 0.8291015625, + "rewards//std": 0.03164280578494072, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.201, + "grad_norm": 1.374066948890686, + "kl": 0.31684623658657074, + "learning_rate": 9.111171780773936e-07, + "loss": 0.0317, + "num_tokens": 11630998.0, + "reward": 0.83709716796875, + "reward_std": 0.025070656090974808, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.03032708168029785, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2012, + "grad_norm": 1.528626561164856, + "kl": 0.3196235168725252, + "learning_rate": 9.109364857414305e-07, + "loss": 0.032, + "num_tokens": 11642590.0, + "reward": 0.83538818359375, + "reward_std": 0.02712974138557911, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.03505529463291168, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2014, + "grad_norm": 1.5497794151306152, + "kl": 0.30868658795952797, + "learning_rate": 9.107556278803002e-07, + "loss": 0.0309, + "num_tokens": 11654166.0, + "reward": 0.81640625, + "reward_std": 0.021374164149165154, + "rewards//mean": 0.81640625, + "rewards//std": 0.02515222132205963, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2016, + "grad_norm": 1.690397024154663, + "kl": 0.30724145844578743, + "learning_rate": 9.10574604566852e-07, + "loss": 0.0307, + "num_tokens": 11665758.0, + "reward": 0.8447265625, + "reward_std": 0.027198366820812225, + "rewards//mean": 0.8447265625, + "rewards//std": 0.03702525049448013, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2018, + "grad_norm": 1.5772255659103394, + "kl": 0.3423797469586134, + "learning_rate": 9.103934158740022e-07, + "loss": 0.0342, + "num_tokens": 11677406.0, + "reward": 0.8746337890625, + "reward_std": 0.02747311070561409, + "rewards//mean": 0.8746337890625, + "rewards//std": 0.031069036573171616, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.202, + "grad_norm": 1.5367565155029297, + "kl": 0.2931125331670046, + "learning_rate": 9.102120618747336e-07, + "loss": 0.0293, + "num_tokens": 11689022.0, + "reward": 0.8402099609375, + "reward_std": 0.017362983897328377, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.020783500745892525, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2022, + "grad_norm": 1.717934489250183, + "kl": 0.2809955831617117, + "learning_rate": 9.100305426420956e-07, + "loss": 0.0281, + "num_tokens": 11700686.0, + "reward": 0.8416748046875, + "reward_std": 0.025934161618351936, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.033225685358047485, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2024, + "grad_norm": 1.582865595817566, + "kl": 0.3195728827267885, + "learning_rate": 9.098488582492039e-07, + "loss": 0.032, + "num_tokens": 11712246.0, + "reward": 0.844970703125, + "reward_std": 0.020601551979780197, + "rewards//mean": 0.844970703125, + "rewards//std": 0.026918597519397736, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2026, + "grad_norm": 1.6010397672653198, + "kl": 0.3720878101885319, + "learning_rate": 9.096670087692411e-07, + "loss": 0.0372, + "num_tokens": 11723814.0, + "reward": 0.83837890625, + "reward_std": 0.02993692457675934, + "rewards//mean": 0.83837890625, + "rewards//std": 0.036060892045497894, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2028, + "grad_norm": 1.5655661821365356, + "kl": 0.30329372361302376, + "learning_rate": 9.094849942754563e-07, + "loss": 0.0303, + "num_tokens": 11735518.0, + "reward": 0.86907958984375, + "reward_std": 0.024044491350650787, + "rewards//mean": 0.86907958984375, + "rewards//std": 0.032807718962430954, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.203, + "grad_norm": 1.4775426387786865, + "kl": 0.34826354309916496, + "learning_rate": 9.093028148411648e-07, + "loss": 0.0348, + "num_tokens": 11747118.0, + "reward": 0.8277587890625, + "reward_std": 0.01870780624449253, + "rewards//mean": 0.8277587890625, + "rewards//std": 0.024692730978131294, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2032, + "grad_norm": 1.3856985569000244, + "kl": 0.30606524273753166, + "learning_rate": 9.091204705397483e-07, + "loss": 0.0306, + "num_tokens": 11758734.0, + "reward": 0.84405517578125, + "reward_std": 0.020666763186454773, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.035559069365262985, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2034, + "grad_norm": 1.345151662826538, + "kl": 0.3306903596967459, + "learning_rate": 9.089379614446553e-07, + "loss": 0.0331, + "num_tokens": 11770334.0, + "reward": 0.8563232421875, + "reward_std": 0.029861964285373688, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.037644583731889725, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2036, + "grad_norm": 1.684406042098999, + "kl": 0.3320971317589283, + "learning_rate": 9.087552876294002e-07, + "loss": 0.0332, + "num_tokens": 11781806.0, + "reward": 0.78387451171875, + "reward_std": 0.029272979125380516, + "rewards//mean": 0.78387451171875, + "rewards//std": 0.03378294035792351, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2038, + "grad_norm": 1.6135855913162231, + "kl": 0.36222201585769653, + "learning_rate": 9.085724491675642e-07, + "loss": 0.0362, + "num_tokens": 11793350.0, + "reward": 0.8441162109375, + "reward_std": 0.030591584742069244, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.03610263764858246, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.204, + "grad_norm": 1.71904718875885, + "kl": 0.3733372837305069, + "learning_rate": 9.083894461327945e-07, + "loss": 0.0373, + "num_tokens": 11804958.0, + "reward": 0.85345458984375, + "reward_std": 0.0318794921040535, + "rewards//mean": 0.85345458984375, + "rewards//std": 0.0389569029211998, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2042, + "grad_norm": 1.5594594478607178, + "kl": 0.32929180562496185, + "learning_rate": 9.082062785988048e-07, + "loss": 0.0329, + "num_tokens": 11816478.0, + "reward": 0.84326171875, + "reward_std": 0.02263738214969635, + "rewards//mean": 0.84326171875, + "rewards//std": 0.03898492082953453, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2044, + "grad_norm": 1.5124237537384033, + "kl": 0.3222947455942631, + "learning_rate": 9.080229466393749e-07, + "loss": 0.0322, + "num_tokens": 11828126.0, + "reward": 0.83343505859375, + "reward_std": 0.025497794151306152, + "rewards//mean": 0.83343505859375, + "rewards//std": 0.029234740883111954, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2046, + "grad_norm": 1.5079679489135742, + "kl": 0.28650621324777603, + "learning_rate": 9.078394503283508e-07, + "loss": 0.0287, + "num_tokens": 11839750.0, + "reward": 0.8197021484375, + "reward_std": 0.024179577827453613, + "rewards//mean": 0.8197021484375, + "rewards//std": 0.03075956553220749, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2048, + "grad_norm": 1.7093572616577148, + "kl": 0.3368173073977232, + "learning_rate": 9.076557897396451e-07, + "loss": 0.0337, + "num_tokens": 11851342.0, + "reward": 0.79461669921875, + "reward_std": 0.03101722151041031, + "rewards//mean": 0.79461669921875, + "rewards//std": 0.0439457893371582, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.205, + "grad_norm": 1.6796950101852417, + "kl": 0.3964366875588894, + "learning_rate": 9.074719649472357e-07, + "loss": 0.0396, + "num_tokens": 11862846.0, + "reward": 0.84075927734375, + "reward_std": 0.03357537463307381, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.03719977289438248, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2052, + "grad_norm": 1.7586326599121094, + "kl": 0.33642302826046944, + "learning_rate": 9.072879760251679e-07, + "loss": 0.0336, + "num_tokens": 11874462.0, + "reward": 0.83087158203125, + "reward_std": 0.02262897416949272, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.02515876479446888, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2054, + "grad_norm": 1.6266380548477173, + "kl": 0.34518929198384285, + "learning_rate": 9.071038230475519e-07, + "loss": 0.0345, + "num_tokens": 11885982.0, + "reward": 0.85565185546875, + "reward_std": 0.03768323361873627, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.04187211021780968, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2056, + "grad_norm": 1.7841829061508179, + "kl": 0.37719152867794037, + "learning_rate": 9.069195060885646e-07, + "loss": 0.0377, + "num_tokens": 11897534.0, + "reward": 0.84942626953125, + "reward_std": 0.034570906311273575, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.042603231966495514, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2058, + "grad_norm": 1.3329662084579468, + "kl": 0.337535435333848, + "learning_rate": 9.067350252224489e-07, + "loss": 0.0338, + "num_tokens": 11909166.0, + "reward": 0.847412109375, + "reward_std": 0.021379493176937103, + "rewards//mean": 0.847412109375, + "rewards//std": 0.029519246891140938, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.206, + "grad_norm": 1.5345518589019775, + "kl": 0.3282114341855049, + "learning_rate": 9.065503805235137e-07, + "loss": 0.0328, + "num_tokens": 11920678.0, + "reward": 0.856689453125, + "reward_std": 0.02740451693534851, + "rewards//mean": 0.856689453125, + "rewards//std": 0.035975996404886246, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2062, + "grad_norm": 1.6773545742034912, + "kl": 0.3718213103711605, + "learning_rate": 9.06365572066134e-07, + "loss": 0.0372, + "num_tokens": 11932302.0, + "reward": 0.79345703125, + "reward_std": 0.027466963976621628, + "rewards//mean": 0.79345703125, + "rewards//std": 0.03137762472033501, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2064, + "grad_norm": 1.6054900884628296, + "kl": 0.3731364421546459, + "learning_rate": 9.061805999247503e-07, + "loss": 0.0373, + "num_tokens": 11943982.0, + "reward": 0.81414794921875, + "reward_std": 0.028354836627840996, + "rewards//mean": 0.81414794921875, + "rewards//std": 0.03088204748928547, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2066, + "grad_norm": 1.577709674835205, + "kl": 0.3381994217634201, + "learning_rate": 9.059954641738697e-07, + "loss": 0.0338, + "num_tokens": 11955502.0, + "reward": 0.83575439453125, + "reward_std": 0.01910077966749668, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.02491449937224388, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2068, + "grad_norm": 1.5768101215362549, + "kl": 0.38721945881843567, + "learning_rate": 9.058101648880645e-07, + "loss": 0.0387, + "num_tokens": 11967142.0, + "reward": 0.79974365234375, + "reward_std": 0.024452224373817444, + "rewards//mean": 0.79974365234375, + "rewards//std": 0.02833333984017372, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.207, + "grad_norm": 1.5980600118637085, + "kl": 0.3104675244539976, + "learning_rate": 9.056247021419734e-07, + "loss": 0.031, + "num_tokens": 11978782.0, + "reward": 0.82598876953125, + "reward_std": 0.026953451335430145, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.028687484562397003, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2072, + "grad_norm": 1.6831042766571045, + "kl": 0.31440913677215576, + "learning_rate": 9.054390760103009e-07, + "loss": 0.0314, + "num_tokens": 11990430.0, + "reward": 0.774169921875, + "reward_std": 0.022085539996623993, + "rewards//mean": 0.774169921875, + "rewards//std": 0.0340530090034008, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2074, + "grad_norm": 1.7500250339508057, + "kl": 0.3853425681591034, + "learning_rate": 9.052532865678171e-07, + "loss": 0.0385, + "num_tokens": 12002054.0, + "reward": 0.8543701171875, + "reward_std": 0.02738739550113678, + "rewards//mean": 0.8543701171875, + "rewards//std": 0.03118380904197693, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2076, + "grad_norm": 1.5628777742385864, + "kl": 0.40412846952676773, + "learning_rate": 9.050673338893577e-07, + "loss": 0.0404, + "num_tokens": 12013694.0, + "reward": 0.85003662109375, + "reward_std": 0.024518586695194244, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.030501779168844223, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2078, + "grad_norm": 1.7481739521026611, + "kl": 0.3863633908331394, + "learning_rate": 9.04881218049825e-07, + "loss": 0.0386, + "num_tokens": 12025286.0, + "reward": 0.8502197265625, + "reward_std": 0.03388494253158569, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.04208437353372574, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.208, + "grad_norm": 1.5094789266586304, + "kl": 0.37182357534766197, + "learning_rate": 9.046949391241858e-07, + "loss": 0.0372, + "num_tokens": 12036878.0, + "reward": 0.8358154296875, + "reward_std": 0.02580290473997593, + "rewards//mean": 0.8358154296875, + "rewards//std": 0.029023557901382446, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2082, + "grad_norm": 1.5412821769714355, + "kl": 0.34309408254921436, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0343, + "num_tokens": 12048510.0, + "reward": 0.8470458984375, + "reward_std": 0.024066146463155746, + "rewards//mean": 0.8470458984375, + "rewards//std": 0.027127820998430252, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2084, + "grad_norm": 1.5631624460220337, + "kl": 0.2986176647245884, + "learning_rate": 9.043218923147873e-07, + "loss": 0.0299, + "num_tokens": 12060094.0, + "reward": 0.8438720703125, + "reward_std": 0.030151329934597015, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.0322866328060627, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2086, + "grad_norm": 1.467309832572937, + "kl": 0.3631850332021713, + "learning_rate": 9.04135124581291e-07, + "loss": 0.0363, + "num_tokens": 12071742.0, + "reward": 0.82421875, + "reward_std": 0.020023783668875694, + "rewards//mean": 0.82421875, + "rewards//std": 0.02492976002395153, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2088, + "grad_norm": 1.3802696466445923, + "kl": 0.3492240961641073, + "learning_rate": 9.039481940622146e-07, + "loss": 0.0349, + "num_tokens": 12083262.0, + "reward": 0.8175048828125, + "reward_std": 0.01870383694767952, + "rewards//mean": 0.8175048828125, + "rewards//std": 0.03702831640839577, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.209, + "grad_norm": 1.3892215490341187, + "kl": 0.3820292018353939, + "learning_rate": 9.037611008328543e-07, + "loss": 0.0382, + "num_tokens": 12094846.0, + "reward": 0.85003662109375, + "reward_std": 0.020683448761701584, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.02716706320643425, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2092, + "grad_norm": 1.568703532218933, + "kl": 0.3684989772737026, + "learning_rate": 9.035738449685706e-07, + "loss": 0.0368, + "num_tokens": 12106478.0, + "reward": 0.8067626953125, + "reward_std": 0.02328130602836609, + "rewards//mean": 0.8067626953125, + "rewards//std": 0.03885715454816818, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2094, + "grad_norm": 1.49580979347229, + "kl": 0.29156962409615517, + "learning_rate": 9.033864265447906e-07, + "loss": 0.0292, + "num_tokens": 12118054.0, + "reward": 0.83135986328125, + "reward_std": 0.0192482341080904, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.022332094609737396, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2096, + "grad_norm": 1.7002832889556885, + "kl": 0.37871207296848297, + "learning_rate": 9.031988456370061e-07, + "loss": 0.0379, + "num_tokens": 12129606.0, + "reward": 0.84954833984375, + "reward_std": 0.02176578715443611, + "rewards//mean": 0.84954833984375, + "rewards//std": 0.030710499733686447, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2098, + "grad_norm": 1.519966721534729, + "kl": 0.3278946727514267, + "learning_rate": 9.030111023207749e-07, + "loss": 0.0328, + "num_tokens": 12141262.0, + "reward": 0.83544921875, + "reward_std": 0.030106481164693832, + "rewards//mean": 0.83544921875, + "rewards//std": 0.03302484378218651, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.21, + "grad_norm": 2.5205934047698975, + "kl": 0.4141816198825836, + "learning_rate": 9.028231966717198e-07, + "loss": 0.0414, + "num_tokens": 12152854.0, + "reward": 0.84765625, + "reward_std": 0.022577233612537384, + "rewards//mean": 0.84765625, + "rewards//std": 0.028279928490519524, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2102, + "grad_norm": 1.4443079233169556, + "kl": 0.3525441400706768, + "learning_rate": 9.026351287655293e-07, + "loss": 0.0353, + "num_tokens": 12164446.0, + "reward": 0.81805419921875, + "reward_std": 0.023032667115330696, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.02640927955508232, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2104, + "grad_norm": 1.6081056594848633, + "kl": 0.3987560346722603, + "learning_rate": 9.02446898677957e-07, + "loss": 0.0399, + "num_tokens": 12176038.0, + "reward": 0.78369140625, + "reward_std": 0.019772734493017197, + "rewards//mean": 0.78369140625, + "rewards//std": 0.02153645269572735, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2106, + "grad_norm": 1.8569371700286865, + "kl": 0.3877057619392872, + "learning_rate": 9.02258506484822e-07, + "loss": 0.0388, + "num_tokens": 12187590.0, + "reward": 0.84783935546875, + "reward_std": 0.03131500631570816, + "rewards//mean": 0.84783935546875, + "rewards//std": 0.03611411154270172, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2108, + "grad_norm": 1.8072841167449951, + "kl": 0.35560574382543564, + "learning_rate": 9.02069952262009e-07, + "loss": 0.0356, + "num_tokens": 12199158.0, + "reward": 0.8228759765625, + "reward_std": 0.025217998772859573, + "rewards//mean": 0.8228759765625, + "rewards//std": 0.031720928847789764, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.211, + "grad_norm": 1.5286349058151245, + "kl": 0.32454751059412956, + "learning_rate": 9.018812360854671e-07, + "loss": 0.0325, + "num_tokens": 12210734.0, + "reward": 0.8134765625, + "reward_std": 0.024643635377287865, + "rewards//mean": 0.8134765625, + "rewards//std": 0.0267934650182724, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2112, + "grad_norm": 1.5693293809890747, + "kl": 0.30025129579007626, + "learning_rate": 9.016923580312113e-07, + "loss": 0.03, + "num_tokens": 12222326.0, + "reward": 0.84222412109375, + "reward_std": 0.02287779003381729, + "rewards//mean": 0.84222412109375, + "rewards//std": 0.03199763968586922, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2114, + "grad_norm": 1.586246132850647, + "kl": 0.37500810995697975, + "learning_rate": 9.015033181753218e-07, + "loss": 0.0375, + "num_tokens": 12233814.0, + "reward": 0.84649658203125, + "reward_std": 0.03556481748819351, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.05128045752644539, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2116, + "grad_norm": 1.5428736209869385, + "kl": 0.4442654848098755, + "learning_rate": 9.013141165939438e-07, + "loss": 0.0444, + "num_tokens": 12245358.0, + "reward": 0.8228759765625, + "reward_std": 0.023100562393665314, + "rewards//mean": 0.8228759765625, + "rewards//std": 0.027798086404800415, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2118, + "grad_norm": 1.7395540475845337, + "kl": 0.41903864592313766, + "learning_rate": 9.011247533632875e-07, + "loss": 0.0419, + "num_tokens": 12256854.0, + "reward": 0.80694580078125, + "reward_std": 0.017781613394618034, + "rewards//mean": 0.80694580078125, + "rewards//std": 0.024683762341737747, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.212, + "grad_norm": 1.5586698055267334, + "kl": 0.30672359094023705, + "learning_rate": 9.009352285596285e-07, + "loss": 0.0307, + "num_tokens": 12268414.0, + "reward": 0.819580078125, + "reward_std": 0.020915091037750244, + "rewards//mean": 0.819580078125, + "rewards//std": 0.033988937735557556, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2122, + "grad_norm": 1.8905858993530273, + "kl": 0.3783415947109461, + "learning_rate": 9.007455422593075e-07, + "loss": 0.0378, + "num_tokens": 12280006.0, + "reward": 0.8385009765625, + "reward_std": 0.016644559800624847, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.018443400040268898, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2124, + "grad_norm": 1.6622267961502075, + "kl": 0.43458464927971363, + "learning_rate": 9.0055569453873e-07, + "loss": 0.0435, + "num_tokens": 12291518.0, + "reward": 0.8544921875, + "reward_std": 0.02952507510781288, + "rewards//mean": 0.8544921875, + "rewards//std": 0.038694947957992554, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2126, + "grad_norm": 1.625455379486084, + "kl": 0.44397810474038124, + "learning_rate": 9.003656854743666e-07, + "loss": 0.0444, + "num_tokens": 12303102.0, + "reward": 0.79534912109375, + "reward_std": 0.027360528707504272, + "rewards//mean": 0.79534912109375, + "rewards//std": 0.033808473497629166, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2128, + "grad_norm": 1.551076889038086, + "kl": 0.3549692705273628, + "learning_rate": 9.00175515142753e-07, + "loss": 0.0355, + "num_tokens": 12314686.0, + "reward": 0.837646484375, + "reward_std": 0.024549121037125587, + "rewards//mean": 0.837646484375, + "rewards//std": 0.02589126117527485, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.213, + "grad_norm": 1.7458794116973877, + "kl": 0.4968990832567215, + "learning_rate": 8.9998518362049e-07, + "loss": 0.0497, + "num_tokens": 12326342.0, + "reward": 0.827880859375, + "reward_std": 0.02495371550321579, + "rewards//mean": 0.827880859375, + "rewards//std": 0.029991356655955315, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2132, + "grad_norm": 1.4401700496673584, + "kl": 0.37138262391090393, + "learning_rate": 8.997946909842424e-07, + "loss": 0.0371, + "num_tokens": 12337966.0, + "reward": 0.85467529296875, + "reward_std": 0.023370331153273582, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.02580040507018566, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2134, + "grad_norm": 1.8219239711761475, + "kl": 0.5037078987807035, + "learning_rate": 8.996040373107414e-07, + "loss": 0.0504, + "num_tokens": 12349686.0, + "reward": 0.75970458984375, + "reward_std": 0.02100123092532158, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.026143701747059822, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2136, + "grad_norm": 1.7383723258972168, + "kl": 0.4188809674233198, + "learning_rate": 8.994132226767819e-07, + "loss": 0.0419, + "num_tokens": 12361294.0, + "reward": 0.83770751953125, + "reward_std": 0.025862548500299454, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.03142721578478813, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2138, + "grad_norm": 1.6443442106246948, + "kl": 0.45394837483763695, + "learning_rate": 8.992222471592239e-07, + "loss": 0.0454, + "num_tokens": 12372814.0, + "reward": 0.84356689453125, + "reward_std": 0.023350508883595467, + "rewards//mean": 0.84356689453125, + "rewards//std": 0.02814626134932041, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.214, + "grad_norm": 1.7132126092910767, + "kl": 0.36163637787103653, + "learning_rate": 8.990311108349926e-07, + "loss": 0.0362, + "num_tokens": 12384462.0, + "reward": 0.84063720703125, + "reward_std": 0.032970838248729706, + "rewards//mean": 0.84063720703125, + "rewards//std": 0.051678016781806946, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2142, + "grad_norm": 1.7438604831695557, + "kl": 0.43554987013339996, + "learning_rate": 8.988398137810776e-07, + "loss": 0.0436, + "num_tokens": 12396110.0, + "reward": 0.79107666015625, + "reward_std": 0.016835883259773254, + "rewards//mean": 0.79107666015625, + "rewards//std": 0.01846667379140854, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2144, + "grad_norm": 1.70172119140625, + "kl": 0.4746701791882515, + "learning_rate": 8.986483560745333e-07, + "loss": 0.0475, + "num_tokens": 12407694.0, + "reward": 0.809814453125, + "reward_std": 0.016994569450616837, + "rewards//mean": 0.809814453125, + "rewards//std": 0.01972099021077156, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2146, + "grad_norm": 1.5826671123504639, + "kl": 0.45854538306593895, + "learning_rate": 8.984567377924789e-07, + "loss": 0.0459, + "num_tokens": 12419294.0, + "reward": 0.79266357421875, + "reward_std": 0.019004901871085167, + "rewards//mean": 0.79266357421875, + "rewards//std": 0.02256675250828266, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2148, + "grad_norm": 1.6033213138580322, + "kl": 0.4664851911365986, + "learning_rate": 8.982649590120981e-07, + "loss": 0.0466, + "num_tokens": 12430782.0, + "reward": 0.8348388671875, + "reward_std": 0.02897431142628193, + "rewards//mean": 0.8348388671875, + "rewards//std": 0.0435953214764595, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.215, + "grad_norm": 1.8995875120162964, + "kl": 0.4676240123808384, + "learning_rate": 8.980730198106394e-07, + "loss": 0.0468, + "num_tokens": 12442430.0, + "reward": 0.8486328125, + "reward_std": 0.030733756721019745, + "rewards//mean": 0.8486328125, + "rewards//std": 0.038594670593738556, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2152, + "grad_norm": 1.9373269081115723, + "kl": 0.44492923468351364, + "learning_rate": 8.97880920265416e-07, + "loss": 0.0445, + "num_tokens": 12454166.0, + "reward": 0.8443603515625, + "reward_std": 0.03373778983950615, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.040200553834438324, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2154, + "grad_norm": 1.7610121965408325, + "kl": 0.4320419989526272, + "learning_rate": 8.976886604538055e-07, + "loss": 0.0432, + "num_tokens": 12465774.0, + "reward": 0.850341796875, + "reward_std": 0.023793164640665054, + "rewards//mean": 0.850341796875, + "rewards//std": 0.0317106656730175, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2156, + "grad_norm": 1.8502085208892822, + "kl": 0.44357960671186447, + "learning_rate": 8.974962404532501e-07, + "loss": 0.0444, + "num_tokens": 12477390.0, + "reward": 0.82879638671875, + "reward_std": 0.023879654705524445, + "rewards//mean": 0.82879638671875, + "rewards//std": 0.0287686325609684, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2158, + "grad_norm": 1.6597334146499634, + "kl": 0.4501512087881565, + "learning_rate": 8.973036603412566e-07, + "loss": 0.045, + "num_tokens": 12488918.0, + "reward": 0.81488037109375, + "reward_std": 0.01935569941997528, + "rewards//mean": 0.81488037109375, + "rewards//std": 0.02623271942138672, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.216, + "grad_norm": 1.5563840866088867, + "kl": 0.46604669094085693, + "learning_rate": 8.971109201953962e-07, + "loss": 0.0466, + "num_tokens": 12500438.0, + "reward": 0.8419189453125, + "reward_std": 0.0194279495626688, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.028856175020337105, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2162, + "grad_norm": 1.712743878364563, + "kl": 0.43832290172576904, + "learning_rate": 8.969180200933047e-07, + "loss": 0.0438, + "num_tokens": 12511958.0, + "reward": 0.82196044921875, + "reward_std": 0.026609115302562714, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.029312824830412865, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2164, + "grad_norm": 1.6794413328170776, + "kl": 0.4114188626408577, + "learning_rate": 8.967249601126821e-07, + "loss": 0.0411, + "num_tokens": 12523462.0, + "reward": 0.8507080078125, + "reward_std": 0.03278736025094986, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.04879137501120567, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2166, + "grad_norm": 1.7458651065826416, + "kl": 0.4774220138788223, + "learning_rate": 8.96531740331293e-07, + "loss": 0.0477, + "num_tokens": 12535038.0, + "reward": 0.81842041015625, + "reward_std": 0.02626705914735794, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.03074203059077263, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2168, + "grad_norm": 1.591037392616272, + "kl": 0.49595411121845245, + "learning_rate": 8.963383608269663e-07, + "loss": 0.0496, + "num_tokens": 12546654.0, + "reward": 0.841796875, + "reward_std": 0.021415051072835922, + "rewards//mean": 0.841796875, + "rewards//std": 0.02933921478688717, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.217, + "grad_norm": 1.6527636051177979, + "kl": 0.4558016322553158, + "learning_rate": 8.961448216775953e-07, + "loss": 0.0456, + "num_tokens": 12558262.0, + "reward": 0.863525390625, + "reward_std": 0.02362733706831932, + "rewards//mean": 0.863525390625, + "rewards//std": 0.0312335267663002, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2172, + "grad_norm": 1.9145891666412354, + "kl": 0.43068813905119896, + "learning_rate": 8.959511229611375e-07, + "loss": 0.0431, + "num_tokens": 12569878.0, + "reward": 0.85528564453125, + "reward_std": 0.026773972436785698, + "rewards//mean": 0.85528564453125, + "rewards//std": 0.029720032587647438, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2174, + "grad_norm": 1.4842029809951782, + "kl": 0.3779629357159138, + "learning_rate": 8.957572647556147e-07, + "loss": 0.0378, + "num_tokens": 12581358.0, + "reward": 0.87689208984375, + "reward_std": 0.018777603283524513, + "rewards//mean": 0.87689208984375, + "rewards//std": 0.027480633929371834, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2176, + "grad_norm": 1.6655865907669067, + "kl": 0.4234855752438307, + "learning_rate": 8.95563247139113e-07, + "loss": 0.0423, + "num_tokens": 12592982.0, + "reward": 0.83819580078125, + "reward_std": 0.023835228756070137, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.03522717207670212, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2178, + "grad_norm": 2.0577099323272705, + "kl": 0.5612453520298004, + "learning_rate": 8.953690701897827e-07, + "loss": 0.0561, + "num_tokens": 12604502.0, + "reward": 0.82537841796875, + "reward_std": 0.023172147572040558, + "rewards//mean": 0.82537841796875, + "rewards//std": 0.024682536721229553, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.218, + "grad_norm": 1.6717076301574707, + "kl": 0.4488404430449009, + "learning_rate": 8.951747339858382e-07, + "loss": 0.0449, + "num_tokens": 12616038.0, + "reward": 0.8287353515625, + "reward_std": 0.016566406935453415, + "rewards//mean": 0.8287353515625, + "rewards//std": 0.022490572184324265, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2182, + "grad_norm": 1.702222228050232, + "kl": 0.45906032994389534, + "learning_rate": 8.94980238605558e-07, + "loss": 0.0459, + "num_tokens": 12627574.0, + "reward": 0.8602294921875, + "reward_std": 0.018478523939847946, + "rewards//mean": 0.8602294921875, + "rewards//std": 0.02520482800900936, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2184, + "grad_norm": 1.8298858404159546, + "kl": 0.5175199918448925, + "learning_rate": 8.947855841272851e-07, + "loss": 0.0518, + "num_tokens": 12639102.0, + "reward": 0.82171630859375, + "reward_std": 0.028728008270263672, + "rewards//mean": 0.82171630859375, + "rewards//std": 0.03689002990722656, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2186, + "grad_norm": 1.7405717372894287, + "kl": 0.4724346101284027, + "learning_rate": 8.94590770629426e-07, + "loss": 0.0472, + "num_tokens": 12650718.0, + "reward": 0.85223388671875, + "reward_std": 0.027512799948453903, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.03861697018146515, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2188, + "grad_norm": 1.7419601678848267, + "kl": 0.4837800897657871, + "learning_rate": 8.943957981904517e-07, + "loss": 0.0484, + "num_tokens": 12662374.0, + "reward": 0.849609375, + "reward_std": 0.025210082530975342, + "rewards//mean": 0.849609375, + "rewards//std": 0.033624451607465744, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.219, + "grad_norm": 2.0632405281066895, + "kl": 0.47850683331489563, + "learning_rate": 8.942006668888971e-07, + "loss": 0.0479, + "num_tokens": 12673838.0, + "reward": 0.80181884765625, + "reward_std": 0.028397826477885246, + "rewards//mean": 0.80181884765625, + "rewards//std": 0.03943425789475441, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2192, + "grad_norm": 2.369358777999878, + "kl": 0.5055650155991316, + "learning_rate": 8.940053768033608e-07, + "loss": 0.0506, + "num_tokens": 12685462.0, + "reward": 0.77520751953125, + "reward_std": 0.020045869052410126, + "rewards//mean": 0.77520751953125, + "rewards//std": 0.02218383364379406, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2194, + "grad_norm": 1.7325767278671265, + "kl": 0.46853526681661606, + "learning_rate": 8.938099280125062e-07, + "loss": 0.0469, + "num_tokens": 12697134.0, + "reward": 0.82147216796875, + "reward_std": 0.025625690817832947, + "rewards//mean": 0.82147216796875, + "rewards//std": 0.030935920774936676, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2196, + "grad_norm": 2.0512795448303223, + "kl": 0.591860506683588, + "learning_rate": 8.936143205950595e-07, + "loss": 0.0592, + "num_tokens": 12708622.0, + "reward": 0.81854248046875, + "reward_std": 0.022866319864988327, + "rewards//mean": 0.81854248046875, + "rewards//std": 0.02519003488123417, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2198, + "grad_norm": 1.9037423133850098, + "kl": 0.5342971533536911, + "learning_rate": 8.934185546298115e-07, + "loss": 0.0534, + "num_tokens": 12720142.0, + "reward": 0.822998046875, + "reward_std": 0.02418438531458378, + "rewards//mean": 0.822998046875, + "rewards//std": 0.034916892647743225, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.22, + "grad_norm": 1.7946803569793701, + "kl": 0.5492523424327374, + "learning_rate": 8.932226301956169e-07, + "loss": 0.0549, + "num_tokens": 12731750.0, + "reward": 0.8360595703125, + "reward_std": 0.03264481574296951, + "rewards//mean": 0.8360595703125, + "rewards//std": 0.03440745919942856, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2202, + "grad_norm": 1.7966572046279907, + "kl": 0.606100283563137, + "learning_rate": 8.930265473713937e-07, + "loss": 0.0606, + "num_tokens": 12743342.0, + "reward": 0.84027099609375, + "reward_std": 0.01869259960949421, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.025497639551758766, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2204, + "grad_norm": 2.0167477130889893, + "kl": 0.4550056532025337, + "learning_rate": 8.928303062361243e-07, + "loss": 0.0455, + "num_tokens": 12754934.0, + "reward": 0.86871337890625, + "reward_std": 0.03328916057944298, + "rewards//mean": 0.86871337890625, + "rewards//std": 0.04071776568889618, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2206, + "grad_norm": 1.4851661920547485, + "kl": 0.3844361323863268, + "learning_rate": 8.926339068688545e-07, + "loss": 0.0384, + "num_tokens": 12766510.0, + "reward": 0.82586669921875, + "reward_std": 0.016555577516555786, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.025111790746450424, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2208, + "grad_norm": 1.8230928182601929, + "kl": 0.4188150726258755, + "learning_rate": 8.924373493486941e-07, + "loss": 0.0419, + "num_tokens": 12778094.0, + "reward": 0.79150390625, + "reward_std": 0.03337853401899338, + "rewards//mean": 0.79150390625, + "rewards//std": 0.04760560393333435, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.221, + "grad_norm": 1.9848703145980835, + "kl": 0.5017548762261868, + "learning_rate": 8.922406337548161e-07, + "loss": 0.0502, + "num_tokens": 12789638.0, + "reward": 0.8511962890625, + "reward_std": 0.03427255526185036, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.04008893668651581, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2212, + "grad_norm": 1.7486228942871094, + "kl": 0.4631642699241638, + "learning_rate": 8.920437601664579e-07, + "loss": 0.0463, + "num_tokens": 12801238.0, + "reward": 0.8192138671875, + "reward_std": 0.022890888154506683, + "rewards//mean": 0.8192138671875, + "rewards//std": 0.029094405472278595, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2214, + "grad_norm": 2.0292675495147705, + "kl": 0.5450425893068314, + "learning_rate": 8.918467286629198e-07, + "loss": 0.0545, + "num_tokens": 12812790.0, + "reward": 0.848876953125, + "reward_std": 0.027213435620069504, + "rewards//mean": 0.848876953125, + "rewards//std": 0.03410276025533676, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2216, + "grad_norm": 1.8587936162948608, + "kl": 0.4826918840408325, + "learning_rate": 8.916495393235665e-07, + "loss": 0.0483, + "num_tokens": 12824254.0, + "reward": 0.78131103515625, + "reward_std": 0.012404240667819977, + "rewards//mean": 0.78131103515625, + "rewards//std": 0.016374217346310616, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2218, + "grad_norm": 1.6395573616027832, + "kl": 0.4285610653460026, + "learning_rate": 8.914521922278255e-07, + "loss": 0.0429, + "num_tokens": 12835822.0, + "reward": 0.805419921875, + "reward_std": 0.026328619569540024, + "rewards//mean": 0.805419921875, + "rewards//std": 0.046108540147542953, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.222, + "grad_norm": 1.6988606452941895, + "kl": 0.4514422044157982, + "learning_rate": 8.912546874551882e-07, + "loss": 0.0451, + "num_tokens": 12847430.0, + "reward": 0.8192138671875, + "reward_std": 0.020680909976363182, + "rewards//mean": 0.8192138671875, + "rewards//std": 0.025021584704518318, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2222, + "grad_norm": 1.6668484210968018, + "kl": 0.4875780865550041, + "learning_rate": 8.910570250852096e-07, + "loss": 0.0488, + "num_tokens": 12858990.0, + "reward": 0.8140869140625, + "reward_std": 0.024537358433008194, + "rewards//mean": 0.8140869140625, + "rewards//std": 0.028303204104304314, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2224, + "grad_norm": 1.6134357452392578, + "kl": 0.3682728596031666, + "learning_rate": 8.908592051975081e-07, + "loss": 0.0368, + "num_tokens": 12870606.0, + "reward": 0.84661865234375, + "reward_std": 0.022064659744501114, + "rewards//mean": 0.84661865234375, + "rewards//std": 0.036346837878227234, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2226, + "grad_norm": 1.6185276508331299, + "kl": 0.5388368144631386, + "learning_rate": 8.906612278717655e-07, + "loss": 0.0539, + "num_tokens": 12882158.0, + "reward": 0.8057861328125, + "reward_std": 0.018660185858607292, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.020439790561795235, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2228, + "grad_norm": 1.748561978340149, + "kl": 0.48594076931476593, + "learning_rate": 8.90463093187727e-07, + "loss": 0.0486, + "num_tokens": 12893694.0, + "reward": 0.80755615234375, + "reward_std": 0.0244353748857975, + "rewards//mean": 0.80755615234375, + "rewards//std": 0.025517817586660385, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.223, + "grad_norm": 1.6020493507385254, + "kl": 0.44102535024285316, + "learning_rate": 8.902648012252012e-07, + "loss": 0.0441, + "num_tokens": 12905294.0, + "reward": 0.83782958984375, + "reward_std": 0.014234259724617004, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.017133191227912903, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2232, + "grad_norm": 1.5825769901275635, + "kl": 0.45053045079112053, + "learning_rate": 8.900663520640603e-07, + "loss": 0.0451, + "num_tokens": 12916846.0, + "reward": 0.84661865234375, + "reward_std": 0.03633493557572365, + "rewards//mean": 0.84661865234375, + "rewards//std": 0.04123569279909134, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2234, + "grad_norm": 1.811830997467041, + "kl": 0.560225110501051, + "learning_rate": 8.898677457842394e-07, + "loss": 0.056, + "num_tokens": 12928406.0, + "reward": 0.843994140625, + "reward_std": 0.01658232882618904, + "rewards//mean": 0.843994140625, + "rewards//std": 0.021585598587989807, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2236, + "grad_norm": 2.006256580352783, + "kl": 0.5062027648091316, + "learning_rate": 8.896689824657371e-07, + "loss": 0.0506, + "num_tokens": 12939902.0, + "reward": 0.82269287109375, + "reward_std": 0.024794355034828186, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.029037311673164368, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2238, + "grad_norm": 1.8893800973892212, + "kl": 0.5069471411406994, + "learning_rate": 8.894700621886152e-07, + "loss": 0.0507, + "num_tokens": 12951502.0, + "reward": 0.80218505859375, + "reward_std": 0.015398219227790833, + "rewards//mean": 0.80218505859375, + "rewards//std": 0.0205213725566864, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.224, + "grad_norm": 1.6032190322875977, + "kl": 0.35301624424755573, + "learning_rate": 8.892709850329989e-07, + "loss": 0.0353, + "num_tokens": 12963030.0, + "reward": 0.84246826171875, + "reward_std": 0.024849925190210342, + "rewards//mean": 0.84246826171875, + "rewards//std": 0.03457659110426903, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2242, + "grad_norm": 1.7753796577453613, + "kl": 0.479705024510622, + "learning_rate": 8.890717510790762e-07, + "loss": 0.048, + "num_tokens": 12974582.0, + "reward": 0.783447265625, + "reward_std": 0.014403380453586578, + "rewards//mean": 0.783447265625, + "rewards//std": 0.019989358261227608, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2244, + "grad_norm": 1.8169294595718384, + "kl": 0.3892389629036188, + "learning_rate": 8.888723604070989e-07, + "loss": 0.0389, + "num_tokens": 12986158.0, + "reward": 0.8404541015625, + "reward_std": 0.022764001041650772, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.026247501373291016, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2246, + "grad_norm": 1.9298598766326904, + "kl": 0.43493365682661533, + "learning_rate": 8.886728130973813e-07, + "loss": 0.0435, + "num_tokens": 12997678.0, + "reward": 0.8538818359375, + "reward_std": 0.031498104333877563, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.0390421487390995, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2248, + "grad_norm": 2.130866289138794, + "kl": 0.4265870861709118, + "learning_rate": 8.884731092303011e-07, + "loss": 0.0427, + "num_tokens": 13009358.0, + "reward": 0.87017822265625, + "reward_std": 0.028217127546668053, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.03341303765773773, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.225, + "grad_norm": 2.1394131183624268, + "kl": 0.5736663118004799, + "learning_rate": 8.882732488862987e-07, + "loss": 0.0574, + "num_tokens": 13020942.0, + "reward": 0.8271484375, + "reward_std": 0.021237272769212723, + "rewards//mean": 0.8271484375, + "rewards//std": 0.02909882739186287, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2252, + "grad_norm": 1.766954779624939, + "kl": 0.4534395709633827, + "learning_rate": 8.880732321458784e-07, + "loss": 0.0453, + "num_tokens": 13032454.0, + "reward": 0.79833984375, + "reward_std": 0.017106231302022934, + "rewards//mean": 0.79833984375, + "rewards//std": 0.03410009667277336, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2254, + "grad_norm": 1.6560237407684326, + "kl": 0.48381608724594116, + "learning_rate": 8.878730590896065e-07, + "loss": 0.0484, + "num_tokens": 13044086.0, + "reward": 0.8447265625, + "reward_std": 0.026898950338363647, + "rewards//mean": 0.8447265625, + "rewards//std": 0.03945119306445122, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2256, + "grad_norm": 1.837586760520935, + "kl": 0.42162058129906654, + "learning_rate": 8.876727297981127e-07, + "loss": 0.0422, + "num_tokens": 13055598.0, + "reward": 0.85931396484375, + "reward_std": 0.023832857608795166, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.03706769272685051, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2258, + "grad_norm": 1.765622615814209, + "kl": 0.6093264296650887, + "learning_rate": 8.874722443520898e-07, + "loss": 0.0609, + "num_tokens": 13067174.0, + "reward": 0.8104248046875, + "reward_std": 0.02622900903224945, + "rewards//mean": 0.8104248046875, + "rewards//std": 0.035679180175065994, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.226, + "grad_norm": 1.9755301475524902, + "kl": 0.584816113114357, + "learning_rate": 8.872716028322931e-07, + "loss": 0.0585, + "num_tokens": 13078838.0, + "reward": 0.79498291015625, + "reward_std": 0.023835178464651108, + "rewards//mean": 0.79498291015625, + "rewards//std": 0.028569569811224937, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2262, + "grad_norm": 2.0668110847473145, + "kl": 0.5126761607825756, + "learning_rate": 8.870708053195413e-07, + "loss": 0.0513, + "num_tokens": 13090414.0, + "reward": 0.82220458984375, + "reward_std": 0.017578698694705963, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.02312532067298889, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2264, + "grad_norm": 1.9796454906463623, + "kl": 0.5130621790885925, + "learning_rate": 8.868698518947151e-07, + "loss": 0.0513, + "num_tokens": 13101942.0, + "reward": 0.8297119140625, + "reward_std": 0.02258753404021263, + "rewards//mean": 0.8297119140625, + "rewards//std": 0.029038159176707268, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2266, + "grad_norm": 1.6562920808792114, + "kl": 0.45547511242330074, + "learning_rate": 8.866687426387591e-07, + "loss": 0.0455, + "num_tokens": 13113502.0, + "reward": 0.8426513671875, + "reward_std": 0.021378610283136368, + "rewards//mean": 0.8426513671875, + "rewards//std": 0.027572816237807274, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2268, + "grad_norm": 2.617424488067627, + "kl": 0.48680391162633896, + "learning_rate": 8.864674776326797e-07, + "loss": 0.0487, + "num_tokens": 13125094.0, + "reward": 0.8328857421875, + "reward_std": 0.026014059782028198, + "rewards//mean": 0.8328857421875, + "rewards//std": 0.03182193636894226, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.227, + "grad_norm": 1.5617382526397705, + "kl": 0.4363003596663475, + "learning_rate": 8.862660569575464e-07, + "loss": 0.0436, + "num_tokens": 13136718.0, + "reward": 0.858642578125, + "reward_std": 0.028137363493442535, + "rewards//mean": 0.858642578125, + "rewards//std": 0.03824083134531975, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2272, + "grad_norm": 1.832319736480713, + "kl": 0.5111656077206135, + "learning_rate": 8.860644806944917e-07, + "loss": 0.0511, + "num_tokens": 13148278.0, + "reward": 0.84161376953125, + "reward_std": 0.020673556253314018, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.028279326856136322, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2274, + "grad_norm": 1.622735619544983, + "kl": 0.5068413577973843, + "learning_rate": 8.858627489247104e-07, + "loss": 0.0507, + "num_tokens": 13159814.0, + "reward": 0.85546875, + "reward_std": 0.031534429639577866, + "rewards//mean": 0.85546875, + "rewards//std": 0.037311967462301254, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2276, + "grad_norm": 1.854464054107666, + "kl": 0.528043732047081, + "learning_rate": 8.856608617294599e-07, + "loss": 0.0528, + "num_tokens": 13171470.0, + "reward": 0.81256103515625, + "reward_std": 0.02018117532134056, + "rewards//mean": 0.81256103515625, + "rewards//std": 0.026213092729449272, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2278, + "grad_norm": 1.7685812711715698, + "kl": 0.646508228033781, + "learning_rate": 8.854588191900604e-07, + "loss": 0.0647, + "num_tokens": 13183062.0, + "reward": 0.86181640625, + "reward_std": 0.03179919347167015, + "rewards//mean": 0.86181640625, + "rewards//std": 0.033185794949531555, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.228, + "grad_norm": 1.8012447357177734, + "kl": 0.5501644909381866, + "learning_rate": 8.852566213878946e-07, + "loss": 0.055, + "num_tokens": 13194590.0, + "reward": 0.85296630859375, + "reward_std": 0.02474537119269371, + "rewards//mean": 0.85296630859375, + "rewards//std": 0.03410625830292702, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2282, + "grad_norm": 1.964414358139038, + "kl": 0.5641217827796936, + "learning_rate": 8.850542684044078e-07, + "loss": 0.0564, + "num_tokens": 13206142.0, + "reward": 0.85540771484375, + "reward_std": 0.027292389422655106, + "rewards//mean": 0.85540771484375, + "rewards//std": 0.034404102712869644, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2284, + "grad_norm": 1.5214049816131592, + "kl": 0.49270905554294586, + "learning_rate": 8.848517603211078e-07, + "loss": 0.0493, + "num_tokens": 13217654.0, + "reward": 0.8314208984375, + "reward_std": 0.021566614508628845, + "rewards//mean": 0.8314208984375, + "rewards//std": 0.02669582888484001, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2286, + "grad_norm": 1.748920202255249, + "kl": 0.44411441311240196, + "learning_rate": 8.846490972195646e-07, + "loss": 0.0444, + "num_tokens": 13229222.0, + "reward": 0.831298828125, + "reward_std": 0.024141106754541397, + "rewards//mean": 0.831298828125, + "rewards//std": 0.03571246936917305, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2288, + "grad_norm": 1.4863101243972778, + "kl": 0.43802759051322937, + "learning_rate": 8.844462791814112e-07, + "loss": 0.0438, + "num_tokens": 13240774.0, + "reward": 0.8603515625, + "reward_std": 0.020699240267276764, + "rewards//mean": 0.8603515625, + "rewards//std": 0.026069553568959236, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.229, + "grad_norm": 1.9027873277664185, + "kl": 0.5113161541521549, + "learning_rate": 8.842433062883425e-07, + "loss": 0.0511, + "num_tokens": 13252270.0, + "reward": 0.87841796875, + "reward_std": 0.022273510694503784, + "rewards//mean": 0.87841796875, + "rewards//std": 0.028043415397405624, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2292, + "grad_norm": 1.944124698638916, + "kl": 0.5541365928947926, + "learning_rate": 8.840401786221159e-07, + "loss": 0.0554, + "num_tokens": 13263798.0, + "reward": 0.8154296875, + "reward_std": 0.02015933394432068, + "rewards//mean": 0.8154296875, + "rewards//std": 0.029577648267149925, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2294, + "grad_norm": 1.5942072868347168, + "kl": 0.5337507352232933, + "learning_rate": 8.838368962645513e-07, + "loss": 0.0534, + "num_tokens": 13275470.0, + "reward": 0.82177734375, + "reward_std": 0.026391366496682167, + "rewards//mean": 0.82177734375, + "rewards//std": 0.03368562087416649, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2296, + "grad_norm": 1.9979602098464966, + "kl": 0.5074471235275269, + "learning_rate": 8.836334592975308e-07, + "loss": 0.0507, + "num_tokens": 13287070.0, + "reward": 0.85247802734375, + "reward_std": 0.0324978269636631, + "rewards//mean": 0.85247802734375, + "rewards//std": 0.03606923669576645, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2298, + "grad_norm": 2.087287664413452, + "kl": 0.5601102821528912, + "learning_rate": 8.834298678029988e-07, + "loss": 0.056, + "num_tokens": 13298598.0, + "reward": 0.82598876953125, + "reward_std": 0.030006704851984978, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.03676219284534454, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.23, + "grad_norm": 1.858852505683899, + "kl": 0.7096199281513691, + "learning_rate": 8.83226121862962e-07, + "loss": 0.071, + "num_tokens": 13310094.0, + "reward": 0.83807373046875, + "reward_std": 0.02160453051328659, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.03184066340327263, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2302, + "grad_norm": 1.6396856307983398, + "kl": 0.5055105723440647, + "learning_rate": 8.83022221559489e-07, + "loss": 0.0506, + "num_tokens": 13321614.0, + "reward": 0.84283447265625, + "reward_std": 0.018205661326646805, + "rewards//mean": 0.84283447265625, + "rewards//std": 0.025437012314796448, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2304, + "grad_norm": 1.816128134727478, + "kl": 0.5360119640827179, + "learning_rate": 8.82818166974711e-07, + "loss": 0.0536, + "num_tokens": 13333230.0, + "reward": 0.8572998046875, + "reward_std": 0.027248838916420937, + "rewards//mean": 0.8572998046875, + "rewards//std": 0.03298424929380417, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2306, + "grad_norm": 2.404366970062256, + "kl": 0.4920961260795593, + "learning_rate": 8.826139581908211e-07, + "loss": 0.0492, + "num_tokens": 13344774.0, + "reward": 0.81243896484375, + "reward_std": 0.018001887947320938, + "rewards//mean": 0.81243896484375, + "rewards//std": 0.02338051237165928, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2308, + "grad_norm": 1.5869191884994507, + "kl": 0.522806815803051, + "learning_rate": 8.824095952900746e-07, + "loss": 0.0523, + "num_tokens": 13356326.0, + "reward": 0.834228515625, + "reward_std": 0.023220248520374298, + "rewards//mean": 0.834228515625, + "rewards//std": 0.029205795377492905, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.231, + "grad_norm": 1.6923807859420776, + "kl": 0.5766335502266884, + "learning_rate": 8.822050783547889e-07, + "loss": 0.0577, + "num_tokens": 13367902.0, + "reward": 0.8260498046875, + "reward_std": 0.029900819063186646, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.03673608601093292, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2312, + "grad_norm": 1.934389352798462, + "kl": 0.5814130045473576, + "learning_rate": 8.820004074673433e-07, + "loss": 0.0581, + "num_tokens": 13379446.0, + "reward": 0.84075927734375, + "reward_std": 0.02260933257639408, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.024839654564857483, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2314, + "grad_norm": 1.89406418800354, + "kl": 0.5375013612210751, + "learning_rate": 8.817955827101792e-07, + "loss": 0.0538, + "num_tokens": 13390982.0, + "reward": 0.81756591796875, + "reward_std": 0.023839078843593597, + "rewards//mean": 0.81756591796875, + "rewards//std": 0.03537040576338768, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2316, + "grad_norm": 1.9660600423812866, + "kl": 0.43939148634672165, + "learning_rate": 8.815906041658001e-07, + "loss": 0.0439, + "num_tokens": 13402534.0, + "reward": 0.82989501953125, + "reward_std": 0.015210457146167755, + "rewards//mean": 0.82989501953125, + "rewards//std": 0.024468179792165756, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2318, + "grad_norm": 1.847939372062683, + "kl": 0.4682850018143654, + "learning_rate": 8.813854719167712e-07, + "loss": 0.0468, + "num_tokens": 13414094.0, + "reward": 0.79412841796875, + "reward_std": 0.019630486145615578, + "rewards//mean": 0.79412841796875, + "rewards//std": 0.025973495095968246, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.232, + "grad_norm": 1.8225617408752441, + "kl": 0.5591977387666702, + "learning_rate": 8.8118018604572e-07, + "loss": 0.0559, + "num_tokens": 13425614.0, + "reward": 0.83746337890625, + "reward_std": 0.02308744005858898, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.026010185480117798, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2322, + "grad_norm": 1.9267091751098633, + "kl": 0.5767880156636238, + "learning_rate": 8.809747466353355e-07, + "loss": 0.0577, + "num_tokens": 13437150.0, + "reward": 0.8663330078125, + "reward_std": 0.026002775877714157, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.03886650502681732, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2324, + "grad_norm": 1.8646221160888672, + "kl": 0.5168589688837528, + "learning_rate": 8.807691537683684e-07, + "loss": 0.0517, + "num_tokens": 13448654.0, + "reward": 0.81732177734375, + "reward_std": 0.02325308695435524, + "rewards//mean": 0.81732177734375, + "rewards//std": 0.025711657479405403, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2326, + "grad_norm": 1.7662862539291382, + "kl": 0.5557834766805172, + "learning_rate": 8.805634075276317e-07, + "loss": 0.0556, + "num_tokens": 13460230.0, + "reward": 0.83148193359375, + "reward_std": 0.026801910251379013, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.03526883199810982, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2328, + "grad_norm": 1.875901222229004, + "kl": 0.7082985006272793, + "learning_rate": 8.80357507996e-07, + "loss": 0.0708, + "num_tokens": 13471766.0, + "reward": 0.8485107421875, + "reward_std": 0.026136239990592003, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.03325847163796425, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.233, + "grad_norm": 2.0127696990966797, + "kl": 0.5911875292658806, + "learning_rate": 8.801514552564095e-07, + "loss": 0.0591, + "num_tokens": 13483454.0, + "reward": 0.83782958984375, + "reward_std": 0.023232024163007736, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.028099432587623596, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2332, + "grad_norm": 1.9522804021835327, + "kl": 0.4845363721251488, + "learning_rate": 8.799452493918585e-07, + "loss": 0.0485, + "num_tokens": 13494950.0, + "reward": 0.80792236328125, + "reward_std": 0.02611805871129036, + "rewards//mean": 0.80792236328125, + "rewards//std": 0.031424324959516525, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2334, + "grad_norm": 1.6965774297714233, + "kl": 0.6002986170351505, + "learning_rate": 8.797388904854063e-07, + "loss": 0.06, + "num_tokens": 13506550.0, + "reward": 0.81854248046875, + "reward_std": 0.022551588714122772, + "rewards//mean": 0.81854248046875, + "rewards//std": 0.02830500900745392, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2336, + "grad_norm": 1.7328457832336426, + "kl": 0.5794375725090504, + "learning_rate": 8.795323786201745e-07, + "loss": 0.0579, + "num_tokens": 13518070.0, + "reward": 0.8355712890625, + "reward_std": 0.026901211589574814, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.030510524287819862, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2338, + "grad_norm": 2.020327568054199, + "kl": 0.5418264009058475, + "learning_rate": 8.79325713879346e-07, + "loss": 0.0542, + "num_tokens": 13529574.0, + "reward": 0.82635498046875, + "reward_std": 0.018335867673158646, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.02063392475247383, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.234, + "grad_norm": 1.6266428232192993, + "kl": 0.6873955056071281, + "learning_rate": 8.791188963461652e-07, + "loss": 0.0687, + "num_tokens": 13541094.0, + "reward": 0.83404541015625, + "reward_std": 0.02436729148030281, + "rewards//mean": 0.83404541015625, + "rewards//std": 0.02671191282570362, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2342, + "grad_norm": 1.6216081380844116, + "kl": 0.48947588726878166, + "learning_rate": 8.789119261039384e-07, + "loss": 0.0489, + "num_tokens": 13552718.0, + "reward": 0.8551025390625, + "reward_std": 0.026444513350725174, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.0439453125, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2344, + "grad_norm": 2.2501487731933594, + "kl": 0.6184299401938915, + "learning_rate": 8.78704803236033e-07, + "loss": 0.0618, + "num_tokens": 13564390.0, + "reward": 0.7249755859375, + "reward_std": 0.01488354243338108, + "rewards//mean": 0.7249755859375, + "rewards//std": 0.02026425674557686, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2346, + "grad_norm": 1.770888328552246, + "kl": 0.5948951914906502, + "learning_rate": 8.784975278258782e-07, + "loss": 0.0595, + "num_tokens": 13575958.0, + "reward": 0.83099365234375, + "reward_std": 0.02826734073460102, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.04106500744819641, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2348, + "grad_norm": 1.9469608068466187, + "kl": 0.5286225564777851, + "learning_rate": 8.782900999569645e-07, + "loss": 0.0529, + "num_tokens": 13587526.0, + "reward": 0.880859375, + "reward_std": 0.025688577443361282, + "rewards//mean": 0.880859375, + "rewards//std": 0.031909581273794174, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.235, + "grad_norm": 1.7816431522369385, + "kl": 0.5336294695734978, + "learning_rate": 8.780825197128437e-07, + "loss": 0.0534, + "num_tokens": 13599126.0, + "reward": 0.823486328125, + "reward_std": 0.028794273734092712, + "rewards//mean": 0.823486328125, + "rewards//std": 0.035928841680288315, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2352, + "grad_norm": 1.6381821632385254, + "kl": 0.5366844944655895, + "learning_rate": 8.778747871771291e-07, + "loss": 0.0537, + "num_tokens": 13610654.0, + "reward": 0.8123779296875, + "reward_std": 0.015442204661667347, + "rewards//mean": 0.8123779296875, + "rewards//std": 0.018001457676291466, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2354, + "grad_norm": 2.4310050010681152, + "kl": 0.6755405254662037, + "learning_rate": 8.776669024334955e-07, + "loss": 0.0676, + "num_tokens": 13622262.0, + "reward": 0.7958984375, + "reward_std": 0.02643211930990219, + "rewards//mean": 0.7958984375, + "rewards//std": 0.03709713742136955, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2356, + "grad_norm": 1.7078548669815063, + "kl": 0.6196857243776321, + "learning_rate": 8.774588655656787e-07, + "loss": 0.062, + "num_tokens": 13633870.0, + "reward": 0.80242919921875, + "reward_std": 0.02625448815524578, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.04446084052324295, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2358, + "grad_norm": 1.7734005451202393, + "kl": 0.6356759741902351, + "learning_rate": 8.772506766574761e-07, + "loss": 0.0636, + "num_tokens": 13645494.0, + "reward": 0.83831787109375, + "reward_std": 0.020368501543998718, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.027810268104076385, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.236, + "grad_norm": 1.8894531726837158, + "kl": 0.6454313397407532, + "learning_rate": 8.770423357927462e-07, + "loss": 0.0645, + "num_tokens": 13657166.0, + "reward": 0.76220703125, + "reward_std": 0.02428710088133812, + "rewards//mean": 0.76220703125, + "rewards//std": 0.030785392969846725, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2362, + "grad_norm": 1.7988823652267456, + "kl": 0.602322019636631, + "learning_rate": 8.768338430554082e-07, + "loss": 0.0602, + "num_tokens": 13668718.0, + "reward": 0.8607177734375, + "reward_std": 0.027906734496355057, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.039026640355587006, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2364, + "grad_norm": 1.671142578125, + "kl": 0.484476774930954, + "learning_rate": 8.766251985294434e-07, + "loss": 0.0484, + "num_tokens": 13680342.0, + "reward": 0.8175048828125, + "reward_std": 0.0160528477281332, + "rewards//mean": 0.8175048828125, + "rewards//std": 0.01954326033592224, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2366, + "grad_norm": 1.6413307189941406, + "kl": 0.6595504134893417, + "learning_rate": 8.764164022988937e-07, + "loss": 0.066, + "num_tokens": 13691910.0, + "reward": 0.83154296875, + "reward_std": 0.026556245982646942, + "rewards//mean": 0.83154296875, + "rewards//std": 0.035314351320266724, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2368, + "grad_norm": 1.9784523248672485, + "kl": 0.6585122123360634, + "learning_rate": 8.762074544478621e-07, + "loss": 0.0659, + "num_tokens": 13703414.0, + "reward": 0.8402099609375, + "reward_std": 0.034546516835689545, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.03863681107759476, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.237, + "grad_norm": 1.6256033182144165, + "kl": 0.5444382168352604, + "learning_rate": 8.75998355060513e-07, + "loss": 0.0544, + "num_tokens": 13714934.0, + "reward": 0.833984375, + "reward_std": 0.017698053270578384, + "rewards//mean": 0.833984375, + "rewards//std": 0.030710561200976372, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2372, + "grad_norm": 1.725518822669983, + "kl": 0.6924906857311726, + "learning_rate": 8.757891042210712e-07, + "loss": 0.0692, + "num_tokens": 13726534.0, + "reward": 0.833984375, + "reward_std": 0.02560281567275524, + "rewards//mean": 0.833984375, + "rewards//std": 0.032526031136512756, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2374, + "grad_norm": 2.268855571746826, + "kl": 0.7819020375609398, + "learning_rate": 8.755797020138234e-07, + "loss": 0.0782, + "num_tokens": 13738110.0, + "reward": 0.83734130859375, + "reward_std": 0.02450389787554741, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.031268343329429626, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2376, + "grad_norm": 1.852020502090454, + "kl": 0.5299231596291065, + "learning_rate": 8.753701485231164e-07, + "loss": 0.053, + "num_tokens": 13749638.0, + "reward": 0.79864501953125, + "reward_std": 0.023467281833291054, + "rewards//mean": 0.79864501953125, + "rewards//std": 0.02852662093937397, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2378, + "grad_norm": 1.9790624380111694, + "kl": 0.578454777598381, + "learning_rate": 8.751604438333586e-07, + "loss": 0.0578, + "num_tokens": 13761254.0, + "reward": 0.84967041015625, + "reward_std": 0.026303719729185104, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.031757354736328125, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.238, + "grad_norm": 1.916062831878662, + "kl": 0.5277891010046005, + "learning_rate": 8.749505880290188e-07, + "loss": 0.0528, + "num_tokens": 13772814.0, + "reward": 0.864501953125, + "reward_std": 0.0257189329713583, + "rewards//mean": 0.864501953125, + "rewards//std": 0.035027701407670975, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2382, + "grad_norm": 1.619443416595459, + "kl": 0.6445678994059563, + "learning_rate": 8.74740581194627e-07, + "loss": 0.0645, + "num_tokens": 13784318.0, + "reward": 0.831298828125, + "reward_std": 0.030266281217336655, + "rewards//mean": 0.831298828125, + "rewards//std": 0.036649659276008606, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2384, + "grad_norm": 1.9051553010940552, + "kl": 0.667740985751152, + "learning_rate": 8.745304234147739e-07, + "loss": 0.0668, + "num_tokens": 13795934.0, + "reward": 0.85150146484375, + "reward_std": 0.0359485037624836, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.04290665686130524, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2386, + "grad_norm": 1.9644160270690918, + "kl": 0.7187744304537773, + "learning_rate": 8.743201147741111e-07, + "loss": 0.0719, + "num_tokens": 13807518.0, + "reward": 0.852294921875, + "reward_std": 0.03092549555003643, + "rewards//mean": 0.852294921875, + "rewards//std": 0.040968358516693115, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2388, + "grad_norm": 1.7586549520492554, + "kl": 0.677355445921421, + "learning_rate": 8.741096553573506e-07, + "loss": 0.0677, + "num_tokens": 13819070.0, + "reward": 0.842041015625, + "reward_std": 0.029408015310764313, + "rewards//mean": 0.842041015625, + "rewards//std": 0.0345192477107048, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.239, + "grad_norm": 2.0752999782562256, + "kl": 0.7956772148609161, + "learning_rate": 8.73899045249266e-07, + "loss": 0.0796, + "num_tokens": 13830654.0, + "reward": 0.82806396484375, + "reward_std": 0.02175304852426052, + "rewards//mean": 0.82806396484375, + "rewards//std": 0.027826592326164246, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2392, + "grad_norm": 2.0089430809020996, + "kl": 0.7155617661774158, + "learning_rate": 8.736882845346905e-07, + "loss": 0.0716, + "num_tokens": 13842294.0, + "reward": 0.87042236328125, + "reward_std": 0.030703812837600708, + "rewards//mean": 0.87042236328125, + "rewards//std": 0.03579159453511238, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2394, + "grad_norm": 1.9589171409606934, + "kl": 0.8125139400362968, + "learning_rate": 8.734773732985185e-07, + "loss": 0.0813, + "num_tokens": 13853862.0, + "reward": 0.825927734375, + "reward_std": 0.02438437193632126, + "rewards//mean": 0.825927734375, + "rewards//std": 0.029715511947870255, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2396, + "grad_norm": 1.8188412189483643, + "kl": 0.6533200666308403, + "learning_rate": 8.732663116257055e-07, + "loss": 0.0653, + "num_tokens": 13865406.0, + "reward": 0.85076904296875, + "reward_std": 0.029847215861082077, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.03565344959497452, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2398, + "grad_norm": 2.083422899246216, + "kl": 0.8664126358926296, + "learning_rate": 8.730550996012667e-07, + "loss": 0.0866, + "num_tokens": 13876958.0, + "reward": 0.83026123046875, + "reward_std": 0.028779443353414536, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.03580089658498764, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.24, + "grad_norm": 2.0167880058288574, + "kl": 0.6934840604662895, + "learning_rate": 8.728437373102784e-07, + "loss": 0.0693, + "num_tokens": 13888430.0, + "reward": 0.84649658203125, + "reward_std": 0.023455552756786346, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.033303674310445786, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2402, + "grad_norm": 2.024256467819214, + "kl": 0.8196956068277359, + "learning_rate": 8.726322248378774e-07, + "loss": 0.082, + "num_tokens": 13899966.0, + "reward": 0.81964111328125, + "reward_std": 0.019603151828050613, + "rewards//mean": 0.81964111328125, + "rewards//std": 0.023173056542873383, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2404, + "grad_norm": 1.9013267755508423, + "kl": 0.5444635301828384, + "learning_rate": 8.724205622692606e-07, + "loss": 0.0544, + "num_tokens": 13911558.0, + "reward": 0.84906005859375, + "reward_std": 0.02106660045683384, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.027929762378335, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2406, + "grad_norm": 2.382042407989502, + "kl": 0.7071444466710091, + "learning_rate": 8.72208749689686e-07, + "loss": 0.0707, + "num_tokens": 13923030.0, + "reward": 0.84307861328125, + "reward_std": 0.025349654257297516, + "rewards//mean": 0.84307861328125, + "rewards//std": 0.03140890598297119, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2408, + "grad_norm": 1.6057301759719849, + "kl": 0.5377060994505882, + "learning_rate": 8.719967871844715e-07, + "loss": 0.0538, + "num_tokens": 13934550.0, + "reward": 0.8233642578125, + "reward_std": 0.018593499436974525, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.03539634123444557, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.241, + "grad_norm": 1.7450426816940308, + "kl": 0.6269400306046009, + "learning_rate": 8.717846748389955e-07, + "loss": 0.0627, + "num_tokens": 13946078.0, + "reward": 0.84857177734375, + "reward_std": 0.028637666255235672, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.03298993036150932, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2412, + "grad_norm": 2.0041379928588867, + "kl": 0.5766609013080597, + "learning_rate": 8.71572412738697e-07, + "loss": 0.0577, + "num_tokens": 13957694.0, + "reward": 0.8433837890625, + "reward_std": 0.0259174145758152, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.032292257994413376, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2414, + "grad_norm": 1.8091542720794678, + "kl": 0.6735386848449707, + "learning_rate": 8.713600009690751e-07, + "loss": 0.0674, + "num_tokens": 13969254.0, + "reward": 0.82977294921875, + "reward_std": 0.02369583770632744, + "rewards//mean": 0.82977294921875, + "rewards//std": 0.02806331403553486, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2416, + "grad_norm": 1.9654127359390259, + "kl": 0.7157592475414276, + "learning_rate": 8.711474396156892e-07, + "loss": 0.0716, + "num_tokens": 13980718.0, + "reward": 0.845458984375, + "reward_std": 0.029256979003548622, + "rewards//mean": 0.845458984375, + "rewards//std": 0.03302759304642677, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2418, + "grad_norm": 1.9403640031814575, + "kl": 0.6588780656456947, + "learning_rate": 8.709347287641592e-07, + "loss": 0.0659, + "num_tokens": 13992270.0, + "reward": 0.86834716796875, + "reward_std": 0.0222182497382164, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.026835734024643898, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.242, + "grad_norm": 1.8353345394134521, + "kl": 0.5468274392187595, + "learning_rate": 8.707218685001646e-07, + "loss": 0.0547, + "num_tokens": 14003726.0, + "reward": 0.8331298828125, + "reward_std": 0.02605665847659111, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.035614632070064545, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2422, + "grad_norm": 1.714673399925232, + "kl": 0.5521312952041626, + "learning_rate": 8.705088589094458e-07, + "loss": 0.0552, + "num_tokens": 14015222.0, + "reward": 0.82275390625, + "reward_std": 0.025304019451141357, + "rewards//mean": 0.82275390625, + "rewards//std": 0.030761782079935074, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2424, + "grad_norm": 2.0660033226013184, + "kl": 0.709751732647419, + "learning_rate": 8.702957000778029e-07, + "loss": 0.071, + "num_tokens": 14026894.0, + "reward": 0.8641357421875, + "reward_std": 0.030700907111167908, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.03713282197713852, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2426, + "grad_norm": 1.8678991794586182, + "kl": 0.6772042028605938, + "learning_rate": 8.700823920910963e-07, + "loss": 0.0677, + "num_tokens": 14038462.0, + "reward": 0.8238525390625, + "reward_std": 0.025208866223692894, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.02971423789858818, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2428, + "grad_norm": 2.0188560485839844, + "kl": 0.6635024473071098, + "learning_rate": 8.698689350352464e-07, + "loss": 0.0664, + "num_tokens": 14050046.0, + "reward": 0.77984619140625, + "reward_std": 0.015271836891770363, + "rewards//mean": 0.77984619140625, + "rewards//std": 0.01729324646294117, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.243, + "grad_norm": 1.8857295513153076, + "kl": 0.6723480373620987, + "learning_rate": 8.696553289962337e-07, + "loss": 0.0672, + "num_tokens": 14061598.0, + "reward": 0.84320068359375, + "reward_std": 0.021219002082943916, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.02870383858680725, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2432, + "grad_norm": 1.9906548261642456, + "kl": 0.6423120945692062, + "learning_rate": 8.694415740600988e-07, + "loss": 0.0642, + "num_tokens": 14073054.0, + "reward": 0.86151123046875, + "reward_std": 0.030571434646844864, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.0332254022359848, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2434, + "grad_norm": 1.9622759819030762, + "kl": 0.9124999716877937, + "learning_rate": 8.69227670312942e-07, + "loss": 0.0912, + "num_tokens": 14084622.0, + "reward": 0.85009765625, + "reward_std": 0.03449724614620209, + "rewards//mean": 0.85009765625, + "rewards//std": 0.04296311363577843, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2436, + "grad_norm": 1.8976361751556396, + "kl": 0.6636247523128986, + "learning_rate": 8.690136178409235e-07, + "loss": 0.0664, + "num_tokens": 14096206.0, + "reward": 0.8087158203125, + "reward_std": 0.023846780881285667, + "rewards//mean": 0.8087158203125, + "rewards//std": 0.029305916279554367, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2438, + "grad_norm": 2.1729679107666016, + "kl": 0.7021920680999756, + "learning_rate": 8.687994167302641e-07, + "loss": 0.0702, + "num_tokens": 14107830.0, + "reward": 0.84130859375, + "reward_std": 0.03023730032145977, + "rewards//mean": 0.84130859375, + "rewards//std": 0.036087747663259506, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.244, + "grad_norm": 1.6231763362884521, + "kl": 0.6219140626490116, + "learning_rate": 8.685850670672438e-07, + "loss": 0.0622, + "num_tokens": 14119382.0, + "reward": 0.78173828125, + "reward_std": 0.02080589346587658, + "rewards//mean": 0.78173828125, + "rewards//std": 0.033258698880672455, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2442, + "grad_norm": 1.7624930143356323, + "kl": 0.6449780836701393, + "learning_rate": 8.683705689382024e-07, + "loss": 0.0645, + "num_tokens": 14131070.0, + "reward": 0.85205078125, + "reward_std": 0.025351418182253838, + "rewards//mean": 0.85205078125, + "rewards//std": 0.029622651636600494, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2444, + "grad_norm": 1.6551330089569092, + "kl": 0.7684353739023209, + "learning_rate": 8.6815592242954e-07, + "loss": 0.0768, + "num_tokens": 14142622.0, + "reward": 0.82763671875, + "reward_std": 0.021627753973007202, + "rewards//mean": 0.82763671875, + "rewards//std": 0.0277831070125103, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2446, + "grad_norm": 2.1114614009857178, + "kl": 0.7929595038294792, + "learning_rate": 8.67941127627716e-07, + "loss": 0.0793, + "num_tokens": 14154326.0, + "reward": 0.83514404296875, + "reward_std": 0.021678436547517776, + "rewards//mean": 0.83514404296875, + "rewards//std": 0.029050342738628387, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2448, + "grad_norm": 1.8161859512329102, + "kl": 0.6906046643853188, + "learning_rate": 8.677261846192499e-07, + "loss": 0.0691, + "num_tokens": 14165870.0, + "reward": 0.8524169921875, + "reward_std": 0.03033709153532982, + "rewards//mean": 0.8524169921875, + "rewards//std": 0.039832860231399536, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.245, + "grad_norm": 2.7754170894622803, + "kl": 0.7622793167829514, + "learning_rate": 8.675110934907204e-07, + "loss": 0.0762, + "num_tokens": 14177438.0, + "reward": 0.78131103515625, + "reward_std": 0.016899388283491135, + "rewards//mean": 0.78131103515625, + "rewards//std": 0.02705875039100647, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2452, + "grad_norm": 1.8769587278366089, + "kl": 0.6619786322116852, + "learning_rate": 8.672958543287666e-07, + "loss": 0.0662, + "num_tokens": 14188926.0, + "reward": 0.79132080078125, + "reward_std": 0.021131880581378937, + "rewards//mean": 0.79132080078125, + "rewards//std": 0.029063886031508446, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2454, + "grad_norm": 1.946630597114563, + "kl": 0.8278003223240376, + "learning_rate": 8.670804672200865e-07, + "loss": 0.0828, + "num_tokens": 14200486.0, + "reward": 0.849365234375, + "reward_std": 0.03149009495973587, + "rewards//mean": 0.849365234375, + "rewards//std": 0.04051057994365692, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2456, + "grad_norm": 2.033825635910034, + "kl": 0.5929355435073376, + "learning_rate": 8.668649322514381e-07, + "loss": 0.0593, + "num_tokens": 14212070.0, + "reward": 0.863037109375, + "reward_std": 0.026542015373706818, + "rewards//mean": 0.863037109375, + "rewards//std": 0.028771329671144485, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2458, + "grad_norm": 1.7719309329986572, + "kl": 0.6300454065203667, + "learning_rate": 8.666492495096389e-07, + "loss": 0.063, + "num_tokens": 14223622.0, + "reward": 0.852783203125, + "reward_std": 0.01754189282655716, + "rewards//mean": 0.852783203125, + "rewards//std": 0.019573058933019638, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.246, + "grad_norm": 1.6789416074752808, + "kl": 0.7727434374392033, + "learning_rate": 8.664334190815659e-07, + "loss": 0.0773, + "num_tokens": 14235214.0, + "reward": 0.81646728515625, + "reward_std": 0.031234916299581528, + "rewards//mean": 0.81646728515625, + "rewards//std": 0.0430644266307354, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2462, + "grad_norm": 1.8657196760177612, + "kl": 0.6603197976946831, + "learning_rate": 8.662174410541554e-07, + "loss": 0.066, + "num_tokens": 14246750.0, + "reward": 0.82073974609375, + "reward_std": 0.028914058580994606, + "rewards//mean": 0.82073974609375, + "rewards//std": 0.0446527935564518, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2464, + "grad_norm": 1.8770426511764526, + "kl": 0.6110378913581371, + "learning_rate": 8.660013155144035e-07, + "loss": 0.0611, + "num_tokens": 14258222.0, + "reward": 0.86767578125, + "reward_std": 0.026850439608097076, + "rewards//mean": 0.86767578125, + "rewards//std": 0.03265608474612236, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2466, + "grad_norm": 2.163252592086792, + "kl": 0.5980824083089828, + "learning_rate": 8.657850425493654e-07, + "loss": 0.0598, + "num_tokens": 14269774.0, + "reward": 0.85675048828125, + "reward_std": 0.03183293715119362, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.03584906831383705, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2468, + "grad_norm": 1.9437925815582275, + "kl": 0.6944381780922413, + "learning_rate": 8.65568622246156e-07, + "loss": 0.0694, + "num_tokens": 14281366.0, + "reward": 0.81524658203125, + "reward_std": 0.017386969178915024, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.026214823126792908, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.247, + "grad_norm": 2.0990514755249023, + "kl": 0.7417266666889191, + "learning_rate": 8.653520546919493e-07, + "loss": 0.0742, + "num_tokens": 14292870.0, + "reward": 0.84405517578125, + "reward_std": 0.022277237847447395, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.03336634114384651, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2472, + "grad_norm": 1.6867579221725464, + "kl": 0.6393818110227585, + "learning_rate": 8.651353399739787e-07, + "loss": 0.0639, + "num_tokens": 14304438.0, + "reward": 0.8179931640625, + "reward_std": 0.032317254692316055, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.041873782873153687, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2474, + "grad_norm": 2.2249996662139893, + "kl": 0.7258444353938103, + "learning_rate": 8.649184781795367e-07, + "loss": 0.0726, + "num_tokens": 14316022.0, + "reward": 0.83282470703125, + "reward_std": 0.024811681360006332, + "rewards//mean": 0.83282470703125, + "rewards//std": 0.030074460431933403, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2476, + "grad_norm": 2.4835891723632812, + "kl": 0.5359928831458092, + "learning_rate": 8.647014693959753e-07, + "loss": 0.0536, + "num_tokens": 14327590.0, + "reward": 0.86700439453125, + "reward_std": 0.031063305214047432, + "rewards//mean": 0.86700439453125, + "rewards//std": 0.04242377355694771, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2478, + "grad_norm": 2.1372406482696533, + "kl": 0.8225936405360699, + "learning_rate": 8.644843137107057e-07, + "loss": 0.0823, + "num_tokens": 14339206.0, + "reward": 0.84912109375, + "reward_std": 0.027008865028619766, + "rewards//mean": 0.84912109375, + "rewards//std": 0.03407878428697586, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.248, + "grad_norm": 2.162055015563965, + "kl": 0.6919315718114376, + "learning_rate": 8.642670112111981e-07, + "loss": 0.0692, + "num_tokens": 14350878.0, + "reward": 0.83148193359375, + "reward_std": 0.02278326079249382, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.027803735807538033, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2482, + "grad_norm": 2.1309685707092285, + "kl": 0.723419014364481, + "learning_rate": 8.64049561984982e-07, + "loss": 0.0723, + "num_tokens": 14362510.0, + "reward": 0.8270263671875, + "reward_std": 0.02243286371231079, + "rewards//mean": 0.8270263671875, + "rewards//std": 0.025272004306316376, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2484, + "grad_norm": 2.293038845062256, + "kl": 0.603964664041996, + "learning_rate": 8.638319661196459e-07, + "loss": 0.0604, + "num_tokens": 14374038.0, + "reward": 0.8211669921875, + "reward_std": 0.015530337579548359, + "rewards//mean": 0.8211669921875, + "rewards//std": 0.020177416503429413, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2486, + "grad_norm": 2.0297491550445557, + "kl": 0.8791936933994293, + "learning_rate": 8.636142237028372e-07, + "loss": 0.0879, + "num_tokens": 14385646.0, + "reward": 0.8372802734375, + "reward_std": 0.02020917274057865, + "rewards//mean": 0.8372802734375, + "rewards//std": 0.023583978414535522, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2488, + "grad_norm": 1.9647690057754517, + "kl": 0.6428892388939857, + "learning_rate": 8.633963348222628e-07, + "loss": 0.0643, + "num_tokens": 14397254.0, + "reward": 0.85882568359375, + "reward_std": 0.021637044847011566, + "rewards//mean": 0.85882568359375, + "rewards//std": 0.026281146332621574, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.249, + "grad_norm": 1.7031581401824951, + "kl": 0.6981330662965775, + "learning_rate": 8.631782995656882e-07, + "loss": 0.0698, + "num_tokens": 14408822.0, + "reward": 0.845458984375, + "reward_std": 0.024189136922359467, + "rewards//mean": 0.845458984375, + "rewards//std": 0.0316188782453537, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2492, + "grad_norm": 2.0920865535736084, + "kl": 0.8467460870742798, + "learning_rate": 8.62960118020938e-07, + "loss": 0.0847, + "num_tokens": 14420454.0, + "reward": 0.8643798828125, + "reward_std": 0.024705827236175537, + "rewards//mean": 0.8643798828125, + "rewards//std": 0.03816217556595802, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2494, + "grad_norm": 2.2228102684020996, + "kl": 0.7079585790634155, + "learning_rate": 8.627417902758956e-07, + "loss": 0.0708, + "num_tokens": 14432118.0, + "reward": 0.82269287109375, + "reward_std": 0.023624982684850693, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.0295089241117239, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2496, + "grad_norm": 2.357789993286133, + "kl": 0.7086575329303741, + "learning_rate": 8.625233164185034e-07, + "loss": 0.0709, + "num_tokens": 14443646.0, + "reward": 0.81268310546875, + "reward_std": 0.016040772199630737, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.017082756385207176, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2498, + "grad_norm": 1.7868108749389648, + "kl": 0.7243727557361126, + "learning_rate": 8.623046965367628e-07, + "loss": 0.0724, + "num_tokens": 14455166.0, + "reward": 0.82965087890625, + "reward_std": 0.022280355915427208, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.03033456765115261, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.25, + "grad_norm": 1.8304318189620972, + "kl": 0.7058295086026192, + "learning_rate": 8.620859307187338e-07, + "loss": 0.0706, + "num_tokens": 14466710.0, + "reward": 0.80303955078125, + "reward_std": 0.01985947974026203, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.03326820209622383, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2502, + "grad_norm": 1.7283879518508911, + "kl": 0.7791228853166103, + "learning_rate": 8.61867019052535e-07, + "loss": 0.0779, + "num_tokens": 14478454.0, + "reward": 0.84075927734375, + "reward_std": 0.017473798245191574, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.02084195986390114, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2504, + "grad_norm": 1.6074273586273193, + "kl": 0.5401689633727074, + "learning_rate": 8.616479616263444e-07, + "loss": 0.054, + "num_tokens": 14490038.0, + "reward": 0.79022216796875, + "reward_std": 0.01963425800204277, + "rewards//mean": 0.79022216796875, + "rewards//std": 0.025673359632492065, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2506, + "grad_norm": 1.697751760482788, + "kl": 0.9818354584276676, + "learning_rate": 8.61428758528398e-07, + "loss": 0.0982, + "num_tokens": 14501494.0, + "reward": 0.8427734375, + "reward_std": 0.02710142731666565, + "rewards//mean": 0.8427734375, + "rewards//std": 0.03407523036003113, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.2508, + "grad_norm": 2.795522689819336, + "kl": 0.8979740664362907, + "learning_rate": 8.612094098469909e-07, + "loss": 0.1048, + "num_tokens": 14513032.0, + "reward": 0.8037109375, + "reward_std": 0.02146185375750065, + "rewards//mean": 0.8037109375, + "rewards//std": 0.026684768497943878, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.251, + "grad_norm": 1.737756371498108, + "kl": 0.7534324191510677, + "learning_rate": 8.609899156704767e-07, + "loss": 0.0753, + "num_tokens": 14524688.0, + "reward": 0.83984375, + "reward_std": 0.02179780974984169, + "rewards//mean": 0.83984375, + "rewards//std": 0.02734817937016487, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2512, + "grad_norm": 1.8348870277404785, + "kl": 0.8592146411538124, + "learning_rate": 8.607702760872677e-07, + "loss": 0.0859, + "num_tokens": 14536416.0, + "reward": 0.86468505859375, + "reward_std": 0.0331537127494812, + "rewards//mean": 0.86468505859375, + "rewards//std": 0.04344449192285538, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2514, + "grad_norm": 1.972006916999817, + "kl": 0.44329238310456276, + "learning_rate": 8.605504911858346e-07, + "loss": 0.0443, + "num_tokens": 14547904.0, + "reward": 0.8280029296875, + "reward_std": 0.01577191986143589, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.02195378951728344, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2516, + "grad_norm": 1.5956956148147583, + "kl": 0.6774386018514633, + "learning_rate": 8.603305610547069e-07, + "loss": 0.0677, + "num_tokens": 14559560.0, + "reward": 0.8282470703125, + "reward_std": 0.021658379584550858, + "rewards//mean": 0.8282470703125, + "rewards//std": 0.03193969279527664, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2518, + "grad_norm": 1.950666069984436, + "kl": 0.7736416086554527, + "learning_rate": 8.601104857824722e-07, + "loss": 0.0774, + "num_tokens": 14571112.0, + "reward": 0.8671875, + "reward_std": 0.03331301361322403, + "rewards//mean": 0.8671875, + "rewards//std": 0.03916776552796364, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.252, + "grad_norm": 2.1226625442504883, + "kl": 0.7521033398807049, + "learning_rate": 8.598902654577768e-07, + "loss": 0.0752, + "num_tokens": 14582656.0, + "reward": 0.8304443359375, + "reward_std": 0.035131338983774185, + "rewards//mean": 0.8304443359375, + "rewards//std": 0.03977353125810623, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2522, + "grad_norm": 1.9919055700302124, + "kl": 0.9239918887615204, + "learning_rate": 8.596699001693255e-07, + "loss": 0.0924, + "num_tokens": 14594256.0, + "reward": 0.822265625, + "reward_std": 0.028113458305597305, + "rewards//mean": 0.822265625, + "rewards//std": 0.030615774914622307, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2524, + "grad_norm": 2.0443613529205322, + "kl": 0.7695573456585407, + "learning_rate": 8.594493900058816e-07, + "loss": 0.077, + "num_tokens": 14605768.0, + "reward": 0.83990478515625, + "reward_std": 0.02607611007988453, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.03370713070034981, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2526, + "grad_norm": 2.2552483081817627, + "kl": 0.718209944665432, + "learning_rate": 8.592287350562663e-07, + "loss": 0.0718, + "num_tokens": 14617280.0, + "reward": 0.85662841796875, + "reward_std": 0.03402780741453171, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.04153088852763176, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2528, + "grad_norm": 1.8305773735046387, + "kl": 0.6529898308217525, + "learning_rate": 8.590079354093593e-07, + "loss": 0.0653, + "num_tokens": 14628856.0, + "reward": 0.851806640625, + "reward_std": 0.016398467123508453, + "rewards//mean": 0.851806640625, + "rewards//std": 0.01941152662038803, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.253, + "grad_norm": 1.9111404418945312, + "kl": 0.7864052951335907, + "learning_rate": 8.587869911540992e-07, + "loss": 0.0786, + "num_tokens": 14640448.0, + "reward": 0.8695068359375, + "reward_std": 0.02823832258582115, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.03973697870969772, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2532, + "grad_norm": 1.7021249532699585, + "kl": 0.6142213828861713, + "learning_rate": 8.585659023794818e-07, + "loss": 0.0614, + "num_tokens": 14652056.0, + "reward": 0.80804443359375, + "reward_std": 0.022229397669434547, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.02767276018857956, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2534, + "grad_norm": 3.1897542476654053, + "kl": 1.0515189841389656, + "learning_rate": 8.583446691745617e-07, + "loss": 0.1052, + "num_tokens": 14663728.0, + "reward": 0.82159423828125, + "reward_std": 0.026113612577319145, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.035386666655540466, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2536, + "grad_norm": 2.3346645832061768, + "kl": 0.7112926244735718, + "learning_rate": 8.581232916284517e-07, + "loss": 0.0711, + "num_tokens": 14675328.0, + "reward": 0.75714111328125, + "reward_std": 0.026078635826706886, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.03663432225584984, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2538, + "grad_norm": 2.1343886852264404, + "kl": 0.7613390907645226, + "learning_rate": 8.579017698303228e-07, + "loss": 0.0761, + "num_tokens": 14686920.0, + "reward": 0.82403564453125, + "reward_std": 0.03132418170571327, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.04306969791650772, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.254, + "grad_norm": 2.0450212955474854, + "kl": 0.988751731812954, + "learning_rate": 8.576801038694039e-07, + "loss": 0.0989, + "num_tokens": 14698504.0, + "reward": 0.8724365234375, + "reward_std": 0.026708748191595078, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.032423246651887894, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2542, + "grad_norm": 1.7350879907608032, + "kl": 1.0248451679944992, + "learning_rate": 8.574582938349817e-07, + "loss": 0.1025, + "num_tokens": 14710136.0, + "reward": 0.829345703125, + "reward_std": 0.024135252460837364, + "rewards//mean": 0.829345703125, + "rewards//std": 0.02871234156191349, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2544, + "grad_norm": 1.7338560819625854, + "kl": 0.7678513079881668, + "learning_rate": 8.572363398164016e-07, + "loss": 0.0768, + "num_tokens": 14721688.0, + "reward": 0.86578369140625, + "reward_std": 0.023974113166332245, + "rewards//mean": 0.86578369140625, + "rewards//std": 0.02679678425192833, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2546, + "grad_norm": 1.828963279724121, + "kl": 0.6599876172840595, + "learning_rate": 8.570142419030666e-07, + "loss": 0.066, + "num_tokens": 14733320.0, + "reward": 0.8499755859375, + "reward_std": 0.026463264599442482, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.031907450407743454, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2548, + "grad_norm": 2.1462810039520264, + "kl": 0.772374339401722, + "learning_rate": 8.567920001844375e-07, + "loss": 0.0772, + "num_tokens": 14744864.0, + "reward": 0.84771728515625, + "reward_std": 0.0291866734623909, + "rewards//mean": 0.84771728515625, + "rewards//std": 0.03804154694080353, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.255, + "grad_norm": 1.9559959173202515, + "kl": 0.5883542001247406, + "learning_rate": 8.565696147500337e-07, + "loss": 0.0588, + "num_tokens": 14756440.0, + "reward": 0.83026123046875, + "reward_std": 0.013853225857019424, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.027751963585615158, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2552, + "grad_norm": 1.952638864517212, + "kl": 0.7928510718047619, + "learning_rate": 8.563470856894314e-07, + "loss": 0.0793, + "num_tokens": 14768008.0, + "reward": 0.8143310546875, + "reward_std": 0.018022436648607254, + "rewards//mean": 0.8143310546875, + "rewards//std": 0.020516669377684593, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2554, + "grad_norm": 2.2336435317993164, + "kl": 0.72412533685565, + "learning_rate": 8.561244130922657e-07, + "loss": 0.0724, + "num_tokens": 14779608.0, + "reward": 0.85125732421875, + "reward_std": 0.03130761906504631, + "rewards//mean": 0.85125732421875, + "rewards//std": 0.03813573718070984, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2556, + "grad_norm": 1.9266926050186157, + "kl": 0.6441489011049271, + "learning_rate": 8.559015970482291e-07, + "loss": 0.0644, + "num_tokens": 14791208.0, + "reward": 0.8638916015625, + "reward_std": 0.03365616500377655, + "rewards//mean": 0.8638916015625, + "rewards//std": 0.04524605721235275, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2558, + "grad_norm": 2.05189847946167, + "kl": 0.9203442111611366, + "learning_rate": 8.556786376470716e-07, + "loss": 0.092, + "num_tokens": 14802880.0, + "reward": 0.84136962890625, + "reward_std": 0.03770702704787254, + "rewards//mean": 0.84136962890625, + "rewards//std": 0.0456744059920311, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.256, + "grad_norm": 2.1242051124572754, + "kl": 0.9809471145272255, + "learning_rate": 8.554555349786015e-07, + "loss": 0.0981, + "num_tokens": 14814408.0, + "reward": 0.80987548828125, + "reward_std": 0.027789510786533356, + "rewards//mean": 0.80987548828125, + "rewards//std": 0.0330655537545681, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2562, + "grad_norm": 2.0846359729766846, + "kl": 0.6344337239861488, + "learning_rate": 8.552322891326844e-07, + "loss": 0.0634, + "num_tokens": 14825992.0, + "reward": 0.81768798828125, + "reward_std": 0.021996822208166122, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.024469416588544846, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2564, + "grad_norm": 2.2834413051605225, + "kl": 1.0390521213412285, + "learning_rate": 8.550089001992437e-07, + "loss": 0.1039, + "num_tokens": 14837576.0, + "reward": 0.802001953125, + "reward_std": 0.02035699039697647, + "rewards//mean": 0.802001953125, + "rewards//std": 0.026592710986733437, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2566, + "grad_norm": 2.3678369522094727, + "kl": 0.8048441559076309, + "learning_rate": 8.547853682682604e-07, + "loss": 0.0805, + "num_tokens": 14849048.0, + "reward": 0.850341796875, + "reward_std": 0.020504657179117203, + "rewards//mean": 0.850341796875, + "rewards//std": 0.027302753180265427, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2568, + "grad_norm": 2.2163214683532715, + "kl": 0.7477850764989853, + "learning_rate": 8.545616934297733e-07, + "loss": 0.0748, + "num_tokens": 14860584.0, + "reward": 0.857666015625, + "reward_std": 0.03397523984313011, + "rewards//mean": 0.857666015625, + "rewards//std": 0.03890017792582512, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.257, + "grad_norm": 2.2516703605651855, + "kl": 0.7755516842007637, + "learning_rate": 8.543378757738784e-07, + "loss": 0.0776, + "num_tokens": 14872152.0, + "reward": 0.7860107421875, + "reward_std": 0.015467697754502296, + "rewards//mean": 0.7860107421875, + "rewards//std": 0.017791688442230225, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2572, + "grad_norm": 2.052602767944336, + "kl": 1.0396039709448814, + "learning_rate": 8.541139153907295e-07, + "loss": 0.104, + "num_tokens": 14883672.0, + "reward": 0.88226318359375, + "reward_std": 0.02819950506091118, + "rewards//mean": 0.88226318359375, + "rewards//std": 0.030859000980854034, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2574, + "grad_norm": 2.067368268966675, + "kl": 1.114944227039814, + "learning_rate": 8.538898123705379e-07, + "loss": 0.1115, + "num_tokens": 14895216.0, + "reward": 0.8643798828125, + "reward_std": 0.027555547654628754, + "rewards//mean": 0.8643798828125, + "rewards//std": 0.04237258434295654, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2576, + "grad_norm": 2.405848741531372, + "kl": 1.06941569596529, + "learning_rate": 8.536655668035721e-07, + "loss": 0.1069, + "num_tokens": 14906808.0, + "reward": 0.87139892578125, + "reward_std": 0.03952668607234955, + "rewards//mean": 0.87139892578125, + "rewards//std": 0.044376324862241745, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2578, + "grad_norm": 2.020063638687134, + "kl": 0.9828615970909595, + "learning_rate": 8.534411787801586e-07, + "loss": 0.0983, + "num_tokens": 14918384.0, + "reward": 0.84454345703125, + "reward_std": 0.025956785306334496, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.03251992166042328, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.258, + "grad_norm": 3.2922561168670654, + "kl": 1.1629251800477505, + "learning_rate": 8.532166483906802e-07, + "loss": 0.1163, + "num_tokens": 14929952.0, + "reward": 0.828369140625, + "reward_std": 0.020208220928907394, + "rewards//mean": 0.828369140625, + "rewards//std": 0.027890779078006744, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2582, + "grad_norm": 3.896566867828369, + "kl": 0.9031587056815624, + "learning_rate": 8.529919757255781e-07, + "loss": 0.0903, + "num_tokens": 14941472.0, + "reward": 0.8023681640625, + "reward_std": 0.017284249886870384, + "rewards//mean": 0.8023681640625, + "rewards//std": 0.02265687845647335, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2584, + "grad_norm": 1.8267507553100586, + "kl": 1.1318640485405922, + "learning_rate": 8.527671608753506e-07, + "loss": 0.1132, + "num_tokens": 14953024.0, + "reward": 0.822509765625, + "reward_std": 0.021743685007095337, + "rewards//mean": 0.822509765625, + "rewards//std": 0.024576283991336823, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2586, + "grad_norm": 1.9206901788711548, + "kl": 0.722185917198658, + "learning_rate": 8.525422039305528e-07, + "loss": 0.0722, + "num_tokens": 14964664.0, + "reward": 0.83660888671875, + "reward_std": 0.02606373280286789, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.035805124789476395, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2588, + "grad_norm": 3.14884614944458, + "kl": 0.9603419005870819, + "learning_rate": 8.523171049817973e-07, + "loss": 0.096, + "num_tokens": 14976184.0, + "reward": 0.8236083984375, + "reward_std": 0.01672906056046486, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.019428284838795662, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.259, + "grad_norm": 1.8015341758728027, + "kl": 0.8676140494644642, + "learning_rate": 8.520918641197541e-07, + "loss": 0.0868, + "num_tokens": 14987768.0, + "reward": 0.8299560546875, + "reward_std": 0.02170868217945099, + "rewards//mean": 0.8299560546875, + "rewards//std": 0.03291809931397438, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2592, + "grad_norm": 1.907727599143982, + "kl": 0.6574233882129192, + "learning_rate": 8.518664814351502e-07, + "loss": 0.0657, + "num_tokens": 14999392.0, + "reward": 0.77789306640625, + "reward_std": 0.021522726863622665, + "rewards//mean": 0.77789306640625, + "rewards//std": 0.036224186420440674, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2594, + "grad_norm": 1.967384696006775, + "kl": 0.7736397907137871, + "learning_rate": 8.516409570187696e-07, + "loss": 0.0774, + "num_tokens": 15011072.0, + "reward": 0.82550048828125, + "reward_std": 0.022686228156089783, + "rewards//mean": 0.82550048828125, + "rewards//std": 0.029347918927669525, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2596, + "grad_norm": 1.8970897197723389, + "kl": 0.8287846371531487, + "learning_rate": 8.514152909614535e-07, + "loss": 0.0829, + "num_tokens": 15022712.0, + "reward": 0.86224365234375, + "reward_std": 0.029573120176792145, + "rewards//mean": 0.86224365234375, + "rewards//std": 0.03570817783474922, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2598, + "grad_norm": 1.6896653175354004, + "kl": 0.6720482110977173, + "learning_rate": 8.511894833541005e-07, + "loss": 0.0672, + "num_tokens": 15034312.0, + "reward": 0.84124755859375, + "reward_std": 0.024182971566915512, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.03218443691730499, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.26, + "grad_norm": 1.8914735317230225, + "kl": 0.7887661457061768, + "learning_rate": 8.509635342876654e-07, + "loss": 0.0769, + "num_tokens": 15045814.0, + "reward": 0.85809326171875, + "reward_std": 0.03401002660393715, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.03863264620304108, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2602, + "grad_norm": 1.745437502861023, + "kl": 0.7837763093411922, + "learning_rate": 8.507374438531606e-07, + "loss": 0.0784, + "num_tokens": 15057350.0, + "reward": 0.80718994140625, + "reward_std": 0.017848577350378036, + "rewards//mean": 0.80718994140625, + "rewards//std": 0.031950294971466064, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2604, + "grad_norm": 2.0344223976135254, + "kl": 0.7624105848371983, + "learning_rate": 8.505112121416553e-07, + "loss": 0.0762, + "num_tokens": 15068982.0, + "reward": 0.8253173828125, + "reward_std": 0.020513979718089104, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.027590377256274223, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2606, + "grad_norm": 2.4139957427978516, + "kl": 0.854373574256897, + "learning_rate": 8.502848392442758e-07, + "loss": 0.0854, + "num_tokens": 15080534.0, + "reward": 0.8438720703125, + "reward_std": 0.03242456167936325, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.039142828434705734, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2608, + "grad_norm": 2.1237690448760986, + "kl": 0.8408098928630352, + "learning_rate": 8.500583252522052e-07, + "loss": 0.0841, + "num_tokens": 15092062.0, + "reward": 0.8228759765625, + "reward_std": 0.025084946304559708, + "rewards//mean": 0.8228759765625, + "rewards//std": 0.031166328117251396, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.261, + "grad_norm": 2.1822595596313477, + "kl": 1.1286577731370926, + "learning_rate": 8.498316702566826e-07, + "loss": 0.1129, + "num_tokens": 15103622.0, + "reward": 0.7939453125, + "reward_std": 0.02155713364481926, + "rewards//mean": 0.7939453125, + "rewards//std": 0.024389559403061867, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2612, + "grad_norm": 2.0552542209625244, + "kl": 0.8739968277513981, + "learning_rate": 8.496048743490053e-07, + "loss": 0.0874, + "num_tokens": 15115302.0, + "reward": 0.871337890625, + "reward_std": 0.028276920318603516, + "rewards//mean": 0.871337890625, + "rewards//std": 0.03847446292638779, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2614, + "grad_norm": 2.1836326122283936, + "kl": 1.003544956445694, + "learning_rate": 8.493779376205264e-07, + "loss": 0.1004, + "num_tokens": 15126870.0, + "reward": 0.810546875, + "reward_std": 0.02314295992255211, + "rewards//mean": 0.810546875, + "rewards//std": 0.037253499031066895, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2616, + "grad_norm": 1.8821353912353516, + "kl": 0.8202426470816135, + "learning_rate": 8.491508601626561e-07, + "loss": 0.082, + "num_tokens": 15138478.0, + "reward": 0.86212158203125, + "reward_std": 0.03181704506278038, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.03851727768778801, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2618, + "grad_norm": 1.9904552698135376, + "kl": 0.7330868616700172, + "learning_rate": 8.489236420668608e-07, + "loss": 0.0733, + "num_tokens": 15150230.0, + "reward": 0.8505859375, + "reward_std": 0.023276114836335182, + "rewards//mean": 0.8505859375, + "rewards//std": 0.027321597561240196, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.262, + "grad_norm": 2.029090642929077, + "kl": 0.6372318975627422, + "learning_rate": 8.486962834246645e-07, + "loss": 0.0637, + "num_tokens": 15161766.0, + "reward": 0.8326416015625, + "reward_std": 0.02174319326877594, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.030227400362491608, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2622, + "grad_norm": 2.009643077850342, + "kl": 0.9216242991387844, + "learning_rate": 8.484687843276468e-07, + "loss": 0.0922, + "num_tokens": 15173302.0, + "reward": 0.8399658203125, + "reward_std": 0.025125697255134583, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.03182574361562729, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2624, + "grad_norm": 1.8791335821151733, + "kl": 0.8468313813209534, + "learning_rate": 8.482411448674445e-07, + "loss": 0.0847, + "num_tokens": 15184878.0, + "reward": 0.8365478515625, + "reward_std": 0.027382247149944305, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.03146025165915489, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2626, + "grad_norm": 2.2190349102020264, + "kl": 0.8075997307896614, + "learning_rate": 8.480133651357505e-07, + "loss": 0.0808, + "num_tokens": 15196478.0, + "reward": 0.8385009765625, + "reward_std": 0.02758500725030899, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.03132136911153793, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2628, + "grad_norm": 1.7618423700332642, + "kl": 0.7291473969817162, + "learning_rate": 8.477854452243147e-07, + "loss": 0.0729, + "num_tokens": 15208102.0, + "reward": 0.83135986328125, + "reward_std": 0.02113945223391056, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.02832746133208275, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.263, + "grad_norm": 4.349953651428223, + "kl": 1.0147571414709091, + "learning_rate": 8.475573852249434e-07, + "loss": 0.1015, + "num_tokens": 15219654.0, + "reward": 0.770263671875, + "reward_std": 0.017852215096354485, + "rewards//mean": 0.770263671875, + "rewards//std": 0.023207735270261765, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2632, + "grad_norm": 2.3844094276428223, + "kl": 0.9350721091032028, + "learning_rate": 8.473291852294986e-07, + "loss": 0.0935, + "num_tokens": 15231166.0, + "reward": 0.84906005859375, + "reward_std": 0.02739860489964485, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.031476788222789764, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2634, + "grad_norm": 1.8456212282180786, + "kl": 0.8244725838303566, + "learning_rate": 8.471008453298996e-07, + "loss": 0.0824, + "num_tokens": 15242710.0, + "reward": 0.86444091796875, + "reward_std": 0.02820904552936554, + "rewards//mean": 0.86444091796875, + "rewards//std": 0.04190246760845184, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2636, + "grad_norm": 2.269632339477539, + "kl": 0.9317579716444016, + "learning_rate": 8.468723656181218e-07, + "loss": 0.0932, + "num_tokens": 15254182.0, + "reward": 0.82830810546875, + "reward_std": 0.019191281870007515, + "rewards//mean": 0.82830810546875, + "rewards//std": 0.027537312358617783, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2638, + "grad_norm": 2.1076011657714844, + "kl": 1.1775119230151176, + "learning_rate": 8.466437461861964e-07, + "loss": 0.1178, + "num_tokens": 15265774.0, + "reward": 0.85028076171875, + "reward_std": 0.029368329793214798, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.030946683138608932, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.264, + "grad_norm": 1.9020578861236572, + "kl": 1.128328301012516, + "learning_rate": 8.464149871262116e-07, + "loss": 0.1128, + "num_tokens": 15277454.0, + "reward": 0.841796875, + "reward_std": 0.030991578474640846, + "rewards//mean": 0.841796875, + "rewards//std": 0.04756743088364601, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2642, + "grad_norm": 2.191267728805542, + "kl": 0.7768005691468716, + "learning_rate": 8.461860885303113e-07, + "loss": 0.0777, + "num_tokens": 15289062.0, + "reward": 0.86468505859375, + "reward_std": 0.0290854349732399, + "rewards//mean": 0.86468505859375, + "rewards//std": 0.03523103892803192, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2644, + "grad_norm": 3.199338674545288, + "kl": 1.2684109061956406, + "learning_rate": 8.459570504906961e-07, + "loss": 0.1268, + "num_tokens": 15300654.0, + "reward": 0.828125, + "reward_std": 0.02695460617542267, + "rewards//mean": 0.828125, + "rewards//std": 0.03765443339943886, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2646, + "grad_norm": 1.872139573097229, + "kl": 0.7322818748652935, + "learning_rate": 8.457278730996222e-07, + "loss": 0.0732, + "num_tokens": 15312198.0, + "reward": 0.83856201171875, + "reward_std": 0.02289734221994877, + "rewards//mean": 0.83856201171875, + "rewards//std": 0.029561715200543404, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2648, + "grad_norm": 2.1895673274993896, + "kl": 0.9512357637286186, + "learning_rate": 8.454985564494024e-07, + "loss": 0.0951, + "num_tokens": 15323782.0, + "reward": 0.81146240234375, + "reward_std": 0.02028394117951393, + "rewards//mean": 0.81146240234375, + "rewards//std": 0.024815265089273453, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.265, + "grad_norm": 1.7171440124511719, + "kl": 0.9189570620656013, + "learning_rate": 8.452691006324054e-07, + "loss": 0.0919, + "num_tokens": 15335342.0, + "reward": 0.84967041015625, + "reward_std": 0.028361598029732704, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.0342806838452816, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2652, + "grad_norm": 2.0280208587646484, + "kl": 0.6721566990017891, + "learning_rate": 8.45039505741056e-07, + "loss": 0.0672, + "num_tokens": 15346926.0, + "reward": 0.849853515625, + "reward_std": 0.0209527388215065, + "rewards//mean": 0.849853515625, + "rewards//std": 0.028737636283040047, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2654, + "grad_norm": 2.0018997192382812, + "kl": 0.9660781845450401, + "learning_rate": 8.448097718678348e-07, + "loss": 0.0966, + "num_tokens": 15358566.0, + "reward": 0.81158447265625, + "reward_std": 0.024467045441269875, + "rewards//mean": 0.81158447265625, + "rewards//std": 0.027636632323265076, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2656, + "grad_norm": 2.010664701461792, + "kl": 1.1283434703946114, + "learning_rate": 8.44579899105279e-07, + "loss": 0.1128, + "num_tokens": 15370126.0, + "reward": 0.83245849609375, + "reward_std": 0.024136768653988838, + "rewards//mean": 0.83245849609375, + "rewards//std": 0.03751986846327782, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2658, + "grad_norm": 2.1045238971710205, + "kl": 0.809752844274044, + "learning_rate": 8.443498875459808e-07, + "loss": 0.081, + "num_tokens": 15381606.0, + "reward": 0.84478759765625, + "reward_std": 0.028092514723539352, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.03491944074630737, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.266, + "grad_norm": 2.1147403717041016, + "kl": 0.6272051259875298, + "learning_rate": 8.441197372825892e-07, + "loss": 0.0627, + "num_tokens": 15393126.0, + "reward": 0.8310546875, + "reward_std": 0.0207013338804245, + "rewards//mean": 0.8310546875, + "rewards//std": 0.04216935485601425, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2662, + "grad_norm": 1.8689652681350708, + "kl": 0.735901303589344, + "learning_rate": 8.438894484078085e-07, + "loss": 0.0736, + "num_tokens": 15404694.0, + "reward": 0.83441162109375, + "reward_std": 0.025579465553164482, + "rewards//mean": 0.83441162109375, + "rewards//std": 0.034108031541109085, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2664, + "grad_norm": 1.7717857360839844, + "kl": 0.7161866240203381, + "learning_rate": 8.43659021014399e-07, + "loss": 0.0716, + "num_tokens": 15416254.0, + "reward": 0.84619140625, + "reward_std": 0.027332797646522522, + "rewards//mean": 0.84619140625, + "rewards//std": 0.03603401780128479, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2666, + "grad_norm": 1.882939100265503, + "kl": 0.9368118494749069, + "learning_rate": 8.434284551951772e-07, + "loss": 0.0937, + "num_tokens": 15427862.0, + "reward": 0.793701171875, + "reward_std": 0.01791667938232422, + "rewards//mean": 0.793701171875, + "rewards//std": 0.025863181799650192, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2668, + "grad_norm": 1.7033449411392212, + "kl": 0.9704795069992542, + "learning_rate": 8.431977510430145e-07, + "loss": 0.097, + "num_tokens": 15439342.0, + "reward": 0.8135986328125, + "reward_std": 0.023381192237138748, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.031791478395462036, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.267, + "grad_norm": 1.9556063413619995, + "kl": 0.7525311335921288, + "learning_rate": 8.429669086508389e-07, + "loss": 0.0753, + "num_tokens": 15450998.0, + "reward": 0.8294677734375, + "reward_std": 0.02936321496963501, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.04053056612610817, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2672, + "grad_norm": 2.0872554779052734, + "kl": 0.7666615843772888, + "learning_rate": 8.427359281116333e-07, + "loss": 0.0767, + "num_tokens": 15462558.0, + "reward": 0.8463134765625, + "reward_std": 0.018406974151730537, + "rewards//mean": 0.8463134765625, + "rewards//std": 0.028656136244535446, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2674, + "grad_norm": 1.8707201480865479, + "kl": 0.9343790411949158, + "learning_rate": 8.42504809518437e-07, + "loss": 0.0934, + "num_tokens": 15474126.0, + "reward": 0.8607177734375, + "reward_std": 0.0281551331281662, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.03464072197675705, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2676, + "grad_norm": 2.554849863052368, + "kl": 1.1727466136217117, + "learning_rate": 8.422735529643443e-07, + "loss": 0.1173, + "num_tokens": 15485702.0, + "reward": 0.85076904296875, + "reward_std": 0.03343772515654564, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.03682966157793999, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2678, + "grad_norm": 2.392958641052246, + "kl": 0.8001222275197506, + "learning_rate": 8.420421585425055e-07, + "loss": 0.08, + "num_tokens": 15497198.0, + "reward": 0.8609619140625, + "reward_std": 0.03059377148747444, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.03780028596520424, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.268, + "grad_norm": 1.8967984914779663, + "kl": 1.0940633602440357, + "learning_rate": 8.41810626346126e-07, + "loss": 0.1094, + "num_tokens": 15508694.0, + "reward": 0.83447265625, + "reward_std": 0.025303777307271957, + "rewards//mean": 0.83447265625, + "rewards//std": 0.03392920643091202, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2682, + "grad_norm": 2.7476696968078613, + "kl": 1.0968676656484604, + "learning_rate": 8.415789564684673e-07, + "loss": 0.1097, + "num_tokens": 15520174.0, + "reward": 0.802001953125, + "reward_std": 0.024585789069533348, + "rewards//mean": 0.802001953125, + "rewards//std": 0.03109363093972206, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2684, + "grad_norm": 2.031879186630249, + "kl": 0.8367527052760124, + "learning_rate": 8.413471490028455e-07, + "loss": 0.0837, + "num_tokens": 15531782.0, + "reward": 0.86236572265625, + "reward_std": 0.024523239582777023, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.03283539041876793, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2686, + "grad_norm": 2.499013662338257, + "kl": 0.9151243343949318, + "learning_rate": 8.41115204042633e-07, + "loss": 0.0915, + "num_tokens": 15543310.0, + "reward": 0.832275390625, + "reward_std": 0.02698695845901966, + "rewards//mean": 0.832275390625, + "rewards//std": 0.03898724913597107, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2688, + "grad_norm": 1.7730753421783447, + "kl": 1.123116109520197, + "learning_rate": 8.408831216812573e-07, + "loss": 0.1123, + "num_tokens": 15554934.0, + "reward": 0.85284423828125, + "reward_std": 0.025472698733210564, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.030624117702245712, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.269, + "grad_norm": 2.996457815170288, + "kl": 1.3202580362558365, + "learning_rate": 8.406509020122008e-07, + "loss": 0.132, + "num_tokens": 15566510.0, + "reward": 0.83551025390625, + "reward_std": 0.018979806452989578, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.02001650631427765, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2692, + "grad_norm": 2.136343002319336, + "kl": 0.7854610607028008, + "learning_rate": 8.404185451290017e-07, + "loss": 0.0785, + "num_tokens": 15578150.0, + "reward": 0.84619140625, + "reward_std": 0.014826999977231026, + "rewards//mean": 0.84619140625, + "rewards//std": 0.02219006046652794, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2694, + "grad_norm": 2.4344322681427, + "kl": 1.1336115002632141, + "learning_rate": 8.401860511252533e-07, + "loss": 0.1134, + "num_tokens": 15589662.0, + "reward": 0.8480224609375, + "reward_std": 0.02620559372007847, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.0289629939943552, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2696, + "grad_norm": 2.201475143432617, + "kl": 1.2189518809318542, + "learning_rate": 8.399534200946043e-07, + "loss": 0.1219, + "num_tokens": 15601174.0, + "reward": 0.839111328125, + "reward_std": 0.02629643678665161, + "rewards//mean": 0.839111328125, + "rewards//std": 0.02859400026500225, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2698, + "grad_norm": 2.728212833404541, + "kl": 0.9196591451764107, + "learning_rate": 8.397206521307583e-07, + "loss": 0.092, + "num_tokens": 15612750.0, + "reward": 0.840087890625, + "reward_std": 0.025007935240864754, + "rewards//mean": 0.840087890625, + "rewards//std": 0.030487943440675735, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.27, + "grad_norm": 2.2097067832946777, + "kl": 1.1701239347457886, + "learning_rate": 8.394877473274741e-07, + "loss": 0.117, + "num_tokens": 15624374.0, + "reward": 0.80853271484375, + "reward_std": 0.020234843716025352, + "rewards//mean": 0.80853271484375, + "rewards//std": 0.026625635102391243, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2702, + "grad_norm": 4.000133037567139, + "kl": 0.9976790063083172, + "learning_rate": 8.392547057785661e-07, + "loss": 0.0998, + "num_tokens": 15635966.0, + "reward": 0.79248046875, + "reward_std": 0.015087291598320007, + "rewards//mean": 0.79248046875, + "rewards//std": 0.017281096428632736, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.2704, + "grad_norm": 1.86553156375885, + "kl": 1.1873038858175278, + "learning_rate": 8.39021527577903e-07, + "loss": 0.1158, + "num_tokens": 15647529.0, + "reward": 0.83697509765625, + "reward_std": 0.030428510159254074, + "rewards//mean": 0.83697509765625, + "rewards//std": 0.0370987169444561, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2706, + "grad_norm": 1.8978906869888306, + "kl": 0.9939531050622463, + "learning_rate": 8.387882128194092e-07, + "loss": 0.0994, + "num_tokens": 15659153.0, + "reward": 0.85638427734375, + "reward_std": 0.02904665470123291, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.039129048585891724, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2708, + "grad_norm": 1.8879302740097046, + "kl": 1.0486668832600117, + "learning_rate": 8.385547615970638e-07, + "loss": 0.1049, + "num_tokens": 15670769.0, + "reward": 0.8310546875, + "reward_std": 0.030161838978528976, + "rewards//mean": 0.8310546875, + "rewards//std": 0.03533834591507912, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.271, + "grad_norm": 2.1453301906585693, + "kl": 1.0261641144752502, + "learning_rate": 8.38321174004901e-07, + "loss": 0.1026, + "num_tokens": 15682329.0, + "reward": 0.8106689453125, + "reward_std": 0.018279360607266426, + "rewards//mean": 0.8106689453125, + "rewards//std": 0.024948880076408386, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2712, + "grad_norm": 2.0478174686431885, + "kl": 1.0907097160816193, + "learning_rate": 8.380874501370097e-07, + "loss": 0.1091, + "num_tokens": 15693889.0, + "reward": 0.85015869140625, + "reward_std": 0.026040218770503998, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.02809404395520687, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2714, + "grad_norm": 1.9564050436019897, + "kl": 1.0038989633321762, + "learning_rate": 8.378535900875338e-07, + "loss": 0.1004, + "num_tokens": 15705489.0, + "reward": 0.8084716796875, + "reward_std": 0.021665241569280624, + "rewards//mean": 0.8084716796875, + "rewards//std": 0.03421686962246895, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2716, + "grad_norm": 2.043379545211792, + "kl": 1.1334854401648045, + "learning_rate": 8.376195939506725e-07, + "loss": 0.1133, + "num_tokens": 15717161.0, + "reward": 0.814453125, + "reward_std": 0.03113923966884613, + "rewards//mean": 0.814453125, + "rewards//std": 0.03601720929145813, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2718, + "grad_norm": 2.6536941528320312, + "kl": 0.8515354059636593, + "learning_rate": 8.373854618206789e-07, + "loss": 0.0852, + "num_tokens": 15728801.0, + "reward": 0.83441162109375, + "reward_std": 0.02729075215756893, + "rewards//mean": 0.83441162109375, + "rewards//std": 0.029656296595931053, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.272, + "grad_norm": 2.296523332595825, + "kl": 0.7019383162260056, + "learning_rate": 8.371511937918617e-07, + "loss": 0.0702, + "num_tokens": 15740433.0, + "reward": 0.81781005859375, + "reward_std": 0.021144770085811615, + "rewards//mean": 0.81781005859375, + "rewards//std": 0.02505929209291935, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.2722, + "grad_norm": 2.0261340141296387, + "kl": 1.0614310204982758, + "learning_rate": 8.369167899585839e-07, + "loss": 0.1015, + "num_tokens": 15752046.0, + "reward": 0.85797119140625, + "reward_std": 0.02800038643181324, + "rewards//mean": 0.85797119140625, + "rewards//std": 0.03966161981225014, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2724, + "grad_norm": 1.950930118560791, + "kl": 0.9281257688999176, + "learning_rate": 8.366822504152636e-07, + "loss": 0.0928, + "num_tokens": 15763614.0, + "reward": 0.864013671875, + "reward_std": 0.030520617961883545, + "rewards//mean": 0.864013671875, + "rewards//std": 0.04158448055386543, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2726, + "grad_norm": 1.9489996433258057, + "kl": 0.7994485571980476, + "learning_rate": 8.364475752563728e-07, + "loss": 0.0799, + "num_tokens": 15775182.0, + "reward": 0.8236083984375, + "reward_std": 0.018605463206768036, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.025307917967438698, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2728, + "grad_norm": 1.9049121141433716, + "kl": 1.0386270694434643, + "learning_rate": 8.362127645764389e-07, + "loss": 0.1039, + "num_tokens": 15786694.0, + "reward": 0.81072998046875, + "reward_std": 0.029841039329767227, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.035193637013435364, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.273, + "grad_norm": 3.1806800365448, + "kl": 1.250419057905674, + "learning_rate": 8.359778184700439e-07, + "loss": 0.125, + "num_tokens": 15798278.0, + "reward": 0.81915283203125, + "reward_std": 0.015385371632874012, + "rewards//mean": 0.81915283203125, + "rewards//std": 0.019094569608569145, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2732, + "grad_norm": 2.569620370864868, + "kl": 0.6638820171356201, + "learning_rate": 8.357427370318238e-07, + "loss": 0.0664, + "num_tokens": 15809846.0, + "reward": 0.8663330078125, + "reward_std": 0.027756601572036743, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.03715890645980835, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2734, + "grad_norm": 2.0888004302978516, + "kl": 0.8210079595446587, + "learning_rate": 8.355075203564692e-07, + "loss": 0.0821, + "num_tokens": 15821454.0, + "reward": 0.83319091796875, + "reward_std": 0.019230201840400696, + "rewards//mean": 0.83319091796875, + "rewards//std": 0.023793181404471397, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2736, + "grad_norm": 2.063401222229004, + "kl": 0.9094007313251495, + "learning_rate": 8.352721685387256e-07, + "loss": 0.0909, + "num_tokens": 15833022.0, + "reward": 0.8145751953125, + "reward_std": 0.02393139898777008, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.030361317098140717, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2738, + "grad_norm": 2.14750599861145, + "kl": 0.7562221325933933, + "learning_rate": 8.350366816733926e-07, + "loss": 0.0756, + "num_tokens": 15844574.0, + "reward": 0.85797119140625, + "reward_std": 0.021821245551109314, + "rewards//mean": 0.85797119140625, + "rewards//std": 0.03083151765167713, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.274, + "grad_norm": 1.9507213830947876, + "kl": 0.8699524626135826, + "learning_rate": 8.348010598553243e-07, + "loss": 0.087, + "num_tokens": 15856190.0, + "reward": 0.82391357421875, + "reward_std": 0.022833596915006638, + "rewards//mean": 0.82391357421875, + "rewards//std": 0.03014785796403885, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2742, + "grad_norm": 3.2683441638946533, + "kl": 1.1032124310731888, + "learning_rate": 8.34565303179429e-07, + "loss": 0.1103, + "num_tokens": 15867878.0, + "reward": 0.8192138671875, + "reward_std": 0.017444290220737457, + "rewards//mean": 0.8192138671875, + "rewards//std": 0.023276448249816895, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2744, + "grad_norm": 2.481736660003662, + "kl": 1.1937672421336174, + "learning_rate": 8.343294117406698e-07, + "loss": 0.1194, + "num_tokens": 15879518.0, + "reward": 0.8614501953125, + "reward_std": 0.037791378796100616, + "rewards//mean": 0.8614501953125, + "rewards//std": 0.04408425837755203, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2746, + "grad_norm": 2.1579480171203613, + "kl": 1.1808425933122635, + "learning_rate": 8.340933856340635e-07, + "loss": 0.1181, + "num_tokens": 15891054.0, + "reward": 0.81463623046875, + "reward_std": 0.025059344246983528, + "rewards//mean": 0.81463623046875, + "rewards//std": 0.03524177893996239, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2748, + "grad_norm": 2.126662492752075, + "kl": 1.2408379092812538, + "learning_rate": 8.338572249546812e-07, + "loss": 0.1241, + "num_tokens": 15902694.0, + "reward": 0.82525634765625, + "reward_std": 0.028806058689951897, + "rewards//mean": 0.82525634765625, + "rewards//std": 0.0363135039806366, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.275, + "grad_norm": 1.8180687427520752, + "kl": 0.7153037711977959, + "learning_rate": 8.336209297976489e-07, + "loss": 0.0715, + "num_tokens": 15914270.0, + "reward": 0.85211181640625, + "reward_std": 0.019677957519888878, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.026501404121518135, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2752, + "grad_norm": 1.904836654663086, + "kl": 0.9495167210698128, + "learning_rate": 8.333845002581458e-07, + "loss": 0.095, + "num_tokens": 15925814.0, + "reward": 0.8583984375, + "reward_std": 0.02282162383198738, + "rewards//mean": 0.8583984375, + "rewards//std": 0.028881605714559555, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2754, + "grad_norm": 2.3523082733154297, + "kl": 0.8951255828142166, + "learning_rate": 8.331479364314059e-07, + "loss": 0.0895, + "num_tokens": 15937326.0, + "reward": 0.81475830078125, + "reward_std": 0.020255250856280327, + "rewards//mean": 0.81475830078125, + "rewards//std": 0.02134786732494831, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2756, + "grad_norm": 1.7464849948883057, + "kl": 1.008415162563324, + "learning_rate": 8.32911238412717e-07, + "loss": 0.1008, + "num_tokens": 15948870.0, + "reward": 0.8284912109375, + "reward_std": 0.02092229202389717, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.030810706317424774, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2758, + "grad_norm": 2.3378422260284424, + "kl": 1.1145086735486984, + "learning_rate": 8.326744062974211e-07, + "loss": 0.1115, + "num_tokens": 15960494.0, + "reward": 0.78302001953125, + "reward_std": 0.017520878463983536, + "rewards//mean": 0.78302001953125, + "rewards//std": 0.02018294855952263, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.276, + "grad_norm": 1.7414829730987549, + "kl": 0.9360692463815212, + "learning_rate": 8.324374401809142e-07, + "loss": 0.0936, + "num_tokens": 15972014.0, + "reward": 0.86456298828125, + "reward_std": 0.02514760196208954, + "rewards//mean": 0.86456298828125, + "rewards//std": 0.0316891185939312, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2762, + "grad_norm": 2.0551576614379883, + "kl": 0.937575489282608, + "learning_rate": 8.322003401586461e-07, + "loss": 0.0938, + "num_tokens": 15983582.0, + "reward": 0.8604736328125, + "reward_std": 0.03415454924106598, + "rewards//mean": 0.8604736328125, + "rewards//std": 0.034670423716306686, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2764, + "grad_norm": 2.317769765853882, + "kl": 1.223831158131361, + "learning_rate": 8.319631063261207e-07, + "loss": 0.1224, + "num_tokens": 15995118.0, + "reward": 0.858154296875, + "reward_std": 0.02796752378344536, + "rewards//mean": 0.858154296875, + "rewards//std": 0.0285516157746315, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2766, + "grad_norm": 2.5948920249938965, + "kl": 0.8881645761430264, + "learning_rate": 8.317257387788958e-07, + "loss": 0.0888, + "num_tokens": 16006622.0, + "reward": 0.85003662109375, + "reward_std": 0.031165126711130142, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.039190128445625305, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2768, + "grad_norm": 2.1784982681274414, + "kl": 0.647545799612999, + "learning_rate": 8.314882376125831e-07, + "loss": 0.0648, + "num_tokens": 16018238.0, + "reward": 0.85675048828125, + "reward_std": 0.024668702855706215, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.03605790063738823, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.277, + "grad_norm": 1.922249674797058, + "kl": 0.6403172239661217, + "learning_rate": 8.312506029228477e-07, + "loss": 0.064, + "num_tokens": 16029862.0, + "reward": 0.7835693359375, + "reward_std": 0.02044534869492054, + "rewards//mean": 0.7835693359375, + "rewards//std": 0.02698010392487049, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2772, + "grad_norm": 1.9084663391113281, + "kl": 0.9563981741666794, + "learning_rate": 8.310128348054093e-07, + "loss": 0.0956, + "num_tokens": 16041430.0, + "reward": 0.8311767578125, + "reward_std": 0.015224622562527657, + "rewards//mean": 0.8311767578125, + "rewards//std": 0.020966242998838425, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2774, + "grad_norm": 2.2589263916015625, + "kl": 0.7609677575528622, + "learning_rate": 8.307749333560404e-07, + "loss": 0.0761, + "num_tokens": 16052966.0, + "reward": 0.81524658203125, + "reward_std": 0.016985023394227028, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.02085285261273384, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2776, + "grad_norm": 1.8726993799209595, + "kl": 1.0962602868676186, + "learning_rate": 8.305368986705681e-07, + "loss": 0.1096, + "num_tokens": 16064470.0, + "reward": 0.85113525390625, + "reward_std": 0.02939489297568798, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.03363969549536705, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2778, + "grad_norm": 1.754748821258545, + "kl": 1.1600496172904968, + "learning_rate": 8.302987308448723e-07, + "loss": 0.116, + "num_tokens": 16076022.0, + "reward": 0.85894775390625, + "reward_std": 0.023200303316116333, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.02697470411658287, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.278, + "grad_norm": 1.950179934501648, + "kl": 0.8200961500406265, + "learning_rate": 8.300604299748874e-07, + "loss": 0.082, + "num_tokens": 16087726.0, + "reward": 0.86273193359375, + "reward_std": 0.03015493042767048, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.036781538277864456, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2782, + "grad_norm": 1.8805323839187622, + "kl": 0.768724799156189, + "learning_rate": 8.298219961566008e-07, + "loss": 0.0769, + "num_tokens": 16099326.0, + "reward": 0.84027099609375, + "reward_std": 0.018914636224508286, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02490173652768135, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2784, + "grad_norm": 2.158843755722046, + "kl": 0.985366053879261, + "learning_rate": 8.295834294860534e-07, + "loss": 0.0985, + "num_tokens": 16110902.0, + "reward": 0.7772216796875, + "reward_std": 0.01946105808019638, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.025103727355599403, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2786, + "grad_norm": 2.5753750801086426, + "kl": 1.2266412302851677, + "learning_rate": 8.293447300593402e-07, + "loss": 0.1227, + "num_tokens": 16122486.0, + "reward": 0.82354736328125, + "reward_std": 0.03179870545864105, + "rewards//mean": 0.82354736328125, + "rewards//std": 0.039146456867456436, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2788, + "grad_norm": 2.0162196159362793, + "kl": 0.8771979138255119, + "learning_rate": 8.291058979726091e-07, + "loss": 0.0877, + "num_tokens": 16133990.0, + "reward": 0.84075927734375, + "reward_std": 0.02191251888871193, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.025197844952344894, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.279, + "grad_norm": 2.688236713409424, + "kl": 1.0257392525672913, + "learning_rate": 8.288669333220614e-07, + "loss": 0.1026, + "num_tokens": 16145502.0, + "reward": 0.80242919921875, + "reward_std": 0.01869981735944748, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.021154114976525307, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2792, + "grad_norm": 2.108290672302246, + "kl": 0.953646145761013, + "learning_rate": 8.286278362039527e-07, + "loss": 0.0954, + "num_tokens": 16157118.0, + "reward": 0.864990234375, + "reward_std": 0.024742048233747482, + "rewards//mean": 0.864990234375, + "rewards//std": 0.03733711317181587, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2794, + "grad_norm": 2.173569917678833, + "kl": 1.133690945804119, + "learning_rate": 8.283886067145906e-07, + "loss": 0.1134, + "num_tokens": 16168662.0, + "reward": 0.85748291015625, + "reward_std": 0.025370456278324127, + "rewards//mean": 0.85748291015625, + "rewards//std": 0.036265116184949875, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2796, + "grad_norm": 2.5567378997802734, + "kl": 0.973927803337574, + "learning_rate": 8.281492449503372e-07, + "loss": 0.0974, + "num_tokens": 16180238.0, + "reward": 0.84967041015625, + "reward_std": 0.024923451244831085, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.025581805035471916, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2798, + "grad_norm": 1.7988083362579346, + "kl": 1.0407603718340397, + "learning_rate": 8.279097510076069e-07, + "loss": 0.1041, + "num_tokens": 16191854.0, + "reward": 0.8468017578125, + "reward_std": 0.023751862347126007, + "rewards//mean": 0.8468017578125, + "rewards//std": 0.032862868160009384, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.28, + "grad_norm": 2.242676258087158, + "kl": 0.8817980289459229, + "learning_rate": 8.276701249828684e-07, + "loss": 0.0882, + "num_tokens": 16203398.0, + "reward": 0.77642822265625, + "reward_std": 0.024481140077114105, + "rewards//mean": 0.77642822265625, + "rewards//std": 0.03725182265043259, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2802, + "grad_norm": 2.306814193725586, + "kl": 0.9041531383991241, + "learning_rate": 8.274303669726426e-07, + "loss": 0.0904, + "num_tokens": 16214966.0, + "reward": 0.87030029296875, + "reward_std": 0.017981240525841713, + "rewards//mean": 0.87030029296875, + "rewards//std": 0.024460753425955772, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2804, + "grad_norm": 2.100013017654419, + "kl": 1.0972331017255783, + "learning_rate": 8.271904770735041e-07, + "loss": 0.1097, + "num_tokens": 16226534.0, + "reward": 0.8424072265625, + "reward_std": 0.027327045798301697, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.031096309423446655, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2806, + "grad_norm": 1.9116203784942627, + "kl": 0.759739063680172, + "learning_rate": 8.269504553820805e-07, + "loss": 0.076, + "num_tokens": 16238102.0, + "reward": 0.8212890625, + "reward_std": 0.02250491827726364, + "rewards//mean": 0.8212890625, + "rewards//std": 0.0312344953417778, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2808, + "grad_norm": 1.6654787063598633, + "kl": 0.7314506061375141, + "learning_rate": 8.267103019950528e-07, + "loss": 0.0731, + "num_tokens": 16249638.0, + "reward": 0.84539794921875, + "reward_std": 0.01944996416568756, + "rewards//mean": 0.84539794921875, + "rewards//std": 0.0236002616584301, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.281, + "grad_norm": 2.174598217010498, + "kl": 0.7477049455046654, + "learning_rate": 8.264700170091543e-07, + "loss": 0.0748, + "num_tokens": 16261142.0, + "reward": 0.8157958984375, + "reward_std": 0.01541058998554945, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.034395139664411545, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2812, + "grad_norm": 2.2756242752075195, + "kl": 1.0452504344284534, + "learning_rate": 8.262296005211721e-07, + "loss": 0.1045, + "num_tokens": 16272662.0, + "reward": 0.81268310546875, + "reward_std": 0.014798358082771301, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.018797343596816063, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2814, + "grad_norm": 2.6187784671783447, + "kl": 0.98514574021101, + "learning_rate": 8.259890526279459e-07, + "loss": 0.0985, + "num_tokens": 16284302.0, + "reward": 0.80242919921875, + "reward_std": 0.020262975245714188, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.029796335846185684, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2816, + "grad_norm": 2.5678088665008545, + "kl": 1.5318112671375275, + "learning_rate": 8.257483734263681e-07, + "loss": 0.1532, + "num_tokens": 16295894.0, + "reward": 0.85980224609375, + "reward_std": 0.021978463977575302, + "rewards//mean": 0.85980224609375, + "rewards//std": 0.0334012545645237, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2818, + "grad_norm": 1.93360435962677, + "kl": 0.7821978144347668, + "learning_rate": 8.255075630133845e-07, + "loss": 0.0782, + "num_tokens": 16307414.0, + "reward": 0.85809326171875, + "reward_std": 0.016746345907449722, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.021339356899261475, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.282, + "grad_norm": 2.580477714538574, + "kl": 1.1813295483589172, + "learning_rate": 8.252666214859934e-07, + "loss": 0.1181, + "num_tokens": 16319030.0, + "reward": 0.81134033203125, + "reward_std": 0.030645307153463364, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.03952895849943161, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2822, + "grad_norm": 1.9167613983154297, + "kl": 1.2526995688676834, + "learning_rate": 8.250255489412462e-07, + "loss": 0.1253, + "num_tokens": 16330710.0, + "reward": 0.8277587890625, + "reward_std": 0.029572494328022003, + "rewards//mean": 0.8277587890625, + "rewards//std": 0.042271003127098083, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2824, + "grad_norm": 2.1131751537323, + "kl": 1.3475144505500793, + "learning_rate": 8.247843454762466e-07, + "loss": 0.1348, + "num_tokens": 16342222.0, + "reward": 0.87408447265625, + "reward_std": 0.034903571009635925, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.03914761543273926, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2826, + "grad_norm": 1.8033530712127686, + "kl": 0.99809730052948, + "learning_rate": 8.245430111881517e-07, + "loss": 0.0998, + "num_tokens": 16353846.0, + "reward": 0.86077880859375, + "reward_std": 0.023976463824510574, + "rewards//mean": 0.86077880859375, + "rewards//std": 0.03089429996907711, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2828, + "grad_norm": 1.9626859426498413, + "kl": 1.1048970818519592, + "learning_rate": 8.243015461741706e-07, + "loss": 0.1105, + "num_tokens": 16365478.0, + "reward": 0.8504638671875, + "reward_std": 0.026941265910863876, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.03384321182966232, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.283, + "grad_norm": 1.930812120437622, + "kl": 1.0649839267134666, + "learning_rate": 8.240599505315654e-07, + "loss": 0.1065, + "num_tokens": 16377102.0, + "reward": 0.84674072265625, + "reward_std": 0.032658644020557404, + "rewards//mean": 0.84674072265625, + "rewards//std": 0.03810675069689751, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2832, + "grad_norm": 1.8389490842819214, + "kl": 0.9379938766360283, + "learning_rate": 8.238182243576511e-07, + "loss": 0.0938, + "num_tokens": 16388670.0, + "reward": 0.8619384765625, + "reward_std": 0.035044632852077484, + "rewards//mean": 0.8619384765625, + "rewards//std": 0.03912581130862236, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2834, + "grad_norm": 2.006347417831421, + "kl": 1.2944551296532154, + "learning_rate": 8.235763677497945e-07, + "loss": 0.1294, + "num_tokens": 16400174.0, + "reward": 0.85455322265625, + "reward_std": 0.029920613393187523, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.03961503133177757, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2836, + "grad_norm": 1.946550726890564, + "kl": 1.170253373682499, + "learning_rate": 8.233343808054157e-07, + "loss": 0.117, + "num_tokens": 16411774.0, + "reward": 0.8385009765625, + "reward_std": 0.022858571261167526, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.03225098177790642, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2838, + "grad_norm": 2.3576905727386475, + "kl": 1.0994434207677841, + "learning_rate": 8.23092263621987e-07, + "loss": 0.1099, + "num_tokens": 16423374.0, + "reward": 0.82666015625, + "reward_std": 0.023033365607261658, + "rewards//mean": 0.82666015625, + "rewards//std": 0.027982894331216812, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.284, + "grad_norm": 1.7095534801483154, + "kl": 0.9898534119129181, + "learning_rate": 8.228500162970332e-07, + "loss": 0.0822, + "num_tokens": 16434962.0, + "reward": 0.84808349609375, + "reward_std": 0.025041526183485985, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.036776602268218994, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2842, + "grad_norm": 1.886306643486023, + "kl": 0.9893413186073303, + "learning_rate": 8.226076389281314e-07, + "loss": 0.0989, + "num_tokens": 16446602.0, + "reward": 0.8489990234375, + "reward_std": 0.03090938925743103, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.03653941676020622, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2844, + "grad_norm": 2.4053452014923096, + "kl": 1.2394032925367355, + "learning_rate": 8.223651316129114e-07, + "loss": 0.1239, + "num_tokens": 16458170.0, + "reward": 0.82073974609375, + "reward_std": 0.022349392995238304, + "rewards//mean": 0.82073974609375, + "rewards//std": 0.02766619436442852, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2846, + "grad_norm": 2.3515782356262207, + "kl": 1.0104362964630127, + "learning_rate": 8.221224944490548e-07, + "loss": 0.101, + "num_tokens": 16469770.0, + "reward": 0.8685302734375, + "reward_std": 0.028584390878677368, + "rewards//mean": 0.8685302734375, + "rewards//std": 0.04958781227469444, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2848, + "grad_norm": 2.3165595531463623, + "kl": 1.0468768291175365, + "learning_rate": 8.21879727534296e-07, + "loss": 0.1047, + "num_tokens": 16481330.0, + "reward": 0.77130126953125, + "reward_std": 0.014838572591543198, + "rewards//mean": 0.77130126953125, + "rewards//std": 0.02633695863187313, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.285, + "grad_norm": 2.2237391471862793, + "kl": 0.9176875427365303, + "learning_rate": 8.216368309664213e-07, + "loss": 0.0918, + "num_tokens": 16492794.0, + "reward": 0.83758544921875, + "reward_std": 0.028250547125935555, + "rewards//mean": 0.83758544921875, + "rewards//std": 0.03767247125506401, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2852, + "grad_norm": 1.9471272230148315, + "kl": 1.089765451848507, + "learning_rate": 8.213938048432696e-07, + "loss": 0.109, + "num_tokens": 16504402.0, + "reward": 0.82525634765625, + "reward_std": 0.029324717819690704, + "rewards//mean": 0.82525634765625, + "rewards//std": 0.031244127079844475, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2854, + "grad_norm": 1.8422468900680542, + "kl": 1.4214666783809662, + "learning_rate": 8.211506492627318e-07, + "loss": 0.1421, + "num_tokens": 16515978.0, + "reward": 0.84246826171875, + "reward_std": 0.038484975695610046, + "rewards//mean": 0.84246826171875, + "rewards//std": 0.045073557645082474, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2856, + "grad_norm": 2.2490005493164062, + "kl": 0.9342833980917931, + "learning_rate": 8.209073643227509e-07, + "loss": 0.0934, + "num_tokens": 16527578.0, + "reward": 0.79730224609375, + "reward_std": 0.013810960575938225, + "rewards//mean": 0.79730224609375, + "rewards//std": 0.022521764039993286, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2858, + "grad_norm": 1.8799141645431519, + "kl": 0.7605405449867249, + "learning_rate": 8.206639501213219e-07, + "loss": 0.0761, + "num_tokens": 16539242.0, + "reward": 0.8240966796875, + "reward_std": 0.019297946244478226, + "rewards//mean": 0.8240966796875, + "rewards//std": 0.021575426682829857, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.286, + "grad_norm": 2.5045382976531982, + "kl": 1.2337428629398346, + "learning_rate": 8.204204067564924e-07, + "loss": 0.1234, + "num_tokens": 16550802.0, + "reward": 0.84820556640625, + "reward_std": 0.02431846782565117, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.03365229442715645, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2862, + "grad_norm": 2.5241730213165283, + "kl": 1.0584186278283596, + "learning_rate": 8.201767343263611e-07, + "loss": 0.1058, + "num_tokens": 16562418.0, + "reward": 0.83489990234375, + "reward_std": 0.018866239115595818, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.025154555216431618, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2864, + "grad_norm": 1.9687682390213013, + "kl": 0.7741382829844952, + "learning_rate": 8.199329329290796e-07, + "loss": 0.0774, + "num_tokens": 16574082.0, + "reward": 0.85772705078125, + "reward_std": 0.017693841829895973, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.027106259018182755, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2866, + "grad_norm": 1.9599382877349854, + "kl": 0.8949921354651451, + "learning_rate": 8.19689002662851e-07, + "loss": 0.0895, + "num_tokens": 16585578.0, + "reward": 0.85498046875, + "reward_std": 0.027076557278633118, + "rewards//mean": 0.85498046875, + "rewards//std": 0.0361882820725441, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2868, + "grad_norm": 2.6066532135009766, + "kl": 0.8619443438947201, + "learning_rate": 8.194449436259303e-07, + "loss": 0.0862, + "num_tokens": 16597090.0, + "reward": 0.8284912109375, + "reward_std": 0.014097953215241432, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.017298169434070587, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.287, + "grad_norm": 2.132068395614624, + "kl": 1.1523170918226242, + "learning_rate": 8.192007559166247e-07, + "loss": 0.1152, + "num_tokens": 16608658.0, + "reward": 0.81378173828125, + "reward_std": 0.020632395520806313, + "rewards//mean": 0.81378173828125, + "rewards//std": 0.02379699796438217, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2872, + "grad_norm": 2.6631906032562256, + "kl": 1.0365860313177109, + "learning_rate": 8.189564396332926e-07, + "loss": 0.1037, + "num_tokens": 16620234.0, + "reward": 0.85443115234375, + "reward_std": 0.030493225902318954, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.0375593900680542, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2874, + "grad_norm": 3.0325543880462646, + "kl": 0.9282132498919964, + "learning_rate": 8.187119948743449e-07, + "loss": 0.0928, + "num_tokens": 16631858.0, + "reward": 0.85064697265625, + "reward_std": 0.031226031482219696, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.03751099482178688, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2876, + "grad_norm": 2.3985865116119385, + "kl": 0.8166398145258427, + "learning_rate": 8.184674217382437e-07, + "loss": 0.0817, + "num_tokens": 16643482.0, + "reward": 0.84619140625, + "reward_std": 0.01665816828608513, + "rewards//mean": 0.84619140625, + "rewards//std": 0.02585030160844326, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2878, + "grad_norm": 2.1212661266326904, + "kl": 0.8436030000448227, + "learning_rate": 8.182227203235031e-07, + "loss": 0.0844, + "num_tokens": 16655138.0, + "reward": 0.8795166015625, + "reward_std": 0.022569436579942703, + "rewards//mean": 0.8795166015625, + "rewards//std": 0.025734975934028625, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.288, + "grad_norm": 2.005990982055664, + "kl": 1.4833500906825066, + "learning_rate": 8.179778907286887e-07, + "loss": 0.1483, + "num_tokens": 16666746.0, + "reward": 0.821044921875, + "reward_std": 0.027193710207939148, + "rewards//mean": 0.821044921875, + "rewards//std": 0.03452626243233681, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2882, + "grad_norm": 2.9394702911376953, + "kl": 1.2375601939857006, + "learning_rate": 8.177329330524181e-07, + "loss": 0.1238, + "num_tokens": 16678658.0, + "reward": 0.8349609375, + "reward_std": 0.030077556148171425, + "rewards//mean": 0.8349609375, + "rewards//std": 0.03946346789598465, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2884, + "grad_norm": 2.221727132797241, + "kl": 1.2168191522359848, + "learning_rate": 8.1748784739336e-07, + "loss": 0.1217, + "num_tokens": 16690226.0, + "reward": 0.84814453125, + "reward_std": 0.03045974299311638, + "rewards//mean": 0.84814453125, + "rewards//std": 0.03238049894571304, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2886, + "grad_norm": 2.0130879878997803, + "kl": 0.939220666885376, + "learning_rate": 8.17242633850235e-07, + "loss": 0.0939, + "num_tokens": 16701786.0, + "reward": 0.8568115234375, + "reward_std": 0.019128844141960144, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.02646803855895996, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2888, + "grad_norm": 2.136218786239624, + "kl": 0.9767297692596912, + "learning_rate": 8.16997292521815e-07, + "loss": 0.0977, + "num_tokens": 16713362.0, + "reward": 0.86090087890625, + "reward_std": 0.02481190115213394, + "rewards//mean": 0.86090087890625, + "rewards//std": 0.03717290610074997, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.289, + "grad_norm": 2.016977071762085, + "kl": 1.069300390779972, + "learning_rate": 8.167518235069234e-07, + "loss": 0.1069, + "num_tokens": 16724930.0, + "reward": 0.831787109375, + "reward_std": 0.03233769163489342, + "rewards//mean": 0.831787109375, + "rewards//std": 0.04204784706234932, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2892, + "grad_norm": 2.434346914291382, + "kl": 0.8995243571698666, + "learning_rate": 8.165062269044352e-07, + "loss": 0.09, + "num_tokens": 16736402.0, + "reward": 0.77093505859375, + "reward_std": 0.015386145561933517, + "rewards//mean": 0.77093505859375, + "rewards//std": 0.01873200014233589, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2894, + "grad_norm": 2.4338388442993164, + "kl": 1.0875799506902695, + "learning_rate": 8.162605028132768e-07, + "loss": 0.1088, + "num_tokens": 16748026.0, + "reward": 0.847412109375, + "reward_std": 0.029974959790706635, + "rewards//mean": 0.847412109375, + "rewards//std": 0.03676842153072357, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2896, + "grad_norm": 2.4305660724639893, + "kl": 1.161112766712904, + "learning_rate": 8.160146513324254e-07, + "loss": 0.1161, + "num_tokens": 16759634.0, + "reward": 0.8370361328125, + "reward_std": 0.018154822289943695, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.021681809797883034, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2898, + "grad_norm": 2.2185611724853516, + "kl": 1.256595328450203, + "learning_rate": 8.157686725609105e-07, + "loss": 0.1257, + "num_tokens": 16771202.0, + "reward": 0.80657958984375, + "reward_std": 0.01749875396490097, + "rewards//mean": 0.80657958984375, + "rewards//std": 0.023292293772101402, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.29, + "grad_norm": 2.3227732181549072, + "kl": 0.733618076890707, + "learning_rate": 8.155225665978118e-07, + "loss": 0.0734, + "num_tokens": 16782842.0, + "reward": 0.84039306640625, + "reward_std": 0.024509437382221222, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.03693268448114395, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2902, + "grad_norm": 2.334460973739624, + "kl": 0.9158366546034813, + "learning_rate": 8.152763335422612e-07, + "loss": 0.0916, + "num_tokens": 16794354.0, + "reward": 0.87249755859375, + "reward_std": 0.029864545911550522, + "rewards//mean": 0.87249755859375, + "rewards//std": 0.043074969202280045, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2904, + "grad_norm": 4.302100658416748, + "kl": 1.3051862195134163, + "learning_rate": 8.150299734934412e-07, + "loss": 0.1305, + "num_tokens": 16805938.0, + "reward": 0.735107421875, + "reward_std": 0.017423976212739944, + "rewards//mean": 0.735107421875, + "rewards//std": 0.027751486748456955, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2906, + "grad_norm": 2.086300849914551, + "kl": 1.5422799289226532, + "learning_rate": 8.147834865505853e-07, + "loss": 0.1542, + "num_tokens": 16817458.0, + "reward": 0.85406494140625, + "reward_std": 0.033168986439704895, + "rewards//mean": 0.85406494140625, + "rewards//std": 0.0467112697660923, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2908, + "grad_norm": 2.262880563735962, + "kl": 1.312078133225441, + "learning_rate": 8.145368728129789e-07, + "loss": 0.1312, + "num_tokens": 16828954.0, + "reward": 0.8084716796875, + "reward_std": 0.017811456695199013, + "rewards//mean": 0.8084716796875, + "rewards//std": 0.02630050666630268, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.291, + "grad_norm": 1.75749671459198, + "kl": 1.1861493960022926, + "learning_rate": 8.142901323799577e-07, + "loss": 0.1186, + "num_tokens": 16840530.0, + "reward": 0.84637451171875, + "reward_std": 0.03635651618242264, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.04515945166349411, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2912, + "grad_norm": 8.514933586120605, + "kl": 1.7748880833387375, + "learning_rate": 8.140432653509087e-07, + "loss": 0.1775, + "num_tokens": 16852122.0, + "reward": 0.80035400390625, + "reward_std": 0.014324352145195007, + "rewards//mean": 0.80035400390625, + "rewards//std": 0.023572662845253944, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2914, + "grad_norm": 2.068279266357422, + "kl": 1.1292577609419823, + "learning_rate": 8.1379627182527e-07, + "loss": 0.1129, + "num_tokens": 16863682.0, + "reward": 0.85980224609375, + "reward_std": 0.030251411721110344, + "rewards//mean": 0.85980224609375, + "rewards//std": 0.03773229569196701, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2916, + "grad_norm": 2.2378485202789307, + "kl": 1.1048205383121967, + "learning_rate": 8.135491519025306e-07, + "loss": 0.1105, + "num_tokens": 16875330.0, + "reward": 0.8499755859375, + "reward_std": 0.02969418838620186, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.03644317761063576, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2918, + "grad_norm": 1.8486108779907227, + "kl": 1.1175079494714737, + "learning_rate": 8.133019056822302e-07, + "loss": 0.1118, + "num_tokens": 16886930.0, + "reward": 0.8487548828125, + "reward_std": 0.02060743048787117, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.02898598089814186, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.292, + "grad_norm": 2.1832685470581055, + "kl": 1.2155062407255173, + "learning_rate": 8.130545332639597e-07, + "loss": 0.1216, + "num_tokens": 16898506.0, + "reward": 0.86151123046875, + "reward_std": 0.03994078189134598, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.04408404231071472, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2922, + "grad_norm": 2.020132541656494, + "kl": 1.3670612275600433, + "learning_rate": 8.128070347473608e-07, + "loss": 0.1367, + "num_tokens": 16910170.0, + "reward": 0.85577392578125, + "reward_std": 0.033758051693439484, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.03688962012529373, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2924, + "grad_norm": 2.3412957191467285, + "kl": 0.7962788119912148, + "learning_rate": 8.125594102321255e-07, + "loss": 0.0796, + "num_tokens": 16921730.0, + "reward": 0.8551025390625, + "reward_std": 0.029022160917520523, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.03569784387946129, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2926, + "grad_norm": 1.7280937433242798, + "kl": 1.020878717303276, + "learning_rate": 8.123116598179971e-07, + "loss": 0.1021, + "num_tokens": 16933394.0, + "reward": 0.87158203125, + "reward_std": 0.02756357565522194, + "rewards//mean": 0.87158203125, + "rewards//std": 0.03775400295853615, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2928, + "grad_norm": 1.882761836051941, + "kl": 1.2338461205363274, + "learning_rate": 8.120637836047697e-07, + "loss": 0.1234, + "num_tokens": 16944906.0, + "reward": 0.86474609375, + "reward_std": 0.030324004590511322, + "rewards//mean": 0.86474609375, + "rewards//std": 0.03664056956768036, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.293, + "grad_norm": 2.4715185165405273, + "kl": 1.2167891077697277, + "learning_rate": 8.118157816922874e-07, + "loss": 0.1217, + "num_tokens": 16956514.0, + "reward": 0.8194580078125, + "reward_std": 0.023027483373880386, + "rewards//mean": 0.8194580078125, + "rewards//std": 0.02882678247988224, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2932, + "grad_norm": 2.3432443141937256, + "kl": 1.6112428829073906, + "learning_rate": 8.115676541804455e-07, + "loss": 0.1611, + "num_tokens": 16968058.0, + "reward": 0.83770751953125, + "reward_std": 0.03683687746524811, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.03725304454565048, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2934, + "grad_norm": 2.1020567417144775, + "kl": 1.3196452856063843, + "learning_rate": 8.113194011691899e-07, + "loss": 0.132, + "num_tokens": 16979578.0, + "reward": 0.832275390625, + "reward_std": 0.026909932494163513, + "rewards//mean": 0.832275390625, + "rewards//std": 0.02909778617322445, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2936, + "grad_norm": 2.379180908203125, + "kl": 1.6255786046385765, + "learning_rate": 8.110710227585167e-07, + "loss": 0.1626, + "num_tokens": 16991106.0, + "reward": 0.83740234375, + "reward_std": 0.03270108252763748, + "rewards//mean": 0.83740234375, + "rewards//std": 0.03638851270079613, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2938, + "grad_norm": 1.7323822975158691, + "kl": 0.9775112718343735, + "learning_rate": 8.108225190484726e-07, + "loss": 0.0978, + "num_tokens": 17002722.0, + "reward": 0.8782958984375, + "reward_std": 0.031038349494338036, + "rewards//mean": 0.8782958984375, + "rewards//std": 0.03379128500819206, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.294, + "grad_norm": 2.5161144733428955, + "kl": 1.267770454287529, + "learning_rate": 8.105738901391551e-07, + "loss": 0.1268, + "num_tokens": 17014410.0, + "reward": 0.84185791015625, + "reward_std": 0.03340717405080795, + "rewards//mean": 0.84185791015625, + "rewards//std": 0.037866052240133286, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2942, + "grad_norm": 2.139716863632202, + "kl": 1.2968077212572098, + "learning_rate": 8.103251361307118e-07, + "loss": 0.1297, + "num_tokens": 17026026.0, + "reward": 0.845458984375, + "reward_std": 0.03006110154092312, + "rewards//mean": 0.845458984375, + "rewards//std": 0.04551641643047333, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2944, + "grad_norm": 1.9881742000579834, + "kl": 1.0550460368394852, + "learning_rate": 8.100762571233408e-07, + "loss": 0.1055, + "num_tokens": 17037570.0, + "reward": 0.8248291015625, + "reward_std": 0.028485849499702454, + "rewards//mean": 0.8248291015625, + "rewards//std": 0.03845773637294769, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2946, + "grad_norm": 2.2026479244232178, + "kl": 1.4027185812592506, + "learning_rate": 8.098272532172905e-07, + "loss": 0.1403, + "num_tokens": 17049322.0, + "reward": 0.84906005859375, + "reward_std": 0.02615469880402088, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.03469066694378853, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2948, + "grad_norm": 2.9873063564300537, + "kl": 1.1423545069992542, + "learning_rate": 8.095781245128597e-07, + "loss": 0.1142, + "num_tokens": 17060978.0, + "reward": 0.83428955078125, + "reward_std": 0.021706942468881607, + "rewards//mean": 0.83428955078125, + "rewards//std": 0.03370218724012375, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.295, + "grad_norm": 2.2435736656188965, + "kl": 1.3106045052409172, + "learning_rate": 8.093288711103971e-07, + "loss": 0.1311, + "num_tokens": 17072474.0, + "reward": 0.87451171875, + "reward_std": 0.03685038536787033, + "rewards//mean": 0.87451171875, + "rewards//std": 0.04838276654481888, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2952, + "grad_norm": 2.349391222000122, + "kl": 1.3284645900130272, + "learning_rate": 8.090794931103026e-07, + "loss": 0.1328, + "num_tokens": 17084026.0, + "reward": 0.85931396484375, + "reward_std": 0.03722620755434036, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.0439164973795414, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2954, + "grad_norm": 1.8001118898391724, + "kl": 0.6358111277222633, + "learning_rate": 8.08829990613025e-07, + "loss": 0.0636, + "num_tokens": 17095562.0, + "reward": 0.86175537109375, + "reward_std": 0.027497775852680206, + "rewards//mean": 0.86175537109375, + "rewards//std": 0.04508732259273529, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2956, + "grad_norm": 1.8509376049041748, + "kl": 0.8014605976641178, + "learning_rate": 8.085803637190643e-07, + "loss": 0.0801, + "num_tokens": 17107130.0, + "reward": 0.81719970703125, + "reward_std": 0.015959536656737328, + "rewards//mean": 0.81719970703125, + "rewards//std": 0.028272902593016624, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2958, + "grad_norm": 2.06424617767334, + "kl": 0.9010571576654911, + "learning_rate": 8.083306125289697e-07, + "loss": 0.0901, + "num_tokens": 17118594.0, + "reward": 0.8829345703125, + "reward_std": 0.021559372544288635, + "rewards//mean": 0.8829345703125, + "rewards//std": 0.02559104934334755, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.296, + "grad_norm": 1.9997092485427856, + "kl": 1.1511977054178715, + "learning_rate": 8.080807371433414e-07, + "loss": 0.1151, + "num_tokens": 17130146.0, + "reward": 0.82110595703125, + "reward_std": 0.024857841432094574, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.029431361705064774, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2962, + "grad_norm": 2.3035154342651367, + "kl": 0.6690607070922852, + "learning_rate": 8.07830737662829e-07, + "loss": 0.0669, + "num_tokens": 17141746.0, + "reward": 0.868896484375, + "reward_std": 0.028142042458057404, + "rewards//mean": 0.868896484375, + "rewards//std": 0.04160194844007492, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2964, + "grad_norm": 2.76484751701355, + "kl": 1.6012582182884216, + "learning_rate": 8.075806141881325e-07, + "loss": 0.1601, + "num_tokens": 17153418.0, + "reward": 0.84417724609375, + "reward_std": 0.041417546570301056, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.04666847363114357, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2966, + "grad_norm": 2.752030611038208, + "kl": 1.2514798119664192, + "learning_rate": 8.073303668200011e-07, + "loss": 0.1251, + "num_tokens": 17165042.0, + "reward": 0.8515625, + "reward_std": 0.03050164505839348, + "rewards//mean": 0.8515625, + "rewards//std": 0.03611793741583824, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2968, + "grad_norm": 1.9448866844177246, + "kl": 1.153668139129877, + "learning_rate": 8.070799956592349e-07, + "loss": 0.1154, + "num_tokens": 17176538.0, + "reward": 0.83502197265625, + "reward_std": 0.023606613278388977, + "rewards//mean": 0.83502197265625, + "rewards//std": 0.026427043601870537, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.297, + "grad_norm": 1.8997888565063477, + "kl": 0.8650849722325802, + "learning_rate": 8.06829500806683e-07, + "loss": 0.0865, + "num_tokens": 17188114.0, + "reward": 0.83331298828125, + "reward_std": 0.02721722051501274, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.03971463814377785, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2972, + "grad_norm": 2.444751262664795, + "kl": 1.5376519337296486, + "learning_rate": 8.06578882363245e-07, + "loss": 0.1538, + "num_tokens": 17199770.0, + "reward": 0.81451416015625, + "reward_std": 0.018092025071382523, + "rewards//mean": 0.81451416015625, + "rewards//std": 0.03457615152001381, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2974, + "grad_norm": 2.2877044677734375, + "kl": 1.0866966620087624, + "learning_rate": 8.063281404298699e-07, + "loss": 0.1087, + "num_tokens": 17211386.0, + "reward": 0.83258056640625, + "reward_std": 0.026887869462370872, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.03596920892596245, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2976, + "grad_norm": 1.9907156229019165, + "kl": 1.1559253856539726, + "learning_rate": 8.060772751075562e-07, + "loss": 0.1156, + "num_tokens": 17222970.0, + "reward": 0.8282470703125, + "reward_std": 0.019378134980797768, + "rewards//mean": 0.8282470703125, + "rewards//std": 0.028696255758404732, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2978, + "grad_norm": 2.2437267303466797, + "kl": 1.2296664007008076, + "learning_rate": 8.058262864973528e-07, + "loss": 0.123, + "num_tokens": 17234570.0, + "reward": 0.85076904296875, + "reward_std": 0.033204469829797745, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.039538148790597916, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.298, + "grad_norm": 2.2416279315948486, + "kl": 1.3320495709776878, + "learning_rate": 8.055751747003579e-07, + "loss": 0.1332, + "num_tokens": 17246090.0, + "reward": 0.85528564453125, + "reward_std": 0.03492129594087601, + "rewards//mean": 0.85528564453125, + "rewards//std": 0.03904656320810318, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2982, + "grad_norm": 2.9107506275177, + "kl": 1.1356214135885239, + "learning_rate": 8.053239398177191e-07, + "loss": 0.1136, + "num_tokens": 17257610.0, + "reward": 0.81805419921875, + "reward_std": 0.024214670062065125, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.031862523406744, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2984, + "grad_norm": 2.242908477783203, + "kl": 0.9806222133338451, + "learning_rate": 8.050725819506339e-07, + "loss": 0.0981, + "num_tokens": 17269186.0, + "reward": 0.7735595703125, + "reward_std": 0.010913461446762085, + "rewards//mean": 0.7735595703125, + "rewards//std": 0.01891985535621643, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2986, + "grad_norm": 2.2862889766693115, + "kl": 1.2837243899703026, + "learning_rate": 8.048211012003489e-07, + "loss": 0.1284, + "num_tokens": 17280810.0, + "reward": 0.79498291015625, + "reward_std": 0.02202451601624489, + "rewards//mean": 0.79498291015625, + "rewards//std": 0.03708402439951897, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2988, + "grad_norm": 2.5402188301086426, + "kl": 1.1043884679675102, + "learning_rate": 8.045694976681612e-07, + "loss": 0.1104, + "num_tokens": 17292442.0, + "reward": 0.8045654296875, + "reward_std": 0.01907430589199066, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.025685518980026245, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.299, + "grad_norm": 3.069411039352417, + "kl": 1.0443679690361023, + "learning_rate": 8.043177714554159e-07, + "loss": 0.1044, + "num_tokens": 17304026.0, + "reward": 0.875732421875, + "reward_std": 0.021527888253331184, + "rewards//mean": 0.875732421875, + "rewards//std": 0.03452626243233681, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2992, + "grad_norm": 1.9751781225204468, + "kl": 0.9911154136061668, + "learning_rate": 8.04065922663509e-07, + "loss": 0.0991, + "num_tokens": 17315658.0, + "reward": 0.837646484375, + "reward_std": 0.027467790991067886, + "rewards//mean": 0.837646484375, + "rewards//std": 0.031764086335897446, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2994, + "grad_norm": 2.043424129486084, + "kl": 0.8715234436094761, + "learning_rate": 8.038139513938845e-07, + "loss": 0.0872, + "num_tokens": 17327298.0, + "reward": 0.81866455078125, + "reward_std": 0.01729327067732811, + "rewards//mean": 0.81866455078125, + "rewards//std": 0.021336520090699196, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2996, + "grad_norm": 2.3990976810455322, + "kl": 0.9717786088585854, + "learning_rate": 8.035618577480369e-07, + "loss": 0.0972, + "num_tokens": 17338866.0, + "reward": 0.852294921875, + "reward_std": 0.02309809997677803, + "rewards//mean": 0.852294921875, + "rewards//std": 0.03117142803966999, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2998, + "grad_norm": 1.6281423568725586, + "kl": 1.3064044639468193, + "learning_rate": 8.033096418275092e-07, + "loss": 0.1306, + "num_tokens": 17350490.0, + "reward": 0.80731201171875, + "reward_std": 0.015849100425839424, + "rewards//mean": 0.80731201171875, + "rewards//std": 0.03044166974723339, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3, + "grad_norm": 2.295783042907715, + "kl": 1.1806846782565117, + "learning_rate": 8.030573037338941e-07, + "loss": 0.1181, + "num_tokens": 17362090.0, + "reward": 0.7943115234375, + "reward_std": 0.020428607240319252, + "rewards//mean": 0.7943115234375, + "rewards//std": 0.027963140979409218, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3002, + "grad_norm": 2.078956365585327, + "kl": 0.9603407829999924, + "learning_rate": 8.028048435688333e-07, + "loss": 0.096, + "num_tokens": 17373610.0, + "reward": 0.8179931640625, + "reward_std": 0.018657241016626358, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.023773211985826492, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3004, + "grad_norm": 2.5381569862365723, + "kl": 0.8997996971011162, + "learning_rate": 8.025522614340177e-07, + "loss": 0.09, + "num_tokens": 17385074.0, + "reward": 0.801025390625, + "reward_std": 0.020761575549840927, + "rewards//mean": 0.801025390625, + "rewards//std": 0.02825315110385418, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3006, + "grad_norm": 2.34901762008667, + "kl": 1.4570824950933456, + "learning_rate": 8.022995574311875e-07, + "loss": 0.1457, + "num_tokens": 17396682.0, + "reward": 0.803466796875, + "reward_std": 0.025224022567272186, + "rewards//mean": 0.803466796875, + "rewards//std": 0.032240886241197586, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3008, + "grad_norm": 2.021026372909546, + "kl": 1.1615985482931137, + "learning_rate": 8.020467316621316e-07, + "loss": 0.1162, + "num_tokens": 17408202.0, + "reward": 0.8505859375, + "reward_std": 0.015615087002515793, + "rewards//mean": 0.8505859375, + "rewards//std": 0.01904783770442009, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.301, + "grad_norm": 2.2438769340515137, + "kl": 1.0954576060175896, + "learning_rate": 8.017937842286882e-07, + "loss": 0.1095, + "num_tokens": 17419810.0, + "reward": 0.8519287109375, + "reward_std": 0.022645672783255577, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.03034336306154728, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3012, + "grad_norm": 2.348038911819458, + "kl": 0.8236126862466335, + "learning_rate": 8.015407152327447e-07, + "loss": 0.0824, + "num_tokens": 17431346.0, + "reward": 0.80767822265625, + "reward_std": 0.012283369898796082, + "rewards//mean": 0.80767822265625, + "rewards//std": 0.01674354262650013, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3014, + "grad_norm": 3.306532859802246, + "kl": 0.8909425586462021, + "learning_rate": 8.012875247762372e-07, + "loss": 0.0891, + "num_tokens": 17442874.0, + "reward": 0.885009765625, + "reward_std": 0.02753233164548874, + "rewards//mean": 0.885009765625, + "rewards//std": 0.03567854315042496, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3016, + "grad_norm": 2.077897787094116, + "kl": 1.193754319101572, + "learning_rate": 8.010342129611507e-07, + "loss": 0.1194, + "num_tokens": 17454434.0, + "reward": 0.7987060546875, + "reward_std": 0.026690995320677757, + "rewards//mean": 0.7987060546875, + "rewards//std": 0.03633168339729309, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3018, + "grad_norm": 1.9916654825210571, + "kl": 1.5946860313415527, + "learning_rate": 8.007807798895193e-07, + "loss": 0.1595, + "num_tokens": 17466114.0, + "reward": 0.81591796875, + "reward_std": 0.0356898158788681, + "rewards//mean": 0.81591796875, + "rewards//std": 0.038219451904296875, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.302, + "grad_norm": 1.8796058893203735, + "kl": 1.3158336654305458, + "learning_rate": 8.005272256634257e-07, + "loss": 0.1316, + "num_tokens": 17477690.0, + "reward": 0.82366943359375, + "reward_std": 0.019512230530381203, + "rewards//mean": 0.82366943359375, + "rewards//std": 0.027400091290473938, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3022, + "grad_norm": 2.1505367755889893, + "kl": 1.541728362441063, + "learning_rate": 8.002735503850015e-07, + "loss": 0.1542, + "num_tokens": 17489314.0, + "reward": 0.79833984375, + "reward_std": 0.02386932633817196, + "rewards//mean": 0.79833984375, + "rewards//std": 0.0318526029586792, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3024, + "grad_norm": 2.521803379058838, + "kl": 1.244603056460619, + "learning_rate": 8.000197541564271e-07, + "loss": 0.1245, + "num_tokens": 17500906.0, + "reward": 0.8526611328125, + "reward_std": 0.022898465394973755, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.03599345684051514, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3026, + "grad_norm": 1.6513363122940063, + "kl": 1.0338254198431969, + "learning_rate": 7.997658370799316e-07, + "loss": 0.1034, + "num_tokens": 17512442.0, + "reward": 0.8409423828125, + "reward_std": 0.029213797301054, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.03371414542198181, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3028, + "grad_norm": 4.510700702667236, + "kl": 1.3446697890758514, + "learning_rate": 7.995117992577928e-07, + "loss": 0.1345, + "num_tokens": 17523962.0, + "reward": 0.78204345703125, + "reward_std": 0.017866164445877075, + "rewards//mean": 0.78204345703125, + "rewards//std": 0.023686055094003677, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.303, + "grad_norm": 2.254911184310913, + "kl": 1.2232105135917664, + "learning_rate": 7.992576407923372e-07, + "loss": 0.1223, + "num_tokens": 17535546.0, + "reward": 0.83349609375, + "reward_std": 0.02284548059105873, + "rewards//mean": 0.83349609375, + "rewards//std": 0.03016548976302147, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3032, + "grad_norm": 2.2333760261535645, + "kl": 1.3296502381563187, + "learning_rate": 7.990033617859395e-07, + "loss": 0.133, + "num_tokens": 17547026.0, + "reward": 0.8519287109375, + "reward_std": 0.016805453225970268, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.022784797474741936, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3034, + "grad_norm": 2.187278985977173, + "kl": 1.2011192664504051, + "learning_rate": 7.987489623410235e-07, + "loss": 0.1201, + "num_tokens": 17558570.0, + "reward": 0.85272216796875, + "reward_std": 0.026256924495100975, + "rewards//mean": 0.85272216796875, + "rewards//std": 0.03425727039575577, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3036, + "grad_norm": 3.5671699047088623, + "kl": 1.6332244910299778, + "learning_rate": 7.984944425600613e-07, + "loss": 0.1633, + "num_tokens": 17570098.0, + "reward": 0.84710693359375, + "reward_std": 0.039329566061496735, + "rewards//mean": 0.84710693359375, + "rewards//std": 0.04740353673696518, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3038, + "grad_norm": 2.04453182220459, + "kl": 1.3737416341900826, + "learning_rate": 7.982398025455732e-07, + "loss": 0.1374, + "num_tokens": 17581810.0, + "reward": 0.83331298828125, + "reward_std": 0.015252603217959404, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.018189990893006325, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.304, + "grad_norm": 2.3139469623565674, + "kl": 1.2727109342813492, + "learning_rate": 7.979850424001282e-07, + "loss": 0.1273, + "num_tokens": 17593378.0, + "reward": 0.8641357421875, + "reward_std": 0.028906922787427902, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.03161576762795448, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3042, + "grad_norm": 4.189932346343994, + "kl": 1.8032265529036522, + "learning_rate": 7.97730162226344e-07, + "loss": 0.1803, + "num_tokens": 17605034.0, + "reward": 0.82293701171875, + "reward_std": 0.024084148928523064, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.02744590863585472, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3044, + "grad_norm": 2.1346333026885986, + "kl": 0.9193527065217495, + "learning_rate": 7.974751621268858e-07, + "loss": 0.0919, + "num_tokens": 17616650.0, + "reward": 0.88531494140625, + "reward_std": 0.033519357442855835, + "rewards//mean": 0.88531494140625, + "rewards//std": 0.04548709839582443, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3046, + "grad_norm": 1.8993123769760132, + "kl": 0.8260882906615734, + "learning_rate": 7.972200422044682e-07, + "loss": 0.0827, + "num_tokens": 17628208.0, + "reward": 0.85479736328125, + "reward_std": 0.022321544587612152, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.030817275866866112, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3048, + "grad_norm": 2.28356671333313, + "kl": 1.657431609928608, + "learning_rate": 7.969648025618529e-07, + "loss": 0.1657, + "num_tokens": 17639776.0, + "reward": 0.83819580078125, + "reward_std": 0.026148376986384392, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.03131043165922165, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.305, + "grad_norm": 2.499087333679199, + "kl": 1.0636197738349438, + "learning_rate": 7.967094433018508e-07, + "loss": 0.1064, + "num_tokens": 17651296.0, + "reward": 0.8717041015625, + "reward_std": 0.038300011307001114, + "rewards//mean": 0.8717041015625, + "rewards//std": 0.04410210996866226, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3052, + "grad_norm": 3.133949041366577, + "kl": 2.1240504384040833, + "learning_rate": 7.964539645273202e-07, + "loss": 0.2182, + "num_tokens": 17662781.0, + "reward": 0.77593994140625, + "reward_std": 0.02001497708261013, + "rewards//mean": 0.77593994140625, + "rewards//std": 0.0225459486246109, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3054, + "grad_norm": 1.9169918298721313, + "kl": 1.0555896013975143, + "learning_rate": 7.961983663411684e-07, + "loss": 0.1056, + "num_tokens": 17674357.0, + "reward": 0.8289794921875, + "reward_std": 0.021449033170938492, + "rewards//mean": 0.8289794921875, + "rewards//std": 0.028675148263573647, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3056, + "grad_norm": 1.6119900941848755, + "kl": 1.2868527621030807, + "learning_rate": 7.959426488463499e-07, + "loss": 0.1287, + "num_tokens": 17685933.0, + "reward": 0.8214111328125, + "reward_std": 0.02259696274995804, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.029268702492117882, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3058, + "grad_norm": 2.6116325855255127, + "kl": 1.233541026711464, + "learning_rate": 7.956868121458677e-07, + "loss": 0.1234, + "num_tokens": 17697437.0, + "reward": 0.8150634765625, + "reward_std": 0.03022104501724243, + "rewards//mean": 0.8150634765625, + "rewards//std": 0.036473069339990616, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.306, + "grad_norm": 2.243337869644165, + "kl": 0.7758295089006424, + "learning_rate": 7.954308563427732e-07, + "loss": 0.0776, + "num_tokens": 17708989.0, + "reward": 0.8243408203125, + "reward_std": 0.018057601526379585, + "rewards//mean": 0.8243408203125, + "rewards//std": 0.026775097474455833, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3062, + "grad_norm": 2.7153873443603516, + "kl": 1.3293144553899765, + "learning_rate": 7.951747815401649e-07, + "loss": 0.1329, + "num_tokens": 17720629.0, + "reward": 0.84002685546875, + "reward_std": 0.03077160194516182, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.04312589764595032, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3064, + "grad_norm": 2.470773935317993, + "kl": 1.2891269251704216, + "learning_rate": 7.949185878411899e-07, + "loss": 0.1289, + "num_tokens": 17732197.0, + "reward": 0.8394775390625, + "reward_std": 0.028235912322998047, + "rewards//mean": 0.8394775390625, + "rewards//std": 0.03502662107348442, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3066, + "grad_norm": 1.7730140686035156, + "kl": 1.0974257290363312, + "learning_rate": 7.946622753490432e-07, + "loss": 0.1097, + "num_tokens": 17743757.0, + "reward": 0.84259033203125, + "reward_std": 0.023315653204917908, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.03423605486750603, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3068, + "grad_norm": 2.605414628982544, + "kl": 0.9307443425059319, + "learning_rate": 7.94405844166967e-07, + "loss": 0.0931, + "num_tokens": 17755325.0, + "reward": 0.842529296875, + "reward_std": 0.02845938317477703, + "rewards//mean": 0.842529296875, + "rewards//std": 0.03916699439287186, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.307, + "grad_norm": 2.8239405155181885, + "kl": 1.4791762381792068, + "learning_rate": 7.941492943982521e-07, + "loss": 0.1479, + "num_tokens": 17766989.0, + "reward": 0.849609375, + "reward_std": 0.03364288806915283, + "rewards//mean": 0.849609375, + "rewards//std": 0.04015100747346878, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3072, + "grad_norm": 2.518048048019409, + "kl": 1.1594782173633575, + "learning_rate": 7.938926261462365e-07, + "loss": 0.1159, + "num_tokens": 17778549.0, + "reward": 0.8603515625, + "reward_std": 0.027606263756752014, + "rewards//mean": 0.8603515625, + "rewards//std": 0.03189440071582794, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3074, + "grad_norm": 2.071779489517212, + "kl": 1.0171090140938759, + "learning_rate": 7.936358395143063e-07, + "loss": 0.1017, + "num_tokens": 17790125.0, + "reward": 0.848876953125, + "reward_std": 0.021073047071695328, + "rewards//mean": 0.848876953125, + "rewards//std": 0.028158696368336678, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3076, + "grad_norm": 1.8629794120788574, + "kl": 0.6063016019761562, + "learning_rate": 7.93378934605895e-07, + "loss": 0.0606, + "num_tokens": 17801749.0, + "reward": 0.79742431640625, + "reward_std": 0.013388663530349731, + "rewards//mean": 0.79742431640625, + "rewards//std": 0.01938645727932453, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3078, + "grad_norm": 3.8796703815460205, + "kl": 1.3572199791669846, + "learning_rate": 7.93121911524484e-07, + "loss": 0.1357, + "num_tokens": 17813269.0, + "reward": 0.83660888671875, + "reward_std": 0.018517224118113518, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.028258977457880974, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.308, + "grad_norm": 2.8308331966400146, + "kl": 1.2816067039966583, + "learning_rate": 7.928647703736023e-07, + "loss": 0.1282, + "num_tokens": 17824941.0, + "reward": 0.81512451171875, + "reward_std": 0.0331568717956543, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.04603588953614235, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3082, + "grad_norm": 2.27358078956604, + "kl": 1.0992652252316475, + "learning_rate": 7.926075112568258e-07, + "loss": 0.1099, + "num_tokens": 17836469.0, + "reward": 0.82916259765625, + "reward_std": 0.023445764556527138, + "rewards//mean": 0.82916259765625, + "rewards//std": 0.03404850885272026, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3084, + "grad_norm": 1.9298545122146606, + "kl": 1.064426749944687, + "learning_rate": 7.923501342777787e-07, + "loss": 0.1064, + "num_tokens": 17847973.0, + "reward": 0.8651123046875, + "reward_std": 0.029952920973300934, + "rewards//mean": 0.8651123046875, + "rewards//std": 0.041304513812065125, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3086, + "grad_norm": 2.1209347248077393, + "kl": 0.9164910092949867, + "learning_rate": 7.920926395401326e-07, + "loss": 0.0916, + "num_tokens": 17859477.0, + "reward": 0.82940673828125, + "reward_std": 0.020781658589839935, + "rewards//mean": 0.82940673828125, + "rewards//std": 0.025268036872148514, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3088, + "grad_norm": 2.3547725677490234, + "kl": 0.8027265258133411, + "learning_rate": 7.918350271476063e-07, + "loss": 0.0803, + "num_tokens": 17871069.0, + "reward": 0.83807373046875, + "reward_std": 0.020201120525598526, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.03254784271121025, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.309, + "grad_norm": 2.4676263332366943, + "kl": 0.9239803925156593, + "learning_rate": 7.915772972039659e-07, + "loss": 0.0924, + "num_tokens": 17882597.0, + "reward": 0.8787841796875, + "reward_std": 0.03652965649962425, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.04672993719577789, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3092, + "grad_norm": 2.9704744815826416, + "kl": 1.0245046317577362, + "learning_rate": 7.913194498130251e-07, + "loss": 0.1025, + "num_tokens": 17894221.0, + "reward": 0.83245849609375, + "reward_std": 0.03341396152973175, + "rewards//mean": 0.83245849609375, + "rewards//std": 0.0448206290602684, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3094, + "grad_norm": 4.462617874145508, + "kl": 1.1118945218622684, + "learning_rate": 7.910614850786447e-07, + "loss": 0.1112, + "num_tokens": 17905773.0, + "reward": 0.8427734375, + "reward_std": 0.028039779514074326, + "rewards//mean": 0.8427734375, + "rewards//std": 0.03543417155742645, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3096, + "grad_norm": 2.7734360694885254, + "kl": 0.9194216430187225, + "learning_rate": 7.90803403104733e-07, + "loss": 0.0919, + "num_tokens": 17917365.0, + "reward": 0.87335205078125, + "reward_std": 0.026564689353108406, + "rewards//mean": 0.87335205078125, + "rewards//std": 0.031100871041417122, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3098, + "grad_norm": 4.01673698425293, + "kl": 1.155869022011757, + "learning_rate": 7.905452039952451e-07, + "loss": 0.1156, + "num_tokens": 17928917.0, + "reward": 0.8172607421875, + "reward_std": 0.02041761577129364, + "rewards//mean": 0.8172607421875, + "rewards//std": 0.03419208526611328, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.31, + "grad_norm": 2.9433884620666504, + "kl": 1.0333248414099216, + "learning_rate": 7.90286887854184e-07, + "loss": 0.1033, + "num_tokens": 17940469.0, + "reward": 0.8416748046875, + "reward_std": 0.022916413843631744, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.025142289698123932, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3102, + "grad_norm": 1.8269966840744019, + "kl": 0.9485232755541801, + "learning_rate": 7.900284547855991e-07, + "loss": 0.0949, + "num_tokens": 17952093.0, + "reward": 0.85223388671875, + "reward_std": 0.02268178015947342, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.027148669585585594, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3104, + "grad_norm": 1.8960235118865967, + "kl": 1.1288143508136272, + "learning_rate": 7.897699048935873e-07, + "loss": 0.1129, + "num_tokens": 17963685.0, + "reward": 0.85467529296875, + "reward_std": 0.02728421613574028, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.033531975001096725, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3106, + "grad_norm": 2.062451124191284, + "kl": 1.009611576795578, + "learning_rate": 7.895112382822924e-07, + "loss": 0.101, + "num_tokens": 17975205.0, + "reward": 0.877685546875, + "reward_std": 0.03003978729248047, + "rewards//mean": 0.877685546875, + "rewards//std": 0.03782050311565399, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3108, + "grad_norm": 1.742631196975708, + "kl": 0.9598760046064854, + "learning_rate": 7.892524550559055e-07, + "loss": 0.096, + "num_tokens": 17986781.0, + "reward": 0.80621337890625, + "reward_std": 0.019575193524360657, + "rewards//mean": 0.80621337890625, + "rewards//std": 0.02347485162317753, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.311, + "grad_norm": 2.1733319759368896, + "kl": 0.6548100411891937, + "learning_rate": 7.889935553186641e-07, + "loss": 0.0655, + "num_tokens": 17998397.0, + "reward": 0.87371826171875, + "reward_std": 0.02211184799671173, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.03190240636467934, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3112, + "grad_norm": 2.215411901473999, + "kl": 1.09148770570755, + "learning_rate": 7.887345391748532e-07, + "loss": 0.1091, + "num_tokens": 18010037.0, + "reward": 0.82257080078125, + "reward_std": 0.02450289949774742, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.0300150066614151, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3114, + "grad_norm": 2.0803723335266113, + "kl": 1.7548730075359344, + "learning_rate": 7.884754067288046e-07, + "loss": 0.1755, + "num_tokens": 18021477.0, + "reward": 0.82342529296875, + "reward_std": 0.041863225400447845, + "rewards//mean": 0.82342529296875, + "rewards//std": 0.04921993240714073, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3116, + "grad_norm": 2.0384457111358643, + "kl": 1.3219940066337585, + "learning_rate": 7.882161580848966e-07, + "loss": 0.1322, + "num_tokens": 18033053.0, + "reward": 0.85650634765625, + "reward_std": 0.03387215733528137, + "rewards//mean": 0.85650634765625, + "rewards//std": 0.04066791757941246, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3118, + "grad_norm": 2.639075994491577, + "kl": 1.5291403606534004, + "learning_rate": 7.879567933475546e-07, + "loss": 0.1529, + "num_tokens": 18044605.0, + "reward": 0.8427734375, + "reward_std": 0.020563028752803802, + "rewards//mean": 0.8427734375, + "rewards//std": 0.023344310000538826, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.312, + "grad_norm": 2.5067710876464844, + "kl": 1.1534148938953876, + "learning_rate": 7.876973126212506e-07, + "loss": 0.1153, + "num_tokens": 18056125.0, + "reward": 0.8365478515625, + "reward_std": 0.028582533821463585, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.03640660271048546, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3122, + "grad_norm": 2.0016913414001465, + "kl": 1.082566298544407, + "learning_rate": 7.874377160105036e-07, + "loss": 0.1083, + "num_tokens": 18067749.0, + "reward": 0.77764892578125, + "reward_std": 0.01672932878136635, + "rewards//mean": 0.77764892578125, + "rewards//std": 0.022394370287656784, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3124, + "grad_norm": 2.271162986755371, + "kl": 1.5656351000070572, + "learning_rate": 7.871780036198788e-07, + "loss": 0.1566, + "num_tokens": 18079341.0, + "reward": 0.865234375, + "reward_std": 0.03757353499531746, + "rewards//mean": 0.865234375, + "rewards//std": 0.040024127811193466, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3126, + "grad_norm": 2.176583766937256, + "kl": 1.2858109921216965, + "learning_rate": 7.869181755539887e-07, + "loss": 0.1286, + "num_tokens": 18090853.0, + "reward": 0.836669921875, + "reward_std": 0.027790352702140808, + "rewards//mean": 0.836669921875, + "rewards//std": 0.03232341632246971, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3128, + "grad_norm": 3.2378122806549072, + "kl": 1.0078425593674183, + "learning_rate": 7.866582319174917e-07, + "loss": 0.1008, + "num_tokens": 18102493.0, + "reward": 0.826416015625, + "reward_std": 0.01919153705239296, + "rewards//mean": 0.826416015625, + "rewards//std": 0.0237544197589159, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.313, + "grad_norm": 6.482329368591309, + "kl": 1.3162346929311752, + "learning_rate": 7.863981728150931e-07, + "loss": 0.1316, + "num_tokens": 18114125.0, + "reward": 0.8721923828125, + "reward_std": 0.02641114592552185, + "rewards//mean": 0.8721923828125, + "rewards//std": 0.033893272280693054, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3132, + "grad_norm": 2.922621250152588, + "kl": 1.1691013425588608, + "learning_rate": 7.861379983515448e-07, + "loss": 0.1169, + "num_tokens": 18125877.0, + "reward": 0.833251953125, + "reward_std": 0.03108024038374424, + "rewards//mean": 0.833251953125, + "rewards//std": 0.036557018756866455, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3134, + "grad_norm": 2.8224105834960938, + "kl": 1.795449584722519, + "learning_rate": 7.858777086316451e-07, + "loss": 0.1795, + "num_tokens": 18137597.0, + "reward": 0.862060546875, + "reward_std": 0.0430721640586853, + "rewards//mean": 0.862060546875, + "rewards//std": 0.051001958549022675, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3136, + "grad_norm": 2.8912508487701416, + "kl": 2.052280768752098, + "learning_rate": 7.856173037602382e-07, + "loss": 0.2052, + "num_tokens": 18149197.0, + "reward": 0.85614013671875, + "reward_std": 0.027057155966758728, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.032892968505620956, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3138, + "grad_norm": 3.1474506855010986, + "kl": 1.2234520129859447, + "learning_rate": 7.853567838422159e-07, + "loss": 0.1223, + "num_tokens": 18160789.0, + "reward": 0.839599609375, + "reward_std": 0.019678421318531036, + "rewards//mean": 0.839599609375, + "rewards//std": 0.0229453444480896, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.314, + "grad_norm": 2.229790449142456, + "kl": 1.0767807513475418, + "learning_rate": 7.850961489825149e-07, + "loss": 0.1077, + "num_tokens": 18172421.0, + "reward": 0.8577880859375, + "reward_std": 0.03019712120294571, + "rewards//mean": 0.8577880859375, + "rewards//std": 0.04224807769060135, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3142, + "grad_norm": 3.1246492862701416, + "kl": 1.2321001440286636, + "learning_rate": 7.848353992861194e-07, + "loss": 0.1232, + "num_tokens": 18184053.0, + "reward": 0.814697265625, + "reward_std": 0.015858564525842667, + "rewards//mean": 0.814697265625, + "rewards//std": 0.020242206752300262, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3144, + "grad_norm": 2.5706136226654053, + "kl": 1.633335493505001, + "learning_rate": 7.84574534858059e-07, + "loss": 0.1633, + "num_tokens": 18195533.0, + "reward": 0.86962890625, + "reward_std": 0.036411695182323456, + "rewards//mean": 0.86962890625, + "rewards//std": 0.0504462793469429, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3146, + "grad_norm": 4.191011905670166, + "kl": 1.5684664249420166, + "learning_rate": 7.8431355580341e-07, + "loss": 0.1568, + "num_tokens": 18207173.0, + "reward": 0.81036376953125, + "reward_std": 0.024309178814291954, + "rewards//mean": 0.81036376953125, + "rewards//std": 0.03138094022870064, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3148, + "grad_norm": 2.398757219314575, + "kl": 1.4664518162608147, + "learning_rate": 7.840524622272948e-07, + "loss": 0.1466, + "num_tokens": 18218765.0, + "reward": 0.8272705078125, + "reward_std": 0.026490718126296997, + "rewards//mean": 0.8272705078125, + "rewards//std": 0.029507705941796303, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.315, + "grad_norm": 1.964145541191101, + "kl": 1.5862287282943726, + "learning_rate": 7.837912542348817e-07, + "loss": 0.1586, + "num_tokens": 18230333.0, + "reward": 0.8458251953125, + "reward_std": 0.03904380649328232, + "rewards//mean": 0.8458251953125, + "rewards//std": 0.04142162576317787, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3152, + "grad_norm": 2.315229892730713, + "kl": 1.4940860643982887, + "learning_rate": 7.835299319313853e-07, + "loss": 0.1494, + "num_tokens": 18241813.0, + "reward": 0.84197998046875, + "reward_std": 0.02717779576778412, + "rewards//mean": 0.84197998046875, + "rewards//std": 0.032188668847084045, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3154, + "grad_norm": 3.976433515548706, + "kl": 1.3558690771460533, + "learning_rate": 7.832684954220663e-07, + "loss": 0.1356, + "num_tokens": 18253557.0, + "reward": 0.83026123046875, + "reward_std": 0.026134490966796875, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.029642511159181595, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3156, + "grad_norm": 2.1839652061462402, + "kl": 1.3551195859909058, + "learning_rate": 7.830069448122312e-07, + "loss": 0.1355, + "num_tokens": 18265093.0, + "reward": 0.850341796875, + "reward_std": 0.049579910933971405, + "rewards//mean": 0.850341796875, + "rewards//std": 0.05290861800312996, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3158, + "grad_norm": 2.1458988189697266, + "kl": 1.3633864894509315, + "learning_rate": 7.827452802072327e-07, + "loss": 0.1363, + "num_tokens": 18276757.0, + "reward": 0.828125, + "reward_std": 0.026896463707089424, + "rewards//mean": 0.828125, + "rewards//std": 0.03301383927464485, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.316, + "grad_norm": 2.062990188598633, + "kl": 0.9884178787469864, + "learning_rate": 7.82483501712469e-07, + "loss": 0.0988, + "num_tokens": 18288309.0, + "reward": 0.84271240234375, + "reward_std": 0.016919434070587158, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.0355994887650013, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3162, + "grad_norm": 2.112541913986206, + "kl": 1.04594661667943, + "learning_rate": 7.822216094333847e-07, + "loss": 0.1046, + "num_tokens": 18299837.0, + "reward": 0.84857177734375, + "reward_std": 0.021786194294691086, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.03120146133005619, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3164, + "grad_norm": 2.8066630363464355, + "kl": 1.4012076407670975, + "learning_rate": 7.819596034754696e-07, + "loss": 0.1401, + "num_tokens": 18311365.0, + "reward": 0.85400390625, + "reward_std": 0.02332780696451664, + "rewards//mean": 0.85400390625, + "rewards//std": 0.025415653362870216, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3166, + "grad_norm": 3.926029682159424, + "kl": 1.5532252863049507, + "learning_rate": 7.816974839442603e-07, + "loss": 0.1553, + "num_tokens": 18322965.0, + "reward": 0.83343505859375, + "reward_std": 0.02932843565940857, + "rewards//mean": 0.83343505859375, + "rewards//std": 0.03595910966396332, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3168, + "grad_norm": 2.1204915046691895, + "kl": 1.060265775769949, + "learning_rate": 7.814352509453379e-07, + "loss": 0.106, + "num_tokens": 18334485.0, + "reward": 0.81103515625, + "reward_std": 0.02380114234983921, + "rewards//mean": 0.81103515625, + "rewards//std": 0.027800535783171654, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.317, + "grad_norm": 4.060159683227539, + "kl": 0.9439886696636677, + "learning_rate": 7.811729045843301e-07, + "loss": 0.0944, + "num_tokens": 18346021.0, + "reward": 0.79510498046875, + "reward_std": 0.012913115322589874, + "rewards//mean": 0.79510498046875, + "rewards//std": 0.017132308334112167, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3172, + "grad_norm": 2.285837173461914, + "kl": 1.4378974884748459, + "learning_rate": 7.8091044496691e-07, + "loss": 0.1438, + "num_tokens": 18357589.0, + "reward": 0.8544921875, + "reward_std": 0.026672784239053726, + "rewards//mean": 0.8544921875, + "rewards//std": 0.03408233821392059, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3174, + "grad_norm": 2.371490478515625, + "kl": 0.931724701076746, + "learning_rate": 7.806478721987963e-07, + "loss": 0.0932, + "num_tokens": 18369269.0, + "reward": 0.856689453125, + "reward_std": 0.019669223576784134, + "rewards//mean": 0.856689453125, + "rewards//std": 0.027550021186470985, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3176, + "grad_norm": 2.208517551422119, + "kl": 1.120299607515335, + "learning_rate": 7.803851863857532e-07, + "loss": 0.112, + "num_tokens": 18380941.0, + "reward": 0.82904052734375, + "reward_std": 0.027486389502882957, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.031686730682849884, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3178, + "grad_norm": 2.1403920650482178, + "kl": 1.2717829756438732, + "learning_rate": 7.801223876335907e-07, + "loss": 0.1272, + "num_tokens": 18392573.0, + "reward": 0.8631591796875, + "reward_std": 0.0301213301718235, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.03285181149840355, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.318, + "grad_norm": 1.7350138425827026, + "kl": 1.198329083621502, + "learning_rate": 7.798594760481637e-07, + "loss": 0.1198, + "num_tokens": 18404093.0, + "reward": 0.816650390625, + "reward_std": 0.018037620931863785, + "rewards//mean": 0.816650390625, + "rewards//std": 0.02472366951406002, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3182, + "grad_norm": 1.739819884300232, + "kl": 0.9458631128072739, + "learning_rate": 7.795964517353733e-07, + "loss": 0.0946, + "num_tokens": 18415741.0, + "reward": 0.841552734375, + "reward_std": 0.019227702170610428, + "rewards//mean": 0.841552734375, + "rewards//std": 0.028475167229771614, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3184, + "grad_norm": 2.1962599754333496, + "kl": 0.9123665429651737, + "learning_rate": 7.793333148011657e-07, + "loss": 0.0912, + "num_tokens": 18427269.0, + "reward": 0.8162841796875, + "reward_std": 0.015067648142576218, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.01867504231631756, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3186, + "grad_norm": 2.316964626312256, + "kl": 0.8730153366923332, + "learning_rate": 7.790700653515323e-07, + "loss": 0.0873, + "num_tokens": 18438749.0, + "reward": 0.84149169921875, + "reward_std": 0.026870323345065117, + "rewards//mean": 0.84149169921875, + "rewards//std": 0.03692571446299553, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3188, + "grad_norm": 2.1990232467651367, + "kl": 0.9955050721764565, + "learning_rate": 7.788067034925099e-07, + "loss": 0.0996, + "num_tokens": 18450261.0, + "reward": 0.88104248046875, + "reward_std": 0.030915383249521255, + "rewards//mean": 0.88104248046875, + "rewards//std": 0.03703787177801132, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.319, + "grad_norm": 2.961508274078369, + "kl": 1.212947141379118, + "learning_rate": 7.785432293301806e-07, + "loss": 0.1213, + "num_tokens": 18461805.0, + "reward": 0.8548583984375, + "reward_std": 0.03331078216433525, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.042933329939842224, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3192, + "grad_norm": 2.806334972381592, + "kl": 0.8651366904377937, + "learning_rate": 7.78279642970672e-07, + "loss": 0.0865, + "num_tokens": 18473309.0, + "reward": 0.8472900390625, + "reward_std": 0.024253815412521362, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.0274517685174942, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3194, + "grad_norm": 2.1144497394561768, + "kl": 1.069103479385376, + "learning_rate": 7.780159445201562e-07, + "loss": 0.1069, + "num_tokens": 18484917.0, + "reward": 0.794677734375, + "reward_std": 0.012513509020209312, + "rewards//mean": 0.794677734375, + "rewards//std": 0.014990124851465225, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3196, + "grad_norm": 1.7564005851745605, + "kl": 1.261869229376316, + "learning_rate": 7.777521340848514e-07, + "loss": 0.1262, + "num_tokens": 18496501.0, + "reward": 0.86895751953125, + "reward_std": 0.025888491421937943, + "rewards//mean": 0.86895751953125, + "rewards//std": 0.03965284302830696, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3198, + "grad_norm": 2.2535808086395264, + "kl": 0.7083954066038132, + "learning_rate": 7.774882117710202e-07, + "loss": 0.0708, + "num_tokens": 18508021.0, + "reward": 0.85516357421875, + "reward_std": 0.022233670577406883, + "rewards//mean": 0.85516357421875, + "rewards//std": 0.02931695617735386, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.32, + "grad_norm": 2.6835365295410156, + "kl": 1.1634095646440983, + "learning_rate": 7.772241776849704e-07, + "loss": 0.1163, + "num_tokens": 18519509.0, + "reward": 0.83013916015625, + "reward_std": 0.021596727892756462, + "rewards//mean": 0.83013916015625, + "rewards//std": 0.03285198286175728, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3202, + "grad_norm": 2.1491944789886475, + "kl": 1.0464067235589027, + "learning_rate": 7.769600319330552e-07, + "loss": 0.1046, + "num_tokens": 18531109.0, + "reward": 0.783447265625, + "reward_std": 0.025234568864107132, + "rewards//mean": 0.783447265625, + "rewards//std": 0.033188533037900925, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3204, + "grad_norm": 2.477029323577881, + "kl": 0.7876321226358414, + "learning_rate": 7.76695774621672e-07, + "loss": 0.0788, + "num_tokens": 18542701.0, + "reward": 0.88604736328125, + "reward_std": 0.027389496564865112, + "rewards//mean": 0.88604736328125, + "rewards//std": 0.03197444975376129, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3206, + "grad_norm": 1.9742473363876343, + "kl": 1.2614410817623138, + "learning_rate": 7.764314058572639e-07, + "loss": 0.1261, + "num_tokens": 18554277.0, + "reward": 0.8392333984375, + "reward_std": 0.014846490696072578, + "rewards//mean": 0.8392333984375, + "rewards//std": 0.023930605500936508, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3208, + "grad_norm": 3.1286399364471436, + "kl": 0.9801789037883282, + "learning_rate": 7.761669257463187e-07, + "loss": 0.098, + "num_tokens": 18566013.0, + "reward": 0.821533203125, + "reward_std": 0.017585797235369682, + "rewards//mean": 0.821533203125, + "rewards//std": 0.02282893657684326, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.321, + "grad_norm": 2.93505859375, + "kl": 0.8716515563428402, + "learning_rate": 7.759023343953688e-07, + "loss": 0.0872, + "num_tokens": 18577477.0, + "reward": 0.8472900390625, + "reward_std": 0.021249346435070038, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.026691291481256485, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3212, + "grad_norm": 1.8548977375030518, + "kl": 0.9122760146856308, + "learning_rate": 7.756376319109916e-07, + "loss": 0.0912, + "num_tokens": 18589077.0, + "reward": 0.8338623046875, + "reward_std": 0.021147847175598145, + "rewards//mean": 0.8338623046875, + "rewards//std": 0.030337374657392502, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3214, + "grad_norm": 2.289137125015259, + "kl": 0.8514838293194771, + "learning_rate": 7.753728183998092e-07, + "loss": 0.0851, + "num_tokens": 18600653.0, + "reward": 0.82659912109375, + "reward_std": 0.014863740652799606, + "rewards//mean": 0.82659912109375, + "rewards//std": 0.019320756196975708, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3216, + "grad_norm": 3.5294861793518066, + "kl": 1.0204000174999237, + "learning_rate": 7.751078939684885e-07, + "loss": 0.102, + "num_tokens": 18612165.0, + "reward": 0.83929443359375, + "reward_std": 0.017459962517023087, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.023896660655736923, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3218, + "grad_norm": 2.1728615760803223, + "kl": 1.3854641690850258, + "learning_rate": 7.748428587237411e-07, + "loss": 0.1385, + "num_tokens": 18623701.0, + "reward": 0.85845947265625, + "reward_std": 0.037492528557777405, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.041902828961610794, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.322, + "grad_norm": 2.274254083633423, + "kl": 1.3838536515831947, + "learning_rate": 7.74577712772323e-07, + "loss": 0.1384, + "num_tokens": 18635253.0, + "reward": 0.81597900390625, + "reward_std": 0.018412385135889053, + "rewards//mean": 0.81597900390625, + "rewards//std": 0.02495032176375389, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3222, + "grad_norm": 2.0292141437530518, + "kl": 1.3959488235414028, + "learning_rate": 7.743124562210351e-07, + "loss": 0.1396, + "num_tokens": 18646805.0, + "reward": 0.83160400390625, + "reward_std": 0.03628232330083847, + "rewards//mean": 0.83160400390625, + "rewards//std": 0.04072073847055435, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3224, + "grad_norm": 1.9796253442764282, + "kl": 1.2609350346028805, + "learning_rate": 7.740470891767224e-07, + "loss": 0.1261, + "num_tokens": 18658333.0, + "reward": 0.8795166015625, + "reward_std": 0.02216266095638275, + "rewards//mean": 0.8795166015625, + "rewards//std": 0.037052836269140244, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3226, + "grad_norm": 1.7780858278274536, + "kl": 1.0831639729440212, + "learning_rate": 7.737816117462751e-07, + "loss": 0.1083, + "num_tokens": 18669805.0, + "reward": 0.8260498046875, + "reward_std": 0.022485747933387756, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.029805796220898628, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3228, + "grad_norm": 2.350802183151245, + "kl": 1.015198603272438, + "learning_rate": 7.735160240366274e-07, + "loss": 0.1015, + "num_tokens": 18681485.0, + "reward": 0.83026123046875, + "reward_std": 0.02239460125565529, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.027462448924779892, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.323, + "grad_norm": 2.7505297660827637, + "kl": 1.1701455116271973, + "learning_rate": 7.732503261547578e-07, + "loss": 0.117, + "num_tokens": 18693093.0, + "reward": 0.8048095703125, + "reward_std": 0.015481002628803253, + "rewards//mean": 0.8048095703125, + "rewards//std": 0.018804289400577545, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.3232, + "grad_norm": 2.565464735031128, + "kl": 0.958940900862217, + "learning_rate": 7.729845182076895e-07, + "loss": 0.0925, + "num_tokens": 18704648.0, + "reward": 0.84490966796875, + "reward_std": 0.026057250797748566, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.0435742624104023, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3234, + "grad_norm": 2.291609287261963, + "kl": 1.058329675346613, + "learning_rate": 7.7271860030249e-07, + "loss": 0.1058, + "num_tokens": 18716216.0, + "reward": 0.86163330078125, + "reward_std": 0.02987067401409149, + "rewards//mean": 0.86163330078125, + "rewards//std": 0.033852770924568176, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3236, + "grad_norm": 2.2712700366973877, + "kl": 1.2312976345419884, + "learning_rate": 7.72452572546271e-07, + "loss": 0.1231, + "num_tokens": 18727864.0, + "reward": 0.849853515625, + "reward_std": 0.027427976951003075, + "rewards//mean": 0.849853515625, + "rewards//std": 0.03255487233400345, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3238, + "grad_norm": 2.168194055557251, + "kl": 1.5119665376842022, + "learning_rate": 7.721864350461882e-07, + "loss": 0.1512, + "num_tokens": 18739456.0, + "reward": 0.8245849609375, + "reward_std": 0.02572796121239662, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.028071202337741852, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.324, + "grad_norm": 1.9711004495620728, + "kl": 1.2856448888778687, + "learning_rate": 7.71920187909442e-07, + "loss": 0.1286, + "num_tokens": 18750992.0, + "reward": 0.84490966796875, + "reward_std": 0.03752205893397331, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.047474056482315063, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3242, + "grad_norm": 2.5050532817840576, + "kl": 1.5600391626358032, + "learning_rate": 7.716538312432765e-07, + "loss": 0.156, + "num_tokens": 18762432.0, + "reward": 0.84765625, + "reward_std": 0.03403717279434204, + "rewards//mean": 0.84765625, + "rewards//std": 0.038121096789836884, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3244, + "grad_norm": 3.0251619815826416, + "kl": 1.368517816066742, + "learning_rate": 7.713873651549804e-07, + "loss": 0.1369, + "num_tokens": 18773960.0, + "reward": 0.78363037109375, + "reward_std": 0.02300802245736122, + "rewards//mean": 0.78363037109375, + "rewards//std": 0.027714300900697708, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3246, + "grad_norm": 2.1418917179107666, + "kl": 0.7207666784524918, + "learning_rate": 7.71120789751886e-07, + "loss": 0.0721, + "num_tokens": 18785448.0, + "reward": 0.83709716796875, + "reward_std": 0.016438009217381477, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.02053685672581196, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3248, + "grad_norm": 2.1514065265655518, + "kl": 0.9097731448709965, + "learning_rate": 7.7085410514137e-07, + "loss": 0.091, + "num_tokens": 18797120.0, + "reward": 0.8450927734375, + "reward_std": 0.0176820270717144, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.036797020584344864, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.325, + "grad_norm": 2.2302346229553223, + "kl": 1.2498055174946785, + "learning_rate": 7.705873114308527e-07, + "loss": 0.125, + "num_tokens": 18808736.0, + "reward": 0.78668212890625, + "reward_std": 0.02172163501381874, + "rewards//mean": 0.78668212890625, + "rewards//std": 0.03319394960999489, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3252, + "grad_norm": 1.789707899093628, + "kl": 1.172688089311123, + "learning_rate": 7.703204087277988e-07, + "loss": 0.1173, + "num_tokens": 18820352.0, + "reward": 0.828857421875, + "reward_std": 0.021092547103762627, + "rewards//mean": 0.828857421875, + "rewards//std": 0.025909962132573128, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3254, + "grad_norm": 1.9112013578414917, + "kl": 0.759766761213541, + "learning_rate": 7.700533971397165e-07, + "loss": 0.076, + "num_tokens": 18831928.0, + "reward": 0.8392333984375, + "reward_std": 0.019591284915804863, + "rewards//mean": 0.8392333984375, + "rewards//std": 0.02433208003640175, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3256, + "grad_norm": 1.8563059568405151, + "kl": 1.2186555564403534, + "learning_rate": 7.697862767741583e-07, + "loss": 0.1219, + "num_tokens": 18843480.0, + "reward": 0.87701416015625, + "reward_std": 0.030143294483423233, + "rewards//mean": 0.87701416015625, + "rewards//std": 0.039291199296712875, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3258, + "grad_norm": 1.9807584285736084, + "kl": 1.2587885484099388, + "learning_rate": 7.695190477387199e-07, + "loss": 0.1259, + "num_tokens": 18855032.0, + "reward": 0.87542724609375, + "reward_std": 0.03404088318347931, + "rewards//mean": 0.87542724609375, + "rewards//std": 0.04234198108315468, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.326, + "grad_norm": 3.205050230026245, + "kl": 1.5790330544114113, + "learning_rate": 7.692517101410414e-07, + "loss": 0.1579, + "num_tokens": 18866696.0, + "reward": 0.844482421875, + "reward_std": 0.019551554694771767, + "rewards//mean": 0.844482421875, + "rewards//std": 0.03286586329340935, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3262, + "grad_norm": 2.442159414291382, + "kl": 1.0738133564591408, + "learning_rate": 7.689842640888063e-07, + "loss": 0.1074, + "num_tokens": 18878208.0, + "reward": 0.834716796875, + "reward_std": 0.02141866460442543, + "rewards//mean": 0.834716796875, + "rewards//std": 0.02984563820064068, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3264, + "grad_norm": 2.665010690689087, + "kl": 1.135521225631237, + "learning_rate": 7.687167096897418e-07, + "loss": 0.1136, + "num_tokens": 18889880.0, + "reward": 0.77740478515625, + "reward_std": 0.016163675114512444, + "rewards//mean": 0.77740478515625, + "rewards//std": 0.021344322711229324, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3266, + "grad_norm": 3.6952526569366455, + "kl": 1.2899731323122978, + "learning_rate": 7.684490470516185e-07, + "loss": 0.129, + "num_tokens": 18901384.0, + "reward": 0.82012939453125, + "reward_std": 0.021170329302549362, + "rewards//mean": 0.82012939453125, + "rewards//std": 0.026203272864222527, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3268, + "grad_norm": 2.112513542175293, + "kl": 1.2232006825506687, + "learning_rate": 7.681812762822515e-07, + "loss": 0.1223, + "num_tokens": 18912976.0, + "reward": 0.85345458984375, + "reward_std": 0.026332776993513107, + "rewards//mean": 0.85345458984375, + "rewards//std": 0.028763895854353905, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.327, + "grad_norm": 1.9707705974578857, + "kl": 1.282958585768938, + "learning_rate": 7.679133974894982e-07, + "loss": 0.1283, + "num_tokens": 18924504.0, + "reward": 0.8863525390625, + "reward_std": 0.035345517098903656, + "rewards//mean": 0.8863525390625, + "rewards//std": 0.04791099950671196, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3272, + "grad_norm": 2.8503708839416504, + "kl": 1.86912602186203, + "learning_rate": 7.676454107812607e-07, + "loss": 0.1869, + "num_tokens": 18936048.0, + "reward": 0.841552734375, + "reward_std": 0.020842082798480988, + "rewards//mean": 0.841552734375, + "rewards//std": 0.027532432228326797, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3274, + "grad_norm": 3.245810031890869, + "kl": 1.0237435474991798, + "learning_rate": 7.673773162654836e-07, + "loss": 0.1024, + "num_tokens": 18947688.0, + "reward": 0.82354736328125, + "reward_std": 0.029026929289102554, + "rewards//mean": 0.82354736328125, + "rewards//std": 0.03377935662865639, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3276, + "grad_norm": 4.347595691680908, + "kl": 1.2622648179531097, + "learning_rate": 7.671091140501555e-07, + "loss": 0.1262, + "num_tokens": 18959232.0, + "reward": 0.80126953125, + "reward_std": 0.013335267081856728, + "rewards//mean": 0.80126953125, + "rewards//std": 0.019243918359279633, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3278, + "grad_norm": 1.8997724056243896, + "kl": 1.2582791596651077, + "learning_rate": 7.668408042433081e-07, + "loss": 0.1258, + "num_tokens": 18970760.0, + "reward": 0.8516845703125, + "reward_std": 0.03193528205156326, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.0362582802772522, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.328, + "grad_norm": 2.988468885421753, + "kl": 1.6594780310988426, + "learning_rate": 7.665723869530169e-07, + "loss": 0.1659, + "num_tokens": 18982248.0, + "reward": 0.8399658203125, + "reward_std": 0.026882745325565338, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.03795534744858742, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3282, + "grad_norm": 2.7239415645599365, + "kl": 1.0321888625621796, + "learning_rate": 7.663038622873999e-07, + "loss": 0.1032, + "num_tokens": 18993792.0, + "reward": 0.7960205078125, + "reward_std": 0.012799867428839207, + "rewards//mean": 0.7960205078125, + "rewards//std": 0.01868152618408203, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3284, + "grad_norm": 2.262624502182007, + "kl": 1.283872801810503, + "learning_rate": 7.660352303546192e-07, + "loss": 0.1284, + "num_tokens": 19005376.0, + "reward": 0.83380126953125, + "reward_std": 0.02039901167154312, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.03403916954994202, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3286, + "grad_norm": 2.4678070545196533, + "kl": 1.6167702600359917, + "learning_rate": 7.657664912628794e-07, + "loss": 0.1617, + "num_tokens": 19017080.0, + "reward": 0.8294677734375, + "reward_std": 0.021920155733823776, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.02802370674908161, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3288, + "grad_norm": 2.7546451091766357, + "kl": 0.8123422563076019, + "learning_rate": 7.654976451204287e-07, + "loss": 0.0812, + "num_tokens": 19028584.0, + "reward": 0.8662109375, + "reward_std": 0.023691482841968536, + "rewards//mean": 0.8662109375, + "rewards//std": 0.029643084853887558, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.329, + "grad_norm": 2.0686004161834717, + "kl": 1.3161420822143555, + "learning_rate": 7.652286920355583e-07, + "loss": 0.1316, + "num_tokens": 19040240.0, + "reward": 0.8504638671875, + "reward_std": 0.0324176624417305, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.03234284371137619, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3292, + "grad_norm": 2.0808420181274414, + "kl": 0.8158667981624603, + "learning_rate": 7.649596321166024e-07, + "loss": 0.0816, + "num_tokens": 19051936.0, + "reward": 0.841796875, + "reward_std": 0.015839245170354843, + "rewards//mean": 0.841796875, + "rewards//std": 0.0178532674908638, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3294, + "grad_norm": 2.69523024559021, + "kl": 0.8371287621557713, + "learning_rate": 7.646904654719385e-07, + "loss": 0.0837, + "num_tokens": 19063536.0, + "reward": 0.8299560546875, + "reward_std": 0.016209296882152557, + "rewards//mean": 0.8299560546875, + "rewards//std": 0.023836802691221237, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3296, + "grad_norm": 1.930070400238037, + "kl": 1.0753106400370598, + "learning_rate": 7.644211922099867e-07, + "loss": 0.1075, + "num_tokens": 19075088.0, + "reward": 0.85638427734375, + "reward_std": 0.021039264276623726, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.025226665660738945, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3298, + "grad_norm": 2.2488019466400146, + "kl": 1.018364779651165, + "learning_rate": 7.641518124392103e-07, + "loss": 0.1018, + "num_tokens": 19086656.0, + "reward": 0.86322021484375, + "reward_std": 0.032089486718177795, + "rewards//mean": 0.86322021484375, + "rewards//std": 0.03728269413113594, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.33, + "grad_norm": 2.0104405879974365, + "kl": 1.6298075467348099, + "learning_rate": 7.638823262681154e-07, + "loss": 0.163, + "num_tokens": 19098176.0, + "reward": 0.8416748046875, + "reward_std": 0.038275331258773804, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.04786168783903122, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3302, + "grad_norm": 2.8175854682922363, + "kl": 0.9388470314443111, + "learning_rate": 7.636127338052511e-07, + "loss": 0.0939, + "num_tokens": 19109720.0, + "reward": 0.8468017578125, + "reward_std": 0.01973528228700161, + "rewards//mean": 0.8468017578125, + "rewards//std": 0.028232516720891, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3304, + "grad_norm": 2.184499979019165, + "kl": 1.3746264204382896, + "learning_rate": 7.633430351592093e-07, + "loss": 0.1375, + "num_tokens": 19121264.0, + "reward": 0.85992431640625, + "reward_std": 0.022781409323215485, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.03944423794746399, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3306, + "grad_norm": 2.386234998703003, + "kl": 1.0343327447772026, + "learning_rate": 7.630732304386243e-07, + "loss": 0.1034, + "num_tokens": 19132816.0, + "reward": 0.87103271484375, + "reward_std": 0.03979931026697159, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.045890647917985916, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3308, + "grad_norm": 2.3104021549224854, + "kl": 1.3547619581222534, + "learning_rate": 7.628033197521735e-07, + "loss": 0.1355, + "num_tokens": 19144400.0, + "reward": 0.7774658203125, + "reward_std": 0.019799508154392242, + "rewards//mean": 0.7774658203125, + "rewards//std": 0.02332322485744953, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.331, + "grad_norm": 2.3833274841308594, + "kl": 1.3557376712560654, + "learning_rate": 7.625333032085769e-07, + "loss": 0.1356, + "num_tokens": 19155968.0, + "reward": 0.85235595703125, + "reward_std": 0.021274326369166374, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.02412174642086029, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3312, + "grad_norm": 2.595283031463623, + "kl": 0.9535662606358528, + "learning_rate": 7.622631809165972e-07, + "loss": 0.0954, + "num_tokens": 19167544.0, + "reward": 0.835693359375, + "reward_std": 0.02625756710767746, + "rewards//mean": 0.835693359375, + "rewards//std": 0.032880596816539764, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3314, + "grad_norm": 2.7985384464263916, + "kl": 1.3163466602563858, + "learning_rate": 7.619929529850396e-07, + "loss": 0.1316, + "num_tokens": 19179072.0, + "reward": 0.81060791015625, + "reward_std": 0.01220204308629036, + "rewards//mean": 0.81060791015625, + "rewards//std": 0.01888171210885048, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3316, + "grad_norm": 2.3977513313293457, + "kl": 1.0197549760341644, + "learning_rate": 7.617226195227517e-07, + "loss": 0.102, + "num_tokens": 19190640.0, + "reward": 0.77740478515625, + "reward_std": 0.013771357014775276, + "rewards//mean": 0.77740478515625, + "rewards//std": 0.01672545075416565, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3318, + "grad_norm": 2.4292500019073486, + "kl": 1.3872503116726875, + "learning_rate": 7.614521806386243e-07, + "loss": 0.1387, + "num_tokens": 19202168.0, + "reward": 0.84161376953125, + "reward_std": 0.02494017779827118, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.02940460294485092, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.332, + "grad_norm": 2.1796011924743652, + "kl": 0.9836952611804008, + "learning_rate": 7.611816364415895e-07, + "loss": 0.0984, + "num_tokens": 19213768.0, + "reward": 0.85565185546875, + "reward_std": 0.028908098116517067, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.03461422026157379, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3322, + "grad_norm": 2.0340383052825928, + "kl": 1.2404558137059212, + "learning_rate": 7.60910987040623e-07, + "loss": 0.124, + "num_tokens": 19225296.0, + "reward": 0.81097412109375, + "reward_std": 0.023252567276358604, + "rewards//mean": 0.81097412109375, + "rewards//std": 0.029762277379631996, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3324, + "grad_norm": 2.018101215362549, + "kl": 1.758645236492157, + "learning_rate": 7.606402325447419e-07, + "loss": 0.1759, + "num_tokens": 19236968.0, + "reward": 0.81182861328125, + "reward_std": 0.041330866515636444, + "rewards//mean": 0.81182861328125, + "rewards//std": 0.05012866109609604, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3326, + "grad_norm": 2.2587642669677734, + "kl": 0.9868241921067238, + "learning_rate": 7.603693730630066e-07, + "loss": 0.0987, + "num_tokens": 19248544.0, + "reward": 0.8326416015625, + "reward_std": 0.025549430400133133, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.03291993588209152, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3328, + "grad_norm": 2.5849671363830566, + "kl": 0.8653412833809853, + "learning_rate": 7.600984087045186e-07, + "loss": 0.0865, + "num_tokens": 19260144.0, + "reward": 0.83953857421875, + "reward_std": 0.029122523963451385, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.04101225733757019, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.333, + "grad_norm": 2.0715537071228027, + "kl": 0.9633585251867771, + "learning_rate": 7.598273395784229e-07, + "loss": 0.0963, + "num_tokens": 19271688.0, + "reward": 0.85382080078125, + "reward_std": 0.019596872851252556, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.022785712033510208, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3332, + "grad_norm": 2.3885998725891113, + "kl": 1.0170930996537209, + "learning_rate": 7.59556165793906e-07, + "loss": 0.1017, + "num_tokens": 19283216.0, + "reward": 0.84002685546875, + "reward_std": 0.029614388942718506, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.031191272661089897, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3334, + "grad_norm": 2.056429624557495, + "kl": 0.8922125697135925, + "learning_rate": 7.592848874601963e-07, + "loss": 0.0892, + "num_tokens": 19294784.0, + "reward": 0.86541748046875, + "reward_std": 0.0233765821903944, + "rewards//mean": 0.86541748046875, + "rewards//std": 0.03360232710838318, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3336, + "grad_norm": 2.1614511013031006, + "kl": 1.369120266288519, + "learning_rate": 7.590135046865651e-07, + "loss": 0.1369, + "num_tokens": 19306344.0, + "reward": 0.86383056640625, + "reward_std": 0.025954999029636383, + "rewards//mean": 0.86383056640625, + "rewards//std": 0.0302486140280962, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3338, + "grad_norm": 2.244654417037964, + "kl": 1.4699707627296448, + "learning_rate": 7.587420175823252e-07, + "loss": 0.147, + "num_tokens": 19317880.0, + "reward": 0.8226318359375, + "reward_std": 0.020325634628534317, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.024227336049079895, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.334, + "grad_norm": 1.9979828596115112, + "kl": 0.9475029297173023, + "learning_rate": 7.584704262568314e-07, + "loss": 0.0948, + "num_tokens": 19329392.0, + "reward": 0.855224609375, + "reward_std": 0.020440051332116127, + "rewards//mean": 0.855224609375, + "rewards//std": 0.025694062933325768, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3342, + "grad_norm": 2.6101088523864746, + "kl": 0.9226338714361191, + "learning_rate": 7.581987308194809e-07, + "loss": 0.0923, + "num_tokens": 19340976.0, + "reward": 0.80242919921875, + "reward_std": 0.022830555215477943, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.025456642732024193, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3344, + "grad_norm": 2.303866386413574, + "kl": 1.1915860697627068, + "learning_rate": 7.579269313797125e-07, + "loss": 0.1192, + "num_tokens": 19352448.0, + "reward": 0.8358154296875, + "reward_std": 0.021579008549451828, + "rewards//mean": 0.8358154296875, + "rewards//std": 0.0306471548974514, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3346, + "grad_norm": 2.0542454719543457, + "kl": 1.2225299701094627, + "learning_rate": 7.576550280470071e-07, + "loss": 0.1223, + "num_tokens": 19364120.0, + "reward": 0.833740234375, + "reward_std": 0.02891112118959427, + "rewards//mean": 0.833740234375, + "rewards//std": 0.036063410341739655, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3348, + "grad_norm": 2.2736587524414062, + "kl": 1.2128590121865273, + "learning_rate": 7.573830209308872e-07, + "loss": 0.1213, + "num_tokens": 19375688.0, + "reward": 0.88037109375, + "reward_std": 0.027187587693333626, + "rewards//mean": 0.88037109375, + "rewards//std": 0.032796699553728104, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.335, + "grad_norm": 2.4608283042907715, + "kl": 1.114521212875843, + "learning_rate": 7.57110910140917e-07, + "loss": 0.1115, + "num_tokens": 19387248.0, + "reward": 0.8780517578125, + "reward_std": 0.03584162890911102, + "rewards//mean": 0.8780517578125, + "rewards//std": 0.041587937623262405, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3352, + "grad_norm": 2.7307686805725098, + "kl": 1.2429914101958275, + "learning_rate": 7.568386957867032e-07, + "loss": 0.1243, + "num_tokens": 19398832.0, + "reward": 0.86944580078125, + "reward_std": 0.031007347628474236, + "rewards//mean": 0.86944580078125, + "rewards//std": 0.034735582768917084, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3354, + "grad_norm": 2.268768310546875, + "kl": 1.113535262644291, + "learning_rate": 7.565663779778933e-07, + "loss": 0.1114, + "num_tokens": 19410360.0, + "reward": 0.86932373046875, + "reward_std": 0.03283742815256119, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.04013248160481453, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3356, + "grad_norm": 3.4893014430999756, + "kl": 1.2216964289546013, + "learning_rate": 7.562939568241771e-07, + "loss": 0.1222, + "num_tokens": 19421904.0, + "reward": 0.80426025390625, + "reward_std": 0.019742289558053017, + "rewards//mean": 0.80426025390625, + "rewards//std": 0.027936264872550964, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3358, + "grad_norm": 2.045588970184326, + "kl": 1.3207953497767448, + "learning_rate": 7.560214324352858e-07, + "loss": 0.1321, + "num_tokens": 19433472.0, + "reward": 0.84954833984375, + "reward_std": 0.02129722386598587, + "rewards//mean": 0.84954833984375, + "rewards//std": 0.02762129157781601, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.336, + "grad_norm": 2.2413277626037598, + "kl": 1.158900409936905, + "learning_rate": 7.55748804920992e-07, + "loss": 0.1159, + "num_tokens": 19445120.0, + "reward": 0.85711669921875, + "reward_std": 0.02331465110182762, + "rewards//mean": 0.85711669921875, + "rewards//std": 0.026836862787604332, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3362, + "grad_norm": 2.9440836906433105, + "kl": 1.6170477718114853, + "learning_rate": 7.554760743911103e-07, + "loss": 0.1617, + "num_tokens": 19456768.0, + "reward": 0.8546142578125, + "reward_std": 0.027163254097104073, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.03157744184136391, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3364, + "grad_norm": 2.954803705215454, + "kl": 1.744253620505333, + "learning_rate": 7.552032409554962e-07, + "loss": 0.1744, + "num_tokens": 19468296.0, + "reward": 0.81512451171875, + "reward_std": 0.019074706360697746, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.023888424038887024, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3366, + "grad_norm": 2.144840955734253, + "kl": 1.5837879478931427, + "learning_rate": 7.549303047240474e-07, + "loss": 0.1584, + "num_tokens": 19479840.0, + "reward": 0.81219482421875, + "reward_std": 0.022199168801307678, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.028393646702170372, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3368, + "grad_norm": 2.489920139312744, + "kl": 1.2401353269815445, + "learning_rate": 7.54657265806702e-07, + "loss": 0.124, + "num_tokens": 19491384.0, + "reward": 0.85894775390625, + "reward_std": 0.02790280431509018, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.033299583941698074, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.337, + "grad_norm": 2.564251661300659, + "kl": 1.363706760108471, + "learning_rate": 7.543841243134408e-07, + "loss": 0.1364, + "num_tokens": 19502952.0, + "reward": 0.8402099609375, + "reward_std": 0.02285730093717575, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.029990093782544136, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3372, + "grad_norm": 2.3911430835723877, + "kl": 1.6419494152069092, + "learning_rate": 7.541108803542845e-07, + "loss": 0.1642, + "num_tokens": 19514496.0, + "reward": 0.82464599609375, + "reward_std": 0.025527140125632286, + "rewards//mean": 0.82464599609375, + "rewards//std": 0.034807417541742325, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3374, + "grad_norm": 3.7110743522644043, + "kl": 1.5675870701670647, + "learning_rate": 7.538375340392961e-07, + "loss": 0.1568, + "num_tokens": 19526096.0, + "reward": 0.85137939453125, + "reward_std": 0.03343860059976578, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.04073449224233627, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3376, + "grad_norm": 2.2196497917175293, + "kl": 1.2328812777996063, + "learning_rate": 7.535640854785791e-07, + "loss": 0.1233, + "num_tokens": 19537640.0, + "reward": 0.84417724609375, + "reward_std": 0.024183396250009537, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.0295867957174778, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3378, + "grad_norm": 3.346951484680176, + "kl": 1.0342012867331505, + "learning_rate": 7.532905347822791e-07, + "loss": 0.1034, + "num_tokens": 19549192.0, + "reward": 0.85260009765625, + "reward_std": 0.024996325373649597, + "rewards//mean": 0.85260009765625, + "rewards//std": 0.04174937307834625, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.338, + "grad_norm": 2.203160524368286, + "kl": 1.3636800944805145, + "learning_rate": 7.530168820605818e-07, + "loss": 0.1364, + "num_tokens": 19560792.0, + "reward": 0.83502197265625, + "reward_std": 0.027433060109615326, + "rewards//mean": 0.83502197265625, + "rewards//std": 0.03924648091197014, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3382, + "grad_norm": 2.122709274291992, + "kl": 1.372876800596714, + "learning_rate": 7.527431274237149e-07, + "loss": 0.1373, + "num_tokens": 19572456.0, + "reward": 0.8397216796875, + "reward_std": 0.027389410883188248, + "rewards//mean": 0.8397216796875, + "rewards//std": 0.029173383489251137, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3384, + "grad_norm": 3.6173410415649414, + "kl": 1.2726735919713974, + "learning_rate": 7.524692709819463e-07, + "loss": 0.1273, + "num_tokens": 19584008.0, + "reward": 0.83624267578125, + "reward_std": 0.022356148809194565, + "rewards//mean": 0.83624267578125, + "rewards//std": 0.0271017923951149, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3386, + "grad_norm": 3.6583757400512695, + "kl": 1.2347675561904907, + "learning_rate": 7.521953128455855e-07, + "loss": 0.1235, + "num_tokens": 19595552.0, + "reward": 0.79998779296875, + "reward_std": 0.01885152794420719, + "rewards//mean": 0.79998779296875, + "rewards//std": 0.026293238624930382, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3388, + "grad_norm": 2.580787181854248, + "kl": 1.7076608315110207, + "learning_rate": 7.519212531249829e-07, + "loss": 0.1708, + "num_tokens": 19607088.0, + "reward": 0.81427001953125, + "reward_std": 0.03766436502337456, + "rewards//mean": 0.81427001953125, + "rewards//std": 0.04139214754104614, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.339, + "grad_norm": 2.487157106399536, + "kl": 1.5530758015811443, + "learning_rate": 7.516470919305298e-07, + "loss": 0.1553, + "num_tokens": 19618648.0, + "reward": 0.83270263671875, + "reward_std": 0.03615532070398331, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.047120075672864914, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3392, + "grad_norm": 1.8907105922698975, + "kl": 1.0405315421521664, + "learning_rate": 7.513728293726579e-07, + "loss": 0.1041, + "num_tokens": 19630144.0, + "reward": 0.8604736328125, + "reward_std": 0.020691538229584694, + "rewards//mean": 0.8604736328125, + "rewards//std": 0.03128461539745331, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3394, + "grad_norm": 3.1925766468048096, + "kl": 0.83034997433424, + "learning_rate": 7.510984655618406e-07, + "loss": 0.083, + "num_tokens": 19641720.0, + "reward": 0.869140625, + "reward_std": 0.027692146599292755, + "rewards//mean": 0.869140625, + "rewards//std": 0.03467413783073425, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3396, + "grad_norm": 2.1872477531433105, + "kl": 1.3415842279791832, + "learning_rate": 7.508240006085913e-07, + "loss": 0.1342, + "num_tokens": 19653352.0, + "reward": 0.8131103515625, + "reward_std": 0.019943006336688995, + "rewards//mean": 0.8131103515625, + "rewards//std": 0.026210565119981766, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3398, + "grad_norm": 2.5281693935394287, + "kl": 1.0085118412971497, + "learning_rate": 7.505494346234647e-07, + "loss": 0.1009, + "num_tokens": 19664856.0, + "reward": 0.862060546875, + "reward_std": 0.020905546844005585, + "rewards//mean": 0.862060546875, + "rewards//std": 0.02657448872923851, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.34, + "grad_norm": 2.2653045654296875, + "kl": 1.0201074928045273, + "learning_rate": 7.502747677170555e-07, + "loss": 0.102, + "num_tokens": 19676440.0, + "reward": 0.79345703125, + "reward_std": 0.02104242704808712, + "rewards//mean": 0.79345703125, + "rewards//std": 0.028742903843522072, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3402, + "grad_norm": 1.772116780281067, + "kl": 0.7976325489580631, + "learning_rate": 7.5e-07, + "loss": 0.0798, + "num_tokens": 19688008.0, + "reward": 0.8714599609375, + "reward_std": 0.01922345533967018, + "rewards//mean": 0.8714599609375, + "rewards//std": 0.02495615929365158, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3404, + "grad_norm": 2.822141170501709, + "kl": 1.0977128744125366, + "learning_rate": 7.497251315829743e-07, + "loss": 0.1098, + "num_tokens": 19699600.0, + "reward": 0.8653564453125, + "reward_std": 0.023819822818040848, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.0322265625, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3406, + "grad_norm": 2.353276014328003, + "kl": 1.2786153182387352, + "learning_rate": 7.494501625766955e-07, + "loss": 0.1279, + "num_tokens": 19711248.0, + "reward": 0.8487548828125, + "reward_std": 0.025982998311519623, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.028885535895824432, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3408, + "grad_norm": 2.375336170196533, + "kl": 1.1746254861354828, + "learning_rate": 7.491750930919212e-07, + "loss": 0.1175, + "num_tokens": 19722808.0, + "reward": 0.83184814453125, + "reward_std": 0.01755732297897339, + "rewards//mean": 0.83184814453125, + "rewards//std": 0.024601448327302933, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.341, + "grad_norm": 2.1103451251983643, + "kl": 1.3618728443980217, + "learning_rate": 7.488999232394491e-07, + "loss": 0.1362, + "num_tokens": 19734440.0, + "reward": 0.81927490234375, + "reward_std": 0.025896020233631134, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.030283624306321144, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3412, + "grad_norm": 2.241057872772217, + "kl": 1.0528841651976109, + "learning_rate": 7.486246531301177e-07, + "loss": 0.1053, + "num_tokens": 19746032.0, + "reward": 0.81561279296875, + "reward_std": 0.018034925684332848, + "rewards//mean": 0.81561279296875, + "rewards//std": 0.027491098269820213, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3414, + "grad_norm": 2.8775618076324463, + "kl": 1.0993666872382164, + "learning_rate": 7.483492828748056e-07, + "loss": 0.1099, + "num_tokens": 19757680.0, + "reward": 0.83636474609375, + "reward_std": 0.014125131070613861, + "rewards//mean": 0.83636474609375, + "rewards//std": 0.02398013137280941, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3416, + "grad_norm": 2.6733527183532715, + "kl": 1.0839085653424263, + "learning_rate": 7.480738125844322e-07, + "loss": 0.1084, + "num_tokens": 19769240.0, + "reward": 0.869384765625, + "reward_std": 0.03207353875041008, + "rewards//mean": 0.869384765625, + "rewards//std": 0.037233177572488785, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3418, + "grad_norm": 3.177396297454834, + "kl": 1.0015723556280136, + "learning_rate": 7.477982423699567e-07, + "loss": 0.1002, + "num_tokens": 19780832.0, + "reward": 0.857666015625, + "reward_std": 0.022191140800714493, + "rewards//mean": 0.857666015625, + "rewards//std": 0.027115821838378906, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.342, + "grad_norm": 2.0884203910827637, + "kl": 1.6100852638483047, + "learning_rate": 7.475225723423788e-07, + "loss": 0.161, + "num_tokens": 19792456.0, + "reward": 0.828857421875, + "reward_std": 0.02709764987230301, + "rewards//mean": 0.828857421875, + "rewards//std": 0.03088259883224964, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3422, + "grad_norm": 3.287003755569458, + "kl": 1.0825219228863716, + "learning_rate": 7.472468026127384e-07, + "loss": 0.1083, + "num_tokens": 19804160.0, + "reward": 0.81097412109375, + "reward_std": 0.019927995279431343, + "rewards//mean": 0.81097412109375, + "rewards//std": 0.024425454437732697, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3424, + "grad_norm": 2.2236557006835938, + "kl": 0.7801645211875439, + "learning_rate": 7.469709332921154e-07, + "loss": 0.078, + "num_tokens": 19815656.0, + "reward": 0.8280029296875, + "reward_std": 0.01929762214422226, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.04147421568632126, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3426, + "grad_norm": 2.2250115871429443, + "kl": 1.449027732014656, + "learning_rate": 7.4669496449163e-07, + "loss": 0.1449, + "num_tokens": 19827232.0, + "reward": 0.8345947265625, + "reward_std": 0.021069660782814026, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.02942345291376114, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3428, + "grad_norm": 2.1702709197998047, + "kl": 0.89957295358181, + "learning_rate": 7.464188963224427e-07, + "loss": 0.09, + "num_tokens": 19838776.0, + "reward": 0.8201904296875, + "reward_std": 0.012795337475836277, + "rewards//mean": 0.8201904296875, + "rewards//std": 0.026829317212104797, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.343, + "grad_norm": 2.522571325302124, + "kl": 0.8199459612369537, + "learning_rate": 7.461427288957531e-07, + "loss": 0.082, + "num_tokens": 19850320.0, + "reward": 0.85260009765625, + "reward_std": 0.01603105291724205, + "rewards//mean": 0.85260009765625, + "rewards//std": 0.026822756975889206, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3432, + "grad_norm": 2.48093318939209, + "kl": 1.3361146971583366, + "learning_rate": 7.45866462322802e-07, + "loss": 0.1336, + "num_tokens": 19861824.0, + "reward": 0.835693359375, + "reward_std": 0.01565816067159176, + "rewards//mean": 0.835693359375, + "rewards//std": 0.02233557030558586, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3434, + "grad_norm": 2.23061466217041, + "kl": 0.7897198684513569, + "learning_rate": 7.45590096714869e-07, + "loss": 0.079, + "num_tokens": 19873336.0, + "reward": 0.81280517578125, + "reward_std": 0.017740264534950256, + "rewards//mean": 0.81280517578125, + "rewards//std": 0.023159988224506378, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3436, + "grad_norm": 1.7233387231826782, + "kl": 1.0096617378294468, + "learning_rate": 7.453136321832745e-07, + "loss": 0.101, + "num_tokens": 19884872.0, + "reward": 0.87548828125, + "reward_std": 0.026616143062710762, + "rewards//mean": 0.87548828125, + "rewards//std": 0.04489828273653984, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3438, + "grad_norm": 2.7792346477508545, + "kl": 0.9570935145020485, + "learning_rate": 7.450370688393784e-07, + "loss": 0.0957, + "num_tokens": 19896416.0, + "reward": 0.79833984375, + "reward_std": 0.017836440354585648, + "rewards//mean": 0.79833984375, + "rewards//std": 0.033134669065475464, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.344, + "grad_norm": 2.302109479904175, + "kl": 1.399824857711792, + "learning_rate": 7.447604067945802e-07, + "loss": 0.14, + "num_tokens": 19908048.0, + "reward": 0.85455322265625, + "reward_std": 0.03312108665704727, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.04002256691455841, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3442, + "grad_norm": 2.4789602756500244, + "kl": 0.9345315992832184, + "learning_rate": 7.444836461603194e-07, + "loss": 0.0935, + "num_tokens": 19919816.0, + "reward": 0.8447265625, + "reward_std": 0.01931134983897209, + "rewards//mean": 0.8447265625, + "rewards//std": 0.022819651290774345, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3444, + "grad_norm": 2.931394577026367, + "kl": 1.113354355096817, + "learning_rate": 7.442067870480751e-07, + "loss": 0.1113, + "num_tokens": 19931328.0, + "reward": 0.852294921875, + "reward_std": 0.03478846698999405, + "rewards//mean": 0.852294921875, + "rewards//std": 0.04033081978559494, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3446, + "grad_norm": 2.440431594848633, + "kl": 1.1292430274188519, + "learning_rate": 7.439298295693663e-07, + "loss": 0.1129, + "num_tokens": 19942944.0, + "reward": 0.84808349609375, + "reward_std": 0.03242112696170807, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.045384481549263, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3448, + "grad_norm": 2.9342398643493652, + "kl": 1.3463728427886963, + "learning_rate": 7.436527738357513e-07, + "loss": 0.1346, + "num_tokens": 19954488.0, + "reward": 0.84698486328125, + "reward_std": 0.023034941405057907, + "rewards//mean": 0.84698486328125, + "rewards//std": 0.033105816692113876, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.345, + "grad_norm": 2.200618267059326, + "kl": 1.376644290983677, + "learning_rate": 7.433756199588282e-07, + "loss": 0.1377, + "num_tokens": 19966064.0, + "reward": 0.8009033203125, + "reward_std": 0.022766824811697006, + "rewards//mean": 0.8009033203125, + "rewards//std": 0.03265214338898659, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3452, + "grad_norm": 2.2011868953704834, + "kl": 0.8347810655832291, + "learning_rate": 7.430983680502343e-07, + "loss": 0.0835, + "num_tokens": 19977672.0, + "reward": 0.84429931640625, + "reward_std": 0.02514733001589775, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.03928426280617714, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3454, + "grad_norm": 2.5292625427246094, + "kl": 0.9744424670934677, + "learning_rate": 7.42821018221647e-07, + "loss": 0.0974, + "num_tokens": 19989360.0, + "reward": 0.83990478515625, + "reward_std": 0.02310396358370781, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.03209494799375534, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3456, + "grad_norm": 2.7063944339752197, + "kl": 1.3088148422539234, + "learning_rate": 7.425435705847825e-07, + "loss": 0.1309, + "num_tokens": 20000992.0, + "reward": 0.8538818359375, + "reward_std": 0.020589347928762436, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.025341391563415527, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3458, + "grad_norm": 1.8206895589828491, + "kl": 1.2820711135864258, + "learning_rate": 7.422660252513968e-07, + "loss": 0.1282, + "num_tokens": 20012576.0, + "reward": 0.81109619140625, + "reward_std": 0.021171707659959793, + "rewards//mean": 0.81109619140625, + "rewards//std": 0.03382100537419319, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.346, + "grad_norm": 2.5433554649353027, + "kl": 1.0850379019975662, + "learning_rate": 7.41988382333285e-07, + "loss": 0.1085, + "num_tokens": 20024200.0, + "reward": 0.85491943359375, + "reward_std": 0.02414305880665779, + "rewards//mean": 0.85491943359375, + "rewards//std": 0.03452620655298233, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3462, + "grad_norm": 3.9664835929870605, + "kl": 1.021358449012041, + "learning_rate": 7.417106419422818e-07, + "loss": 0.1021, + "num_tokens": 20035696.0, + "reward": 0.77752685546875, + "reward_std": 0.01875619776546955, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.030634989961981773, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3464, + "grad_norm": 2.70992374420166, + "kl": 1.1049550846219063, + "learning_rate": 7.41432804190261e-07, + "loss": 0.1105, + "num_tokens": 20047224.0, + "reward": 0.854736328125, + "reward_std": 0.02067694254219532, + "rewards//mean": 0.854736328125, + "rewards//std": 0.02591930888593197, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3466, + "grad_norm": 2.8674097061157227, + "kl": 1.468604639172554, + "learning_rate": 7.411548691891357e-07, + "loss": 0.1469, + "num_tokens": 20058808.0, + "reward": 0.85845947265625, + "reward_std": 0.027926668524742126, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.035281702876091, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3468, + "grad_norm": 2.042708396911621, + "kl": 1.020597793161869, + "learning_rate": 7.408768370508576e-07, + "loss": 0.1021, + "num_tokens": 20070456.0, + "reward": 0.8028564453125, + "reward_std": 0.014948856085538864, + "rewards//mean": 0.8028564453125, + "rewards//std": 0.0214402936398983, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.347, + "grad_norm": 3.6061432361602783, + "kl": 0.9610940590500832, + "learning_rate": 7.405987078874185e-07, + "loss": 0.0961, + "num_tokens": 20081992.0, + "reward": 0.871337890625, + "reward_std": 0.02606511302292347, + "rewards//mean": 0.871337890625, + "rewards//std": 0.03680792078375816, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3472, + "grad_norm": 2.1631524562835693, + "kl": 0.6311742961406708, + "learning_rate": 7.403204818108487e-07, + "loss": 0.0631, + "num_tokens": 20093584.0, + "reward": 0.8572998046875, + "reward_std": 0.019526630640029907, + "rewards//mean": 0.8572998046875, + "rewards//std": 0.031388960778713226, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3474, + "grad_norm": 2.1339948177337646, + "kl": 0.9311587139964104, + "learning_rate": 7.400421589332174e-07, + "loss": 0.0931, + "num_tokens": 20105104.0, + "reward": 0.88690185546875, + "reward_std": 0.019515302032232285, + "rewards//mean": 0.88690185546875, + "rewards//std": 0.03258131071925163, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3476, + "grad_norm": 2.5099270343780518, + "kl": 0.9511955231428146, + "learning_rate": 7.397637393666333e-07, + "loss": 0.0951, + "num_tokens": 20116816.0, + "reward": 0.83685302734375, + "reward_std": 0.01527322456240654, + "rewards//mean": 0.83685302734375, + "rewards//std": 0.021084588021039963, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3478, + "grad_norm": 2.043095111846924, + "kl": 0.6990199834108353, + "learning_rate": 7.394852232232436e-07, + "loss": 0.0699, + "num_tokens": 20128432.0, + "reward": 0.8184814453125, + "reward_std": 0.017498144879937172, + "rewards//mean": 0.8184814453125, + "rewards//std": 0.02567608840763569, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.348, + "grad_norm": 2.4979615211486816, + "kl": 1.158813439309597, + "learning_rate": 7.392066106152345e-07, + "loss": 0.1159, + "num_tokens": 20140024.0, + "reward": 0.8455810546875, + "reward_std": 0.028695186600089073, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.03799361735582352, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3482, + "grad_norm": 1.97088623046875, + "kl": 1.2483737021684647, + "learning_rate": 7.389279016548316e-07, + "loss": 0.1248, + "num_tokens": 20151560.0, + "reward": 0.820556640625, + "reward_std": 0.019206106662750244, + "rewards//mean": 0.820556640625, + "rewards//std": 0.029155993834137917, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3484, + "grad_norm": 2.6208860874176025, + "kl": 0.9671548493206501, + "learning_rate": 7.386490964542982e-07, + "loss": 0.0967, + "num_tokens": 20163128.0, + "reward": 0.84844970703125, + "reward_std": 0.02560563012957573, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.037539634853601456, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3486, + "grad_norm": 2.4746642112731934, + "kl": 1.2283164858818054, + "learning_rate": 7.383701951259375e-07, + "loss": 0.1228, + "num_tokens": 20174624.0, + "reward": 0.7999267578125, + "reward_std": 0.024233032017946243, + "rewards//mean": 0.7999267578125, + "rewards//std": 0.026725297793745995, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3488, + "grad_norm": 2.0407209396362305, + "kl": 1.1801616959273815, + "learning_rate": 7.380911977820906e-07, + "loss": 0.118, + "num_tokens": 20186248.0, + "reward": 0.8712158203125, + "reward_std": 0.028384394943714142, + "rewards//mean": 0.8712158203125, + "rewards//std": 0.03466693311929703, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.349, + "grad_norm": 2.2650036811828613, + "kl": 0.9199401214718819, + "learning_rate": 7.378121045351377e-07, + "loss": 0.092, + "num_tokens": 20197824.0, + "reward": 0.86492919921875, + "reward_std": 0.022736210376024246, + "rewards//mean": 0.86492919921875, + "rewards//std": 0.026021242141723633, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3492, + "grad_norm": 1.962618350982666, + "kl": 1.2914118096232414, + "learning_rate": 7.375329154974975e-07, + "loss": 0.1291, + "num_tokens": 20209384.0, + "reward": 0.87847900390625, + "reward_std": 0.02714006043970585, + "rewards//mean": 0.87847900390625, + "rewards//std": 0.03043420985341072, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3494, + "grad_norm": 2.016063928604126, + "kl": 0.8794427886605263, + "learning_rate": 7.372536307816272e-07, + "loss": 0.0879, + "num_tokens": 20220912.0, + "reward": 0.857666015625, + "reward_std": 0.021556008607149124, + "rewards//mean": 0.857666015625, + "rewards//std": 0.027803802862763405, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3496, + "grad_norm": 1.9443272352218628, + "kl": 1.0437804460525513, + "learning_rate": 7.369742505000231e-07, + "loss": 0.1044, + "num_tokens": 20232472.0, + "reward": 0.87493896484375, + "reward_std": 0.021434718742966652, + "rewards//mean": 0.87493896484375, + "rewards//std": 0.03417808189988136, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3498, + "grad_norm": 2.1749489307403564, + "kl": 0.8937936685979366, + "learning_rate": 7.366947747652191e-07, + "loss": 0.0894, + "num_tokens": 20243976.0, + "reward": 0.8009033203125, + "reward_std": 0.020434053614735603, + "rewards//mean": 0.8009033203125, + "rewards//std": 0.03377515450119972, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.35, + "grad_norm": 2.729098081588745, + "kl": 0.7871437221765518, + "learning_rate": 7.364152036897882e-07, + "loss": 0.0787, + "num_tokens": 20255528.0, + "reward": 0.8861083984375, + "reward_std": 0.02496851049363613, + "rewards//mean": 0.8861083984375, + "rewards//std": 0.03222092613577843, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3502, + "grad_norm": 2.047386884689331, + "kl": 1.2210485190153122, + "learning_rate": 7.361355373863413e-07, + "loss": 0.1221, + "num_tokens": 20267040.0, + "reward": 0.84796142578125, + "reward_std": 0.02364504709839821, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.031499382108449936, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3504, + "grad_norm": 2.585280418395996, + "kl": 1.2616507187485695, + "learning_rate": 7.358557759675284e-07, + "loss": 0.1262, + "num_tokens": 20278704.0, + "reward": 0.81365966796875, + "reward_std": 0.021809715777635574, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.025360126048326492, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3506, + "grad_norm": 2.4027953147888184, + "kl": 1.497279442846775, + "learning_rate": 7.35575919546037e-07, + "loss": 0.1497, + "num_tokens": 20290456.0, + "reward": 0.752685546875, + "reward_std": 0.01595856435596943, + "rewards//mean": 0.752685546875, + "rewards//std": 0.02671085111796856, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3508, + "grad_norm": 1.9706050157546997, + "kl": 1.4648128375411034, + "learning_rate": 7.352959682345935e-07, + "loss": 0.1465, + "num_tokens": 20302024.0, + "reward": 0.86529541015625, + "reward_std": 0.02779914066195488, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.03876369073987007, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.351, + "grad_norm": 2.5034093856811523, + "kl": 1.0132259652018547, + "learning_rate": 7.350159221459621e-07, + "loss": 0.1013, + "num_tokens": 20313600.0, + "reward": 0.86248779296875, + "reward_std": 0.023317504674196243, + "rewards//mean": 0.86248779296875, + "rewards//std": 0.0296445544809103, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3512, + "grad_norm": 4.494880676269531, + "kl": 1.7272873595356941, + "learning_rate": 7.347357813929454e-07, + "loss": 0.1727, + "num_tokens": 20325056.0, + "reward": 0.76641845703125, + "reward_std": 0.01419996377080679, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.019798271358013153, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3514, + "grad_norm": 3.15852689743042, + "kl": 1.0805333852767944, + "learning_rate": 7.344555460883839e-07, + "loss": 0.1081, + "num_tokens": 20336568.0, + "reward": 0.85992431640625, + "reward_std": 0.01924937218427658, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.02393716759979725, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3516, + "grad_norm": 3.005446672439575, + "kl": 1.3546732664108276, + "learning_rate": 7.341752163451567e-07, + "loss": 0.1355, + "num_tokens": 20348152.0, + "reward": 0.85430908203125, + "reward_std": 0.01827901415526867, + "rewards//mean": 0.85430908203125, + "rewards//std": 0.02883695624768734, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3518, + "grad_norm": 2.0632822513580322, + "kl": 0.96230448782444, + "learning_rate": 7.338947922761802e-07, + "loss": 0.0962, + "num_tokens": 20359728.0, + "reward": 0.8262939453125, + "reward_std": 0.01381033007055521, + "rewards//mean": 0.8262939453125, + "rewards//std": 0.020669566467404366, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.352, + "grad_norm": 1.8050286769866943, + "kl": 0.9001716673374176, + "learning_rate": 7.336142739944093e-07, + "loss": 0.09, + "num_tokens": 20371392.0, + "reward": 0.85223388671875, + "reward_std": 0.021405503153800964, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.02610255777835846, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3522, + "grad_norm": 1.8168693780899048, + "kl": 0.6770295202732086, + "learning_rate": 7.333336616128369e-07, + "loss": 0.0677, + "num_tokens": 20382952.0, + "reward": 0.85931396484375, + "reward_std": 0.016092445701360703, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.02188222110271454, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3524, + "grad_norm": 2.284956693649292, + "kl": 0.9277826845645905, + "learning_rate": 7.330529552444932e-07, + "loss": 0.0928, + "num_tokens": 20394568.0, + "reward": 0.8460693359375, + "reward_std": 0.015896093100309372, + "rewards//mean": 0.8460693359375, + "rewards//std": 0.027958810329437256, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3526, + "grad_norm": 2.2893881797790527, + "kl": 1.4933704659342766, + "learning_rate": 7.327721550024475e-07, + "loss": 0.1493, + "num_tokens": 20406200.0, + "reward": 0.841064453125, + "reward_std": 0.028833352029323578, + "rewards//mean": 0.841064453125, + "rewards//std": 0.031295500695705414, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3528, + "grad_norm": 2.0647830963134766, + "kl": 1.1399365365505219, + "learning_rate": 7.324912609998053e-07, + "loss": 0.114, + "num_tokens": 20417816.0, + "reward": 0.84466552734375, + "reward_std": 0.030580192804336548, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.04001651704311371, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.353, + "grad_norm": 2.4549529552459717, + "kl": 1.1437233574688435, + "learning_rate": 7.322102733497109e-07, + "loss": 0.1144, + "num_tokens": 20429408.0, + "reward": 0.83380126953125, + "reward_std": 0.022027086466550827, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.03337087482213974, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3532, + "grad_norm": 2.748950719833374, + "kl": 1.1527586057782173, + "learning_rate": 7.319291921653463e-07, + "loss": 0.1153, + "num_tokens": 20440976.0, + "reward": 0.80645751953125, + "reward_std": 0.015260763466358185, + "rewards//mean": 0.80645751953125, + "rewards//std": 0.023478075861930847, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3534, + "grad_norm": 2.011247396469116, + "kl": 0.8996926359832287, + "learning_rate": 7.316480175599308e-07, + "loss": 0.09, + "num_tokens": 20452560.0, + "reward": 0.83563232421875, + "reward_std": 0.01760326698422432, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.022769762203097343, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3536, + "grad_norm": 2.821275234222412, + "kl": 1.0682964138686657, + "learning_rate": 7.313667496467215e-07, + "loss": 0.1068, + "num_tokens": 20464088.0, + "reward": 0.84625244140625, + "reward_std": 0.017040016129612923, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.02679678425192833, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3538, + "grad_norm": 2.147867202758789, + "kl": 1.2828566879034042, + "learning_rate": 7.310853885390132e-07, + "loss": 0.1283, + "num_tokens": 20475632.0, + "reward": 0.856201171875, + "reward_std": 0.02524542063474655, + "rewards//mean": 0.856201171875, + "rewards//std": 0.03432927653193474, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.354, + "grad_norm": 2.4573655128479004, + "kl": 1.1945386826992035, + "learning_rate": 7.308039343501379e-07, + "loss": 0.1195, + "num_tokens": 20487272.0, + "reward": 0.83685302734375, + "reward_std": 0.030841413885354996, + "rewards//mean": 0.83685302734375, + "rewards//std": 0.04422563314437866, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3542, + "grad_norm": 2.137849807739258, + "kl": 0.9280127212405205, + "learning_rate": 7.305223871934656e-07, + "loss": 0.0928, + "num_tokens": 20498880.0, + "reward": 0.86053466796875, + "reward_std": 0.026122167706489563, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.03331821411848068, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3544, + "grad_norm": 2.7699921131134033, + "kl": 1.4857338517904282, + "learning_rate": 7.302407471824033e-07, + "loss": 0.1486, + "num_tokens": 20510504.0, + "reward": 0.8447265625, + "reward_std": 0.027108684182167053, + "rewards//mean": 0.8447265625, + "rewards//std": 0.031719256192445755, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3546, + "grad_norm": 2.165132999420166, + "kl": 1.4478738233447075, + "learning_rate": 7.299590144303954e-07, + "loss": 0.1448, + "num_tokens": 20522048.0, + "reward": 0.85516357421875, + "reward_std": 0.02843414805829525, + "rewards//mean": 0.85516357421875, + "rewards//std": 0.036267202347517014, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3548, + "grad_norm": 2.8185417652130127, + "kl": 0.7724824883043766, + "learning_rate": 7.296771890509242e-07, + "loss": 0.0772, + "num_tokens": 20533568.0, + "reward": 0.83233642578125, + "reward_std": 0.021651502698659897, + "rewards//mean": 0.83233642578125, + "rewards//std": 0.028563741594552994, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.355, + "grad_norm": 2.3654441833496094, + "kl": 1.411547489464283, + "learning_rate": 7.293952711575086e-07, + "loss": 0.1412, + "num_tokens": 20545096.0, + "reward": 0.8251953125, + "reward_std": 0.030635429546236992, + "rewards//mean": 0.8251953125, + "rewards//std": 0.03312370181083679, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3552, + "grad_norm": 1.9886846542358398, + "kl": 1.1729403212666512, + "learning_rate": 7.291132608637052e-07, + "loss": 0.1173, + "num_tokens": 20556656.0, + "reward": 0.8760986328125, + "reward_std": 0.024694111198186874, + "rewards//mean": 0.8760986328125, + "rewards//std": 0.031392816454172134, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3554, + "grad_norm": 2.2507991790771484, + "kl": 1.356358379125595, + "learning_rate": 7.288311582831077e-07, + "loss": 0.1356, + "num_tokens": 20568296.0, + "reward": 0.8206787109375, + "reward_std": 0.02057044766843319, + "rewards//mean": 0.8206787109375, + "rewards//std": 0.021426167339086533, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3556, + "grad_norm": 2.2739486694335938, + "kl": 1.2267119884490967, + "learning_rate": 7.285489635293471e-07, + "loss": 0.1227, + "num_tokens": 20579840.0, + "reward": 0.83984375, + "reward_std": 0.03013153001666069, + "rewards//mean": 0.83984375, + "rewards//std": 0.03176504001021385, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3558, + "grad_norm": 2.4156572818756104, + "kl": 0.7271563783288002, + "learning_rate": 7.282666767160912e-07, + "loss": 0.0727, + "num_tokens": 20591448.0, + "reward": 0.79052734375, + "reward_std": 0.017893651500344276, + "rewards//mean": 0.79052734375, + "rewards//std": 0.031570009887218475, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.356, + "grad_norm": 2.060361385345459, + "kl": 1.3369466960430145, + "learning_rate": 7.279842979570453e-07, + "loss": 0.1337, + "num_tokens": 20603088.0, + "reward": 0.82562255859375, + "reward_std": 0.022543733939528465, + "rewards//mean": 0.82562255859375, + "rewards//std": 0.02696796879172325, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3562, + "grad_norm": 2.415576696395874, + "kl": 0.8832765780389309, + "learning_rate": 7.277018273659516e-07, + "loss": 0.0883, + "num_tokens": 20614632.0, + "reward": 0.84405517578125, + "reward_std": 0.01693926565349102, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.02586720511317253, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3564, + "grad_norm": 1.7977432012557983, + "kl": 1.5613547638058662, + "learning_rate": 7.274192650565889e-07, + "loss": 0.1561, + "num_tokens": 20626184.0, + "reward": 0.87445068359375, + "reward_std": 0.022333696484565735, + "rewards//mean": 0.87445068359375, + "rewards//std": 0.027838557958602905, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3566, + "grad_norm": 3.5203678607940674, + "kl": 1.1929401867091656, + "learning_rate": 7.271366111427734e-07, + "loss": 0.1193, + "num_tokens": 20637808.0, + "reward": 0.89630126953125, + "reward_std": 0.030496250838041306, + "rewards//mean": 0.89630126953125, + "rewards//std": 0.03273334726691246, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3568, + "grad_norm": 2.591165781021118, + "kl": 1.09410510212183, + "learning_rate": 7.26853865738358e-07, + "loss": 0.1094, + "num_tokens": 20649400.0, + "reward": 0.80950927734375, + "reward_std": 0.02625853195786476, + "rewards//mean": 0.80950927734375, + "rewards//std": 0.03272455930709839, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.357, + "grad_norm": 4.302237510681152, + "kl": 1.6835313849151134, + "learning_rate": 7.265710289572328e-07, + "loss": 0.1684, + "num_tokens": 20660912.0, + "reward": 0.82122802734375, + "reward_std": 0.023625988513231277, + "rewards//mean": 0.82122802734375, + "rewards//std": 0.029730726033449173, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3572, + "grad_norm": 3.791259288787842, + "kl": 1.0449153408408165, + "learning_rate": 7.262881009133241e-07, + "loss": 0.1045, + "num_tokens": 20672488.0, + "reward": 0.87640380859375, + "reward_std": 0.024104533717036247, + "rewards//mean": 0.87640380859375, + "rewards//std": 0.028436798602342606, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3574, + "grad_norm": 2.0449440479278564, + "kl": 1.0596325621008873, + "learning_rate": 7.260050817205955e-07, + "loss": 0.106, + "num_tokens": 20684032.0, + "reward": 0.88165283203125, + "reward_std": 0.031078336760401726, + "rewards//mean": 0.88165283203125, + "rewards//std": 0.03530186414718628, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3576, + "grad_norm": 2.696906805038452, + "kl": 0.8477588295936584, + "learning_rate": 7.25721971493047e-07, + "loss": 0.0848, + "num_tokens": 20695520.0, + "reward": 0.85382080078125, + "reward_std": 0.024488236755132675, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.03293895721435547, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3578, + "grad_norm": 3.0361266136169434, + "kl": 0.9231469333171844, + "learning_rate": 7.254387703447153e-07, + "loss": 0.0923, + "num_tokens": 20707048.0, + "reward": 0.80401611328125, + "reward_std": 0.014304598793387413, + "rewards//mean": 0.80401611328125, + "rewards//std": 0.02342255972325802, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.358, + "grad_norm": 2.349731922149658, + "kl": 1.2290288507938385, + "learning_rate": 7.25155478389674e-07, + "loss": 0.1229, + "num_tokens": 20718624.0, + "reward": 0.81512451171875, + "reward_std": 0.020725928246974945, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.028476694598793983, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3582, + "grad_norm": 5.824702739715576, + "kl": 1.2179707288742065, + "learning_rate": 7.248720957420329e-07, + "loss": 0.1218, + "num_tokens": 20730240.0, + "reward": 0.76983642578125, + "reward_std": 0.019481122493743896, + "rewards//mean": 0.76983642578125, + "rewards//std": 0.024796348065137863, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3584, + "grad_norm": 2.4881951808929443, + "kl": 1.576691411435604, + "learning_rate": 7.245886225159386e-07, + "loss": 0.1577, + "num_tokens": 20741800.0, + "reward": 0.8367919921875, + "reward_std": 0.025014162063598633, + "rewards//mean": 0.8367919921875, + "rewards//std": 0.04105454683303833, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3586, + "grad_norm": 1.9410139322280884, + "kl": 0.9574297592043877, + "learning_rate": 7.243050588255737e-07, + "loss": 0.0957, + "num_tokens": 20753360.0, + "reward": 0.85772705078125, + "reward_std": 0.019826620817184448, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.024703379720449448, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3588, + "grad_norm": 1.9923369884490967, + "kl": 1.3719453066587448, + "learning_rate": 7.240214047851581e-07, + "loss": 0.1372, + "num_tokens": 20764936.0, + "reward": 0.86798095703125, + "reward_std": 0.02498331479728222, + "rewards//mean": 0.86798095703125, + "rewards//std": 0.03216797113418579, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.359, + "grad_norm": 2.115668535232544, + "kl": 1.197093553841114, + "learning_rate": 7.237376605089476e-07, + "loss": 0.1197, + "num_tokens": 20776544.0, + "reward": 0.8704833984375, + "reward_std": 0.02571459859609604, + "rewards//mean": 0.8704833984375, + "rewards//std": 0.033690787851810455, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3592, + "grad_norm": 2.406409502029419, + "kl": 1.7990645617246628, + "learning_rate": 7.234538261112341e-07, + "loss": 0.1799, + "num_tokens": 20788072.0, + "reward": 0.8447265625, + "reward_std": 0.03887183964252472, + "rewards//mean": 0.8447265625, + "rewards//std": 0.04473885893821716, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3594, + "grad_norm": 2.0437893867492676, + "kl": 0.7303189337253571, + "learning_rate": 7.23169901706346e-07, + "loss": 0.073, + "num_tokens": 20799608.0, + "reward": 0.85052490234375, + "reward_std": 0.015577919781208038, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.02217632532119751, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3596, + "grad_norm": 2.392359733581543, + "kl": 1.2026055380702019, + "learning_rate": 7.228858874086484e-07, + "loss": 0.1203, + "num_tokens": 20811120.0, + "reward": 0.82037353515625, + "reward_std": 0.017315005883574486, + "rewards//mean": 0.82037353515625, + "rewards//std": 0.02210248075425625, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3598, + "grad_norm": 2.4181113243103027, + "kl": 1.3697445541620255, + "learning_rate": 7.226017833325419e-07, + "loss": 0.137, + "num_tokens": 20822624.0, + "reward": 0.8482666015625, + "reward_std": 0.02427486702799797, + "rewards//mean": 0.8482666015625, + "rewards//std": 0.03343822807073593, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.36, + "grad_norm": 1.9573493003845215, + "kl": 0.9237590841948986, + "learning_rate": 7.223175895924637e-07, + "loss": 0.0924, + "num_tokens": 20834136.0, + "reward": 0.8331298828125, + "reward_std": 0.014411898329854012, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.0208271574229002, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3602, + "grad_norm": 4.320690631866455, + "kl": 1.362451121211052, + "learning_rate": 7.220333063028871e-07, + "loss": 0.1362, + "num_tokens": 20845592.0, + "reward": 0.79473876953125, + "reward_std": 0.020523782819509506, + "rewards//mean": 0.79473876953125, + "rewards//std": 0.023395400494337082, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3604, + "grad_norm": 3.5911426544189453, + "kl": 1.4435570761561394, + "learning_rate": 7.217489335783211e-07, + "loss": 0.1444, + "num_tokens": 20857320.0, + "reward": 0.779541015625, + "reward_std": 0.025481052696704865, + "rewards//mean": 0.779541015625, + "rewards//std": 0.03246547281742096, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3606, + "grad_norm": 2.538050413131714, + "kl": 1.1679005362093449, + "learning_rate": 7.214644715333114e-07, + "loss": 0.1168, + "num_tokens": 20868856.0, + "reward": 0.8111572265625, + "reward_std": 0.024292390793561935, + "rewards//mean": 0.8111572265625, + "rewards//std": 0.027832916006445885, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3608, + "grad_norm": 2.1540310382843018, + "kl": 0.9014993757009506, + "learning_rate": 7.211799202824388e-07, + "loss": 0.0901, + "num_tokens": 20880424.0, + "reward": 0.84979248046875, + "reward_std": 0.018111206591129303, + "rewards//mean": 0.84979248046875, + "rewards//std": 0.02453737147152424, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.361, + "grad_norm": 2.713545799255371, + "kl": 0.9371133744716644, + "learning_rate": 7.20895279940321e-07, + "loss": 0.0937, + "num_tokens": 20891952.0, + "reward": 0.8641357421875, + "reward_std": 0.025076687335968018, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.034670423716306686, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3612, + "grad_norm": 2.012901782989502, + "kl": 0.8549001961946487, + "learning_rate": 7.206105506216106e-07, + "loss": 0.0855, + "num_tokens": 20903504.0, + "reward": 0.89111328125, + "reward_std": 0.021621763706207275, + "rewards//mean": 0.89111328125, + "rewards//std": 0.034291334450244904, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3614, + "grad_norm": 1.989739179611206, + "kl": 1.6368464678525925, + "learning_rate": 7.203257324409971e-07, + "loss": 0.1637, + "num_tokens": 20915040.0, + "reward": 0.86016845703125, + "reward_std": 0.03878522291779518, + "rewards//mean": 0.86016845703125, + "rewards//std": 0.054515302181243896, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3616, + "grad_norm": 3.076871633529663, + "kl": 1.4092880263924599, + "learning_rate": 7.200408255132045e-07, + "loss": 0.1409, + "num_tokens": 20926520.0, + "reward": 0.83135986328125, + "reward_std": 0.024908646941184998, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.03269355371594429, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3618, + "grad_norm": 2.421795606613159, + "kl": 1.3219827339053154, + "learning_rate": 7.19755829952994e-07, + "loss": 0.1322, + "num_tokens": 20938160.0, + "reward": 0.839599609375, + "reward_std": 0.022169828414916992, + "rewards//mean": 0.839599609375, + "rewards//std": 0.03227842599153519, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.362, + "grad_norm": 2.0979690551757812, + "kl": 1.7357230260968208, + "learning_rate": 7.194707458751615e-07, + "loss": 0.1736, + "num_tokens": 20949800.0, + "reward": 0.81524658203125, + "reward_std": 0.034807365387678146, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.03792796656489372, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3622, + "grad_norm": 1.8483836650848389, + "kl": 1.2176254279911518, + "learning_rate": 7.191855733945386e-07, + "loss": 0.1218, + "num_tokens": 20961608.0, + "reward": 0.87701416015625, + "reward_std": 0.025760218501091003, + "rewards//mean": 0.87701416015625, + "rewards//std": 0.03494543954730034, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3624, + "grad_norm": 2.2761688232421875, + "kl": 1.103126622736454, + "learning_rate": 7.189003126259931e-07, + "loss": 0.1103, + "num_tokens": 20973264.0, + "reward": 0.7974853515625, + "reward_std": 0.01822749152779579, + "rewards//mean": 0.7974853515625, + "rewards//std": 0.02570672519505024, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3626, + "grad_norm": 2.257458448410034, + "kl": 1.1829032078385353, + "learning_rate": 7.186149636844279e-07, + "loss": 0.1183, + "num_tokens": 20984928.0, + "reward": 0.83563232421875, + "reward_std": 0.02465815097093582, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.03171394765377045, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3628, + "grad_norm": 2.6935060024261475, + "kl": 0.900083377957344, + "learning_rate": 7.183295266847814e-07, + "loss": 0.09, + "num_tokens": 20996488.0, + "reward": 0.8341064453125, + "reward_std": 0.023994866758584976, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.03132716938853264, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.363, + "grad_norm": 2.7139711380004883, + "kl": 1.190915547311306, + "learning_rate": 7.180440017420276e-07, + "loss": 0.1191, + "num_tokens": 21008000.0, + "reward": 0.88232421875, + "reward_std": 0.027387846261262894, + "rewards//mean": 0.88232421875, + "rewards//std": 0.03336775675415993, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3632, + "grad_norm": 2.6228814125061035, + "kl": 1.2799627669155598, + "learning_rate": 7.177583889711762e-07, + "loss": 0.128, + "num_tokens": 21019592.0, + "reward": 0.8839111328125, + "reward_std": 0.027077561244368553, + "rewards//mean": 0.8839111328125, + "rewards//std": 0.03611437603831291, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3634, + "grad_norm": 2.0796706676483154, + "kl": 1.1000299975275993, + "learning_rate": 7.174726884872715e-07, + "loss": 0.11, + "num_tokens": 21031104.0, + "reward": 0.86212158203125, + "reward_std": 0.027495183050632477, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.042253587394952774, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3636, + "grad_norm": 2.147951126098633, + "kl": 1.8142893761396408, + "learning_rate": 7.17186900405394e-07, + "loss": 0.1814, + "num_tokens": 21042624.0, + "reward": 0.83056640625, + "reward_std": 0.027857396751642227, + "rewards//mean": 0.83056640625, + "rewards//std": 0.03869807720184326, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3638, + "grad_norm": 2.226424217224121, + "kl": 0.7788757272064686, + "learning_rate": 7.169010248406588e-07, + "loss": 0.0779, + "num_tokens": 21054176.0, + "reward": 0.85382080078125, + "reward_std": 0.014136772602796555, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.023290345445275307, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.364, + "grad_norm": 2.3745293617248535, + "kl": 1.4284968748688698, + "learning_rate": 7.16615061908217e-07, + "loss": 0.1428, + "num_tokens": 21065784.0, + "reward": 0.84442138671875, + "reward_std": 0.028157565742731094, + "rewards//mean": 0.84442138671875, + "rewards//std": 0.03884834051132202, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3642, + "grad_norm": 2.6427674293518066, + "kl": 0.9856915473937988, + "learning_rate": 7.163290117232541e-07, + "loss": 0.0986, + "num_tokens": 21077288.0, + "reward": 0.8623046875, + "reward_std": 0.023378441110253334, + "rewards//mean": 0.8623046875, + "rewards//std": 0.03257811442017555, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3644, + "grad_norm": 5.235077381134033, + "kl": 0.9371802806854248, + "learning_rate": 7.160428744009912e-07, + "loss": 0.0937, + "num_tokens": 21088840.0, + "reward": 0.7926025390625, + "reward_std": 0.015834368765354156, + "rewards//mean": 0.7926025390625, + "rewards//std": 0.021505150943994522, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3646, + "grad_norm": 2.4720630645751953, + "kl": 1.5931561291217804, + "learning_rate": 7.157566500566842e-07, + "loss": 0.1593, + "num_tokens": 21100424.0, + "reward": 0.827392578125, + "reward_std": 0.02814549393951893, + "rewards//mean": 0.827392578125, + "rewards//std": 0.03055935725569725, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3648, + "grad_norm": 2.719985246658325, + "kl": 1.124546691775322, + "learning_rate": 7.154703388056244e-07, + "loss": 0.1125, + "num_tokens": 21111944.0, + "reward": 0.8492431640625, + "reward_std": 0.02643207460641861, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.034186773002147675, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.365, + "grad_norm": 2.5736167430877686, + "kl": 1.2172156162559986, + "learning_rate": 7.15183940763138e-07, + "loss": 0.1217, + "num_tokens": 21123504.0, + "reward": 0.82073974609375, + "reward_std": 0.01704571023583412, + "rewards//mean": 0.82073974609375, + "rewards//std": 0.02516297809779644, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3652, + "grad_norm": 3.3437411785125732, + "kl": 1.7058381289243698, + "learning_rate": 7.148974560445858e-07, + "loss": 0.1706, + "num_tokens": 21135112.0, + "reward": 0.8101806640625, + "reward_std": 0.021713092923164368, + "rewards//mean": 0.8101806640625, + "rewards//std": 0.03600859269499779, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3654, + "grad_norm": 3.3206422328948975, + "kl": 1.0943292677402496, + "learning_rate": 7.146108847653641e-07, + "loss": 0.1094, + "num_tokens": 21146640.0, + "reward": 0.83856201171875, + "reward_std": 0.01721915975213051, + "rewards//mean": 0.83856201171875, + "rewards//std": 0.018401801586151123, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3656, + "grad_norm": 2.7718751430511475, + "kl": 1.300773099064827, + "learning_rate": 7.143242270409037e-07, + "loss": 0.1301, + "num_tokens": 21158320.0, + "reward": 0.83172607421875, + "reward_std": 0.024241294711828232, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.03455732390284538, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3658, + "grad_norm": 1.9105443954467773, + "kl": 1.6826754435896873, + "learning_rate": 7.140374829866702e-07, + "loss": 0.1683, + "num_tokens": 21169840.0, + "reward": 0.869140625, + "reward_std": 0.03197915852069855, + "rewards//mean": 0.869140625, + "rewards//std": 0.03418168053030968, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.366, + "grad_norm": 2.991119384765625, + "kl": 1.163110926747322, + "learning_rate": 7.137506527181643e-07, + "loss": 0.1163, + "num_tokens": 21181424.0, + "reward": 0.76171875, + "reward_std": 0.015834342688322067, + "rewards//mean": 0.76171875, + "rewards//std": 0.020681647583842278, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3662, + "grad_norm": 2.727816343307495, + "kl": 1.794710785150528, + "learning_rate": 7.134637363509209e-07, + "loss": 0.1795, + "num_tokens": 21192968.0, + "reward": 0.80023193359375, + "reward_std": 0.028632547706365585, + "rewards//mean": 0.80023193359375, + "rewards//std": 0.03420907258987427, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3664, + "grad_norm": 3.1013588905334473, + "kl": 1.1747637502849102, + "learning_rate": 7.131767340005101e-07, + "loss": 0.1175, + "num_tokens": 21204536.0, + "reward": 0.78204345703125, + "reward_std": 0.020896028727293015, + "rewards//mean": 0.78204345703125, + "rewards//std": 0.02792813442647457, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3666, + "grad_norm": 2.362016201019287, + "kl": 1.2459346428513527, + "learning_rate": 7.128896457825363e-07, + "loss": 0.1246, + "num_tokens": 21216176.0, + "reward": 0.848388671875, + "reward_std": 0.022075079381465912, + "rewards//mean": 0.848388671875, + "rewards//std": 0.02867857925593853, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3668, + "grad_norm": 2.1927387714385986, + "kl": 1.170629907399416, + "learning_rate": 7.126024718126387e-07, + "loss": 0.1171, + "num_tokens": 21227824.0, + "reward": 0.83929443359375, + "reward_std": 0.02976852096617222, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.03572597727179527, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.367, + "grad_norm": 2.2330572605133057, + "kl": 1.4790634103119373, + "learning_rate": 7.123152122064908e-07, + "loss": 0.1479, + "num_tokens": 21239328.0, + "reward": 0.85003662109375, + "reward_std": 0.03860097378492355, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.04932039603590965, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3672, + "grad_norm": 2.3347222805023193, + "kl": 1.3087795227766037, + "learning_rate": 7.120278670798009e-07, + "loss": 0.1309, + "num_tokens": 21250880.0, + "reward": 0.83306884765625, + "reward_std": 0.025860385969281197, + "rewards//mean": 0.83306884765625, + "rewards//std": 0.03740430250763893, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3674, + "grad_norm": 2.1152026653289795, + "kl": 1.3447279930114746, + "learning_rate": 7.117404365483115e-07, + "loss": 0.1345, + "num_tokens": 21262560.0, + "reward": 0.84967041015625, + "reward_std": 0.026529191061854362, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.03486911579966545, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3676, + "grad_norm": 1.8147046566009521, + "kl": 0.9853770695626736, + "learning_rate": 7.114529207277995e-07, + "loss": 0.0985, + "num_tokens": 21274160.0, + "reward": 0.848388671875, + "reward_std": 0.022970695048570633, + "rewards//mean": 0.848388671875, + "rewards//std": 0.030176525935530663, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3678, + "grad_norm": 2.5520293712615967, + "kl": 1.039154201745987, + "learning_rate": 7.111653197340764e-07, + "loss": 0.1039, + "num_tokens": 21285712.0, + "reward": 0.78887939453125, + "reward_std": 0.011490454897284508, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.01679679937660694, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.368, + "grad_norm": 2.012941598892212, + "kl": 1.5621055737137794, + "learning_rate": 7.108776336829876e-07, + "loss": 0.1562, + "num_tokens": 21297200.0, + "reward": 0.84454345703125, + "reward_std": 0.027528870850801468, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.032683365046978, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3682, + "grad_norm": 2.648344039916992, + "kl": 0.9574632868170738, + "learning_rate": 7.105898626904134e-07, + "loss": 0.0957, + "num_tokens": 21308776.0, + "reward": 0.82373046875, + "reward_std": 0.0180647112429142, + "rewards//mean": 0.82373046875, + "rewards//std": 0.02836117520928383, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3684, + "grad_norm": 2.5742061138153076, + "kl": 0.974810291081667, + "learning_rate": 7.103020068722674e-07, + "loss": 0.0975, + "num_tokens": 21320416.0, + "reward": 0.78436279296875, + "reward_std": 0.0191167201846838, + "rewards//mean": 0.78436279296875, + "rewards//std": 0.031483039259910583, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3686, + "grad_norm": 2.0233304500579834, + "kl": 0.8074983917176723, + "learning_rate": 7.100140663444984e-07, + "loss": 0.0807, + "num_tokens": 21332032.0, + "reward": 0.853515625, + "reward_std": 0.01974542997777462, + "rewards//mean": 0.853515625, + "rewards//std": 0.04040956124663353, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3688, + "grad_norm": 2.200218677520752, + "kl": 1.2285392694175243, + "learning_rate": 7.097260412230885e-07, + "loss": 0.1229, + "num_tokens": 21343608.0, + "reward": 0.84820556640625, + "reward_std": 0.025961916893720627, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.0303605068475008, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.369, + "grad_norm": 3.2463595867156982, + "kl": 1.5310818329453468, + "learning_rate": 7.094379316240544e-07, + "loss": 0.1531, + "num_tokens": 21355168.0, + "reward": 0.8154296875, + "reward_std": 0.01758919470012188, + "rewards//mean": 0.8154296875, + "rewards//std": 0.022412698715925217, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3692, + "grad_norm": 2.9197614192962646, + "kl": 1.174799956381321, + "learning_rate": 7.091497376634463e-07, + "loss": 0.1175, + "num_tokens": 21366776.0, + "reward": 0.8297119140625, + "reward_std": 0.02122962474822998, + "rewards//mean": 0.8297119140625, + "rewards//std": 0.02845682017505169, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3694, + "grad_norm": 1.8043817281723022, + "kl": 1.1824671924114227, + "learning_rate": 7.088614594573491e-07, + "loss": 0.1182, + "num_tokens": 21378328.0, + "reward": 0.82354736328125, + "reward_std": 0.025873564183712006, + "rewards//mean": 0.82354736328125, + "rewards//std": 0.030421772971749306, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3696, + "grad_norm": 3.1440768241882324, + "kl": 1.133433572947979, + "learning_rate": 7.085730971218809e-07, + "loss": 0.1133, + "num_tokens": 21389888.0, + "reward": 0.8284912109375, + "reward_std": 0.0243404358625412, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.03339836746454239, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3698, + "grad_norm": 2.2296714782714844, + "kl": 1.2767824232578278, + "learning_rate": 7.082846507731941e-07, + "loss": 0.1277, + "num_tokens": 21401464.0, + "reward": 0.8319091796875, + "reward_std": 0.02351824939250946, + "rewards//mean": 0.8319091796875, + "rewards//std": 0.028236806392669678, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.37, + "grad_norm": 2.0084474086761475, + "kl": 1.4388155788183212, + "learning_rate": 7.079961205274748e-07, + "loss": 0.1439, + "num_tokens": 21413080.0, + "reward": 0.8533935546875, + "reward_std": 0.030745528638362885, + "rewards//mean": 0.8533935546875, + "rewards//std": 0.04685933515429497, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3702, + "grad_norm": 2.2201898097991943, + "kl": 1.2405463084578514, + "learning_rate": 7.077075065009433e-07, + "loss": 0.1241, + "num_tokens": 21424640.0, + "reward": 0.85150146484375, + "reward_std": 0.028616584837436676, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.03153924643993378, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3704, + "grad_norm": 2.0302395820617676, + "kl": 1.1687540039420128, + "learning_rate": 7.074188088098527e-07, + "loss": 0.1169, + "num_tokens": 21436160.0, + "reward": 0.7918701171875, + "reward_std": 0.02638981305062771, + "rewards//mean": 0.7918701171875, + "rewards//std": 0.03339655324816704, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3706, + "grad_norm": 2.5490283966064453, + "kl": 1.237761177122593, + "learning_rate": 7.071300275704909e-07, + "loss": 0.1238, + "num_tokens": 21447840.0, + "reward": 0.86492919921875, + "reward_std": 0.03997369483113289, + "rewards//mean": 0.86492919921875, + "rewards//std": 0.046076640486717224, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3708, + "grad_norm": 1.9968056678771973, + "kl": 1.1842205002903938, + "learning_rate": 7.068411628991787e-07, + "loss": 0.1184, + "num_tokens": 21459512.0, + "reward": 0.8238525390625, + "reward_std": 0.018514256924390793, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.025786688551306725, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.371, + "grad_norm": 6.782217025756836, + "kl": 1.4949974119663239, + "learning_rate": 7.065522149122709e-07, + "loss": 0.1495, + "num_tokens": 21471056.0, + "reward": 0.834228515625, + "reward_std": 0.024826668202877045, + "rewards//mean": 0.834228515625, + "rewards//std": 0.030368544161319733, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3712, + "grad_norm": 2.2905330657958984, + "kl": 1.387063018977642, + "learning_rate": 7.062631837261556e-07, + "loss": 0.1387, + "num_tokens": 21482496.0, + "reward": 0.83563232421875, + "reward_std": 0.038428835570812225, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.040560949593782425, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3714, + "grad_norm": 4.325047492980957, + "kl": 1.2602238208055496, + "learning_rate": 7.059740694572545e-07, + "loss": 0.126, + "num_tokens": 21494056.0, + "reward": 0.80389404296875, + "reward_std": 0.013041941449046135, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.016086013987660408, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3716, + "grad_norm": 2.519918203353882, + "kl": 1.2274365685880184, + "learning_rate": 7.056848722220228e-07, + "loss": 0.1227, + "num_tokens": 21505648.0, + "reward": 0.8365478515625, + "reward_std": 0.020497407764196396, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.02646803855895996, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3718, + "grad_norm": 1.8704383373260498, + "kl": 0.6250842437148094, + "learning_rate": 7.053955921369493e-07, + "loss": 0.0625, + "num_tokens": 21517168.0, + "reward": 0.86639404296875, + "reward_std": 0.023417524993419647, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.031406011432409286, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.372, + "grad_norm": 2.3694162368774414, + "kl": 0.8492283001542091, + "learning_rate": 7.051062293185559e-07, + "loss": 0.0849, + "num_tokens": 21528744.0, + "reward": 0.84112548828125, + "reward_std": 0.013906355947256088, + "rewards//mean": 0.84112548828125, + "rewards//std": 0.02260696515440941, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3722, + "grad_norm": 2.5108556747436523, + "kl": 0.9682294689118862, + "learning_rate": 7.048167838833976e-07, + "loss": 0.0968, + "num_tokens": 21540248.0, + "reward": 0.86566162109375, + "reward_std": 0.024623781442642212, + "rewards//mean": 0.86566162109375, + "rewards//std": 0.02738904021680355, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3724, + "grad_norm": 2.347398042678833, + "kl": 1.0879295282065868, + "learning_rate": 7.045272559480635e-07, + "loss": 0.1088, + "num_tokens": 21551776.0, + "reward": 0.8331298828125, + "reward_std": 0.017685266211628914, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.02132134884595871, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3726, + "grad_norm": 1.8929002285003662, + "kl": 1.411936465650797, + "learning_rate": 7.042376456291751e-07, + "loss": 0.1412, + "num_tokens": 21563360.0, + "reward": 0.81451416015625, + "reward_std": 0.01966111734509468, + "rewards//mean": 0.81451416015625, + "rewards//std": 0.030105147510766983, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3728, + "grad_norm": 2.7733871936798096, + "kl": 1.642739936709404, + "learning_rate": 7.039479530433874e-07, + "loss": 0.1643, + "num_tokens": 21574904.0, + "reward": 0.794677734375, + "reward_std": 0.019234690815210342, + "rewards//mean": 0.794677734375, + "rewards//std": 0.025112468749284744, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.373, + "grad_norm": 2.4963314533233643, + "kl": 1.2587578296661377, + "learning_rate": 7.036581783073887e-07, + "loss": 0.1259, + "num_tokens": 21586456.0, + "reward": 0.8255615234375, + "reward_std": 0.020940372720360756, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.02961217425763607, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3732, + "grad_norm": 2.6107876300811768, + "kl": 1.2830048948526382, + "learning_rate": 7.033683215379002e-07, + "loss": 0.1283, + "num_tokens": 21598144.0, + "reward": 0.8455810546875, + "reward_std": 0.027098514139652252, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.03163491189479828, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3734, + "grad_norm": 2.6531925201416016, + "kl": 0.9006496444344521, + "learning_rate": 7.030783828516759e-07, + "loss": 0.0901, + "num_tokens": 21609784.0, + "reward": 0.85675048828125, + "reward_std": 0.022495660930871964, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.02625521458685398, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3736, + "grad_norm": 2.2903213500976562, + "kl": 1.2670957148075104, + "learning_rate": 7.027883623655034e-07, + "loss": 0.1267, + "num_tokens": 21621280.0, + "reward": 0.84906005859375, + "reward_std": 0.03388458117842674, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.037540845572948456, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3738, + "grad_norm": 3.316044569015503, + "kl": 0.9689630009233952, + "learning_rate": 7.024982601962026e-07, + "loss": 0.0969, + "num_tokens": 21632904.0, + "reward": 0.8160400390625, + "reward_std": 0.0166662335395813, + "rewards//mean": 0.8160400390625, + "rewards//std": 0.02414722740650177, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.374, + "grad_norm": 4.086446285247803, + "kl": 1.6912427060306072, + "learning_rate": 7.022080764606271e-07, + "loss": 0.1702, + "num_tokens": 21644390.0, + "reward": 0.81622314453125, + "reward_std": 0.025808367878198624, + "rewards//mean": 0.81622314453125, + "rewards//std": 0.041189782321453094, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3742, + "grad_norm": 2.712904691696167, + "kl": 1.2799407243728638, + "learning_rate": 7.019178112756625e-07, + "loss": 0.128, + "num_tokens": 21655998.0, + "reward": 0.804931640625, + "reward_std": 0.016617434099316597, + "rewards//mean": 0.804931640625, + "rewards//std": 0.019020797684788704, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3744, + "grad_norm": 2.12622332572937, + "kl": 1.064117170870304, + "learning_rate": 7.016274647582276e-07, + "loss": 0.1064, + "num_tokens": 21667606.0, + "reward": 0.82232666015625, + "reward_std": 0.03150913119316101, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.03714275732636452, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3746, + "grad_norm": 4.730565547943115, + "kl": 1.0408611223101616, + "learning_rate": 7.013370370252739e-07, + "loss": 0.1041, + "num_tokens": 21679070.0, + "reward": 0.80804443359375, + "reward_std": 0.014773641712963581, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.021808769553899765, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3748, + "grad_norm": 3.4536006450653076, + "kl": 1.5231354087591171, + "learning_rate": 7.010465281937858e-07, + "loss": 0.1523, + "num_tokens": 21690582.0, + "reward": 0.861328125, + "reward_std": 0.02167322486639023, + "rewards//mean": 0.861328125, + "rewards//std": 0.023837119340896606, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.375, + "grad_norm": 2.085533618927002, + "kl": 1.4779358953237534, + "learning_rate": 7.007559383807802e-07, + "loss": 0.1478, + "num_tokens": 21702190.0, + "reward": 0.82720947265625, + "reward_std": 0.023219961673021317, + "rewards//mean": 0.82720947265625, + "rewards//std": 0.03247194364666939, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3752, + "grad_norm": 2.764420747756958, + "kl": 1.0979639142751694, + "learning_rate": 7.004652677033068e-07, + "loss": 0.1098, + "num_tokens": 21713742.0, + "reward": 0.84295654296875, + "reward_std": 0.023061370477080345, + "rewards//mean": 0.84295654296875, + "rewards//std": 0.030672520399093628, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3754, + "grad_norm": 2.1521058082580566, + "kl": 1.32046590000391, + "learning_rate": 7.001745162784475e-07, + "loss": 0.132, + "num_tokens": 21725262.0, + "reward": 0.85113525390625, + "reward_std": 0.02199186012148857, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.03081383742392063, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3756, + "grad_norm": 2.023810386657715, + "kl": 0.9764394909143448, + "learning_rate": 6.998836842233169e-07, + "loss": 0.0976, + "num_tokens": 21736854.0, + "reward": 0.83795166015625, + "reward_std": 0.02377466671168804, + "rewards//mean": 0.83795166015625, + "rewards//std": 0.031094543635845184, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3758, + "grad_norm": 2.103745698928833, + "kl": 1.1759476996958256, + "learning_rate": 6.995927716550622e-07, + "loss": 0.1176, + "num_tokens": 21748430.0, + "reward": 0.8599853515625, + "reward_std": 0.02712569385766983, + "rewards//mean": 0.8599853515625, + "rewards//std": 0.031375452876091, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.376, + "grad_norm": 2.05570387840271, + "kl": 1.024823471903801, + "learning_rate": 6.99301778690863e-07, + "loss": 0.1025, + "num_tokens": 21760006.0, + "reward": 0.85986328125, + "reward_std": 0.029398618265986443, + "rewards//mean": 0.85986328125, + "rewards//std": 0.034893475472927094, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3762, + "grad_norm": 2.897322416305542, + "kl": 1.450159963220358, + "learning_rate": 6.990107054479312e-07, + "loss": 0.145, + "num_tokens": 21771598.0, + "reward": 0.82403564453125, + "reward_std": 0.023547764867544174, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.03249943628907204, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3764, + "grad_norm": 2.5029678344726562, + "kl": 1.2797376401722431, + "learning_rate": 6.987195520435109e-07, + "loss": 0.128, + "num_tokens": 21783206.0, + "reward": 0.84429931640625, + "reward_std": 0.020240548998117447, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.024684377014636993, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3766, + "grad_norm": 4.163302898406982, + "kl": 1.7621620073914528, + "learning_rate": 6.984283185948789e-07, + "loss": 0.1762, + "num_tokens": 21794774.0, + "reward": 0.83306884765625, + "reward_std": 0.02826301008462906, + "rewards//mean": 0.83306884765625, + "rewards//std": 0.031259626150131226, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3768, + "grad_norm": 4.0500922203063965, + "kl": 1.2802671939134598, + "learning_rate": 6.981370052193439e-07, + "loss": 0.128, + "num_tokens": 21806550.0, + "reward": 0.78619384765625, + "reward_std": 0.014835093170404434, + "rewards//mean": 0.78619384765625, + "rewards//std": 0.022000884637236595, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.377, + "grad_norm": 1.9600231647491455, + "kl": 1.1794195249676704, + "learning_rate": 6.978456120342469e-07, + "loss": 0.1179, + "num_tokens": 21818198.0, + "reward": 0.86285400390625, + "reward_std": 0.017326634377241135, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.019870774820446968, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3772, + "grad_norm": 2.660036563873291, + "kl": 1.6466124802827835, + "learning_rate": 6.975541391569609e-07, + "loss": 0.1647, + "num_tokens": 21829718.0, + "reward": 0.836669921875, + "reward_std": 0.04107818379998207, + "rewards//mean": 0.836669921875, + "rewards//std": 0.04708058387041092, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3774, + "grad_norm": 2.7560276985168457, + "kl": 1.5308156535029411, + "learning_rate": 6.972625867048914e-07, + "loss": 0.1531, + "num_tokens": 21841230.0, + "reward": 0.84771728515625, + "reward_std": 0.023873653262853622, + "rewards//mean": 0.84771728515625, + "rewards//std": 0.031207283958792686, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3776, + "grad_norm": 2.15228009223938, + "kl": 1.4004219844937325, + "learning_rate": 6.969709547954755e-07, + "loss": 0.14, + "num_tokens": 21852774.0, + "reward": 0.83807373046875, + "reward_std": 0.026315297931432724, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.03440718352794647, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3778, + "grad_norm": 3.2242748737335205, + "kl": 1.3528359085321426, + "learning_rate": 6.966792435461826e-07, + "loss": 0.1353, + "num_tokens": 21864350.0, + "reward": 0.7568359375, + "reward_std": 0.02913505956530571, + "rewards//mean": 0.7568359375, + "rewards//std": 0.03709060698747635, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.378, + "grad_norm": 2.5087428092956543, + "kl": 1.0769516751170158, + "learning_rate": 6.963874530745139e-07, + "loss": 0.1077, + "num_tokens": 21875854.0, + "reward": 0.84033203125, + "reward_std": 0.030888887122273445, + "rewards//mean": 0.84033203125, + "rewards//std": 0.0321025513112545, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3782, + "grad_norm": 1.9095038175582886, + "kl": 1.1246378161013126, + "learning_rate": 6.960955834980027e-07, + "loss": 0.1125, + "num_tokens": 21887478.0, + "reward": 0.84942626953125, + "reward_std": 0.028116095811128616, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.037550922483205795, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3784, + "grad_norm": 4.2185516357421875, + "kl": 1.9244073256850243, + "learning_rate": 6.958036349342139e-07, + "loss": 0.1924, + "num_tokens": 21898974.0, + "reward": 0.842041015625, + "reward_std": 0.026283754035830498, + "rewards//mean": 0.842041015625, + "rewards//std": 0.03995068743824959, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3786, + "grad_norm": 2.083953380584717, + "kl": 1.439736932516098, + "learning_rate": 6.955116075007442e-07, + "loss": 0.144, + "num_tokens": 21910542.0, + "reward": 0.8472900390625, + "reward_std": 0.03372695669531822, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.04005267098546028, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3788, + "grad_norm": 2.3784852027893066, + "kl": 1.2228501588106155, + "learning_rate": 6.952195013152225e-07, + "loss": 0.1223, + "num_tokens": 21922054.0, + "reward": 0.8541259765625, + "reward_std": 0.019693493843078613, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.02644973061978817, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.379, + "grad_norm": 2.232327938079834, + "kl": 1.3002366907894611, + "learning_rate": 6.94927316495309e-07, + "loss": 0.13, + "num_tokens": 21933622.0, + "reward": 0.81072998046875, + "reward_std": 0.021160844713449478, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.025788668543100357, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3792, + "grad_norm": 4.150232791900635, + "kl": 2.0269355326890945, + "learning_rate": 6.946350531586957e-07, + "loss": 0.2027, + "num_tokens": 21945198.0, + "reward": 0.8031005859375, + "reward_std": 0.023904409259557724, + "rewards//mean": 0.8031005859375, + "rewards//std": 0.0341796875, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3794, + "grad_norm": 4.594144344329834, + "kl": 1.8745916038751602, + "learning_rate": 6.943427114231063e-07, + "loss": 0.1875, + "num_tokens": 21956750.0, + "reward": 0.79876708984375, + "reward_std": 0.0207855012267828, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.02736581675708294, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3796, + "grad_norm": 2.007786750793457, + "kl": 1.4546396881341934, + "learning_rate": 6.94050291406296e-07, + "loss": 0.1455, + "num_tokens": 21968334.0, + "reward": 0.840087890625, + "reward_std": 0.032166533172130585, + "rewards//mean": 0.840087890625, + "rewards//std": 0.03518638014793396, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3798, + "grad_norm": 2.1133387088775635, + "kl": 1.6384625285863876, + "learning_rate": 6.937577932260514e-07, + "loss": 0.1638, + "num_tokens": 21980006.0, + "reward": 0.82275390625, + "reward_std": 0.026438500732183456, + "rewards//mean": 0.82275390625, + "rewards//std": 0.02739684469997883, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.38, + "grad_norm": 2.0073530673980713, + "kl": 0.7344315052032471, + "learning_rate": 6.93465217000191e-07, + "loss": 0.0734, + "num_tokens": 21991526.0, + "reward": 0.81573486328125, + "reward_std": 0.01728064939379692, + "rewards//mean": 0.81573486328125, + "rewards//std": 0.023277342319488525, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3802, + "grad_norm": 2.100390672683716, + "kl": 1.2915073744952679, + "learning_rate": 6.931725628465642e-07, + "loss": 0.1292, + "num_tokens": 22003094.0, + "reward": 0.83428955078125, + "reward_std": 0.01709732413291931, + "rewards//mean": 0.83428955078125, + "rewards//std": 0.024178162217140198, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3804, + "grad_norm": 1.7310608625411987, + "kl": 0.8223048746585846, + "learning_rate": 6.928798308830523e-07, + "loss": 0.0822, + "num_tokens": 22014654.0, + "reward": 0.83258056640625, + "reward_std": 0.01759016513824463, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.028745997697114944, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3806, + "grad_norm": 3.002668857574463, + "kl": 1.0076301246881485, + "learning_rate": 6.925870212275676e-07, + "loss": 0.1008, + "num_tokens": 22026206.0, + "reward": 0.8060302734375, + "reward_std": 0.028973504900932312, + "rewards//mean": 0.8060302734375, + "rewards//std": 0.03473846986889839, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3808, + "grad_norm": 2.599738836288452, + "kl": 1.0267309099435806, + "learning_rate": 6.922941339980537e-07, + "loss": 0.1027, + "num_tokens": 22037766.0, + "reward": 0.857421875, + "reward_std": 0.029489779844880104, + "rewards//mean": 0.857421875, + "rewards//std": 0.03666369989514351, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.381, + "grad_norm": 2.9339029788970947, + "kl": 1.4535052627325058, + "learning_rate": 6.920011693124856e-07, + "loss": 0.1454, + "num_tokens": 22049390.0, + "reward": 0.8040771484375, + "reward_std": 0.028091510757803917, + "rewards//mean": 0.8040771484375, + "rewards//std": 0.03646643087267876, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3812, + "grad_norm": 2.3167166709899902, + "kl": 1.0596019960939884, + "learning_rate": 6.917081272888696e-07, + "loss": 0.106, + "num_tokens": 22060966.0, + "reward": 0.8272705078125, + "reward_std": 0.02116527408361435, + "rewards//mean": 0.8272705078125, + "rewards//std": 0.03702831640839577, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3814, + "grad_norm": 2.17250657081604, + "kl": 0.9719684980809689, + "learning_rate": 6.914150080452428e-07, + "loss": 0.0972, + "num_tokens": 22072542.0, + "reward": 0.8441162109375, + "reward_std": 0.02301687002182007, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.030231406912207603, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3816, + "grad_norm": 2.7672665119171143, + "kl": 1.065846711397171, + "learning_rate": 6.911218116996736e-07, + "loss": 0.1066, + "num_tokens": 22084102.0, + "reward": 0.8699951171875, + "reward_std": 0.0234740749001503, + "rewards//mean": 0.8699951171875, + "rewards//std": 0.03405723348259926, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3818, + "grad_norm": 1.9623005390167236, + "kl": 0.9677844122052193, + "learning_rate": 6.908285383702616e-07, + "loss": 0.0968, + "num_tokens": 22095670.0, + "reward": 0.85491943359375, + "reward_std": 0.02602696232497692, + "rewards//mean": 0.85491943359375, + "rewards//std": 0.031900983303785324, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.382, + "grad_norm": 2.7200767993927, + "kl": 0.93751360476017, + "learning_rate": 6.905351881751371e-07, + "loss": 0.0938, + "num_tokens": 22107190.0, + "reward": 0.857666015625, + "reward_std": 0.02518267184495926, + "rewards//mean": 0.857666015625, + "rewards//std": 0.03546064719557762, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3822, + "grad_norm": 2.57553768157959, + "kl": 1.7530411593616009, + "learning_rate": 6.902417612324615e-07, + "loss": 0.1753, + "num_tokens": 22118750.0, + "reward": 0.83447265625, + "reward_std": 0.04136385768651962, + "rewards//mean": 0.83447265625, + "rewards//std": 0.044919852167367935, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3824, + "grad_norm": 2.8907792568206787, + "kl": 1.1979551017284393, + "learning_rate": 6.899482576604274e-07, + "loss": 0.1198, + "num_tokens": 22130366.0, + "reward": 0.8505859375, + "reward_std": 0.028096964582800865, + "rewards//mean": 0.8505859375, + "rewards//std": 0.03619162738323212, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3826, + "grad_norm": 2.241607666015625, + "kl": 0.8207360915839672, + "learning_rate": 6.896546775772576e-07, + "loss": 0.0821, + "num_tokens": 22141894.0, + "reward": 0.7779541015625, + "reward_std": 0.01730806566774845, + "rewards//mean": 0.7779541015625, + "rewards//std": 0.02106708101928234, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3828, + "grad_norm": 2.189194917678833, + "kl": 0.9964268058538437, + "learning_rate": 6.893610211012066e-07, + "loss": 0.0996, + "num_tokens": 22153478.0, + "reward": 0.78143310546875, + "reward_std": 0.018990855664014816, + "rewards//mean": 0.78143310546875, + "rewards//std": 0.02294064313173294, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.383, + "grad_norm": 2.0353662967681885, + "kl": 1.1822843849658966, + "learning_rate": 6.890672883505588e-07, + "loss": 0.1182, + "num_tokens": 22165102.0, + "reward": 0.808349609375, + "reward_std": 0.027226600795984268, + "rewards//mean": 0.808349609375, + "rewards//std": 0.033078886568546295, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3832, + "grad_norm": 2.295344352722168, + "kl": 1.0946802124381065, + "learning_rate": 6.887734794436299e-07, + "loss": 0.1095, + "num_tokens": 22176662.0, + "reward": 0.792724609375, + "reward_std": 0.014632727019488811, + "rewards//mean": 0.792724609375, + "rewards//std": 0.01635027676820755, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3834, + "grad_norm": 2.1064538955688477, + "kl": 1.0152682438492775, + "learning_rate": 6.884795944987661e-07, + "loss": 0.1015, + "num_tokens": 22188278.0, + "reward": 0.8062744140625, + "reward_std": 0.0239776112139225, + "rewards//mean": 0.8062744140625, + "rewards//std": 0.03608250617980957, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3836, + "grad_norm": 2.1158688068389893, + "kl": 1.3351519592106342, + "learning_rate": 6.881856336343441e-07, + "loss": 0.1335, + "num_tokens": 22199782.0, + "reward": 0.83447265625, + "reward_std": 0.021630076691508293, + "rewards//mean": 0.83447265625, + "rewards//std": 0.02306772582232952, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3838, + "grad_norm": 2.0143818855285645, + "kl": 0.9745525233447552, + "learning_rate": 6.878915969687714e-07, + "loss": 0.0975, + "num_tokens": 22211414.0, + "reward": 0.851806640625, + "reward_std": 0.019271142780780792, + "rewards//mean": 0.851806640625, + "rewards//std": 0.0351794958114624, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.384, + "grad_norm": 2.144143581390381, + "kl": 0.7959698587656021, + "learning_rate": 6.875974846204858e-07, + "loss": 0.0796, + "num_tokens": 22223110.0, + "reward": 0.8487548828125, + "reward_std": 0.023765940219163895, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.026073908433318138, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3842, + "grad_norm": 2.2108590602874756, + "kl": 1.0477802753448486, + "learning_rate": 6.87303296707956e-07, + "loss": 0.1048, + "num_tokens": 22234622.0, + "reward": 0.854248046875, + "reward_std": 0.027012929320335388, + "rewards//mean": 0.854248046875, + "rewards//std": 0.03369550779461861, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3844, + "grad_norm": 2.4872915744781494, + "kl": 1.002775702625513, + "learning_rate": 6.870090333496806e-07, + "loss": 0.1003, + "num_tokens": 22246270.0, + "reward": 0.835693359375, + "reward_std": 0.013506264425814152, + "rewards//mean": 0.835693359375, + "rewards//std": 0.016629349440336227, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3846, + "grad_norm": 2.8678572177886963, + "kl": 1.1888241469860077, + "learning_rate": 6.867146946641891e-07, + "loss": 0.1189, + "num_tokens": 22257910.0, + "reward": 0.78485107421875, + "reward_std": 0.019732221961021423, + "rewards//mean": 0.78485107421875, + "rewards//std": 0.02564150094985962, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3848, + "grad_norm": 2.450975179672241, + "kl": 1.540554240345955, + "learning_rate": 6.864202807700407e-07, + "loss": 0.1541, + "num_tokens": 22269470.0, + "reward": 0.84893798828125, + "reward_std": 0.03532283008098602, + "rewards//mean": 0.84893798828125, + "rewards//std": 0.04013926908373833, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.385, + "grad_norm": 2.7168467044830322, + "kl": 1.441015362739563, + "learning_rate": 6.861257917858257e-07, + "loss": 0.1441, + "num_tokens": 22280942.0, + "reward": 0.82330322265625, + "reward_std": 0.02052254229784012, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.026095019653439522, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3852, + "grad_norm": 2.446207284927368, + "kl": 0.9312014766037464, + "learning_rate": 6.858312278301637e-07, + "loss": 0.0931, + "num_tokens": 22292502.0, + "reward": 0.8529052734375, + "reward_std": 0.018834896385669708, + "rewards//mean": 0.8529052734375, + "rewards//std": 0.023190442472696304, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3854, + "grad_norm": 4.938467502593994, + "kl": 1.180168904364109, + "learning_rate": 6.855365890217056e-07, + "loss": 0.118, + "num_tokens": 22304062.0, + "reward": 0.8177490234375, + "reward_std": 0.015795210376381874, + "rewards//mean": 0.8177490234375, + "rewards//std": 0.0182320736348629, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3856, + "grad_norm": 8.079294204711914, + "kl": 1.456030011177063, + "learning_rate": 6.852418754791316e-07, + "loss": 0.1456, + "num_tokens": 22315694.0, + "reward": 0.764892578125, + "reward_std": 0.02353660762310028, + "rewards//mean": 0.764892578125, + "rewards//std": 0.028846992179751396, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3858, + "grad_norm": 2.204723834991455, + "kl": 0.9218882322311401, + "learning_rate": 6.849470873211522e-07, + "loss": 0.0922, + "num_tokens": 22327302.0, + "reward": 0.8563232421875, + "reward_std": 0.029092401266098022, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.03547835722565651, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.386, + "grad_norm": 1.956192135810852, + "kl": 0.7210625521838665, + "learning_rate": 6.846522246665083e-07, + "loss": 0.0721, + "num_tokens": 22338838.0, + "reward": 0.85302734375, + "reward_std": 0.022407853975892067, + "rewards//mean": 0.85302734375, + "rewards//std": 0.02947511151432991, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3862, + "grad_norm": 2.075031280517578, + "kl": 1.3496750742197037, + "learning_rate": 6.843572876339704e-07, + "loss": 0.135, + "num_tokens": 22350382.0, + "reward": 0.84039306640625, + "reward_std": 0.03117944858968258, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.04039078205823898, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3864, + "grad_norm": 2.2377443313598633, + "kl": 1.2505203932523727, + "learning_rate": 6.840622763423391e-07, + "loss": 0.1251, + "num_tokens": 22361918.0, + "reward": 0.823974609375, + "reward_std": 0.018474031239748, + "rewards//mean": 0.823974609375, + "rewards//std": 0.021337317302823067, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3866, + "grad_norm": 2.457510232925415, + "kl": 0.8287389576435089, + "learning_rate": 6.837671909104447e-07, + "loss": 0.0829, + "num_tokens": 22373526.0, + "reward": 0.8563232421875, + "reward_std": 0.02131771109998226, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.02818099595606327, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3868, + "grad_norm": 2.107923984527588, + "kl": 0.9581423550844193, + "learning_rate": 6.834720314571479e-07, + "loss": 0.0958, + "num_tokens": 22385174.0, + "reward": 0.865966796875, + "reward_std": 0.016492635011672974, + "rewards//mean": 0.865966796875, + "rewards//std": 0.0232285987585783, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.387, + "grad_norm": 2.2019848823547363, + "kl": 1.0079278871417046, + "learning_rate": 6.831767981013388e-07, + "loss": 0.1008, + "num_tokens": 22396814.0, + "reward": 0.80572509765625, + "reward_std": 0.02353903464972973, + "rewards//mean": 0.80572509765625, + "rewards//std": 0.03509931638836861, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3872, + "grad_norm": 2.5966720581054688, + "kl": 1.419892705976963, + "learning_rate": 6.828814909619372e-07, + "loss": 0.142, + "num_tokens": 22408414.0, + "reward": 0.86798095703125, + "reward_std": 0.02712082490324974, + "rewards//mean": 0.86798095703125, + "rewards//std": 0.041627004742622375, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3874, + "grad_norm": 2.3284249305725098, + "kl": 1.2555123940110207, + "learning_rate": 6.82586110157893e-07, + "loss": 0.1256, + "num_tokens": 22420014.0, + "reward": 0.86236572265625, + "reward_std": 0.03376249969005585, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.037883639335632324, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3876, + "grad_norm": 1.9213993549346924, + "kl": 0.8552495464682579, + "learning_rate": 6.822906558081856e-07, + "loss": 0.0855, + "num_tokens": 22431638.0, + "reward": 0.830322265625, + "reward_std": 0.018061425536870956, + "rewards//mean": 0.830322265625, + "rewards//std": 0.025778761133551598, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3878, + "grad_norm": 2.084024667739868, + "kl": 0.9936955124139786, + "learning_rate": 6.819951280318236e-07, + "loss": 0.0994, + "num_tokens": 22443198.0, + "reward": 0.82080078125, + "reward_std": 0.024944782257080078, + "rewards//mean": 0.82080078125, + "rewards//std": 0.030730271711945534, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.388, + "grad_norm": 2.6333167552948, + "kl": 0.9108291156589985, + "learning_rate": 6.816995269478459e-07, + "loss": 0.0911, + "num_tokens": 22454822.0, + "reward": 0.84967041015625, + "reward_std": 0.028491532430052757, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.03924185410141945, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3882, + "grad_norm": 2.746431827545166, + "kl": 1.5082365423440933, + "learning_rate": 6.814038526753204e-07, + "loss": 0.1508, + "num_tokens": 22466438.0, + "reward": 0.8424072265625, + "reward_std": 0.02745349332690239, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.03135228529572487, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3884, + "grad_norm": 2.2668051719665527, + "kl": 1.417116105556488, + "learning_rate": 6.811081053333449e-07, + "loss": 0.1417, + "num_tokens": 22477998.0, + "reward": 0.79632568359375, + "reward_std": 0.018675021827220917, + "rewards//mean": 0.79632568359375, + "rewards//std": 0.022945262491703033, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3886, + "grad_norm": 1.9594568014144897, + "kl": 1.2529239282011986, + "learning_rate": 6.80812285041046e-07, + "loss": 0.1253, + "num_tokens": 22489550.0, + "reward": 0.77947998046875, + "reward_std": 0.01697271689772606, + "rewards//mean": 0.77947998046875, + "rewards//std": 0.021842753514647484, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3888, + "grad_norm": 2.662357807159424, + "kl": 1.2398503981530666, + "learning_rate": 6.805163919175806e-07, + "loss": 0.124, + "num_tokens": 22501102.0, + "reward": 0.78826904296875, + "reward_std": 0.021219300106167793, + "rewards//mean": 0.78826904296875, + "rewards//std": 0.031429141759872437, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.389, + "grad_norm": 2.459763288497925, + "kl": 1.3753268048167229, + "learning_rate": 6.80220426082134e-07, + "loss": 0.1375, + "num_tokens": 22512742.0, + "reward": 0.82684326171875, + "reward_std": 0.02809901535511017, + "rewards//mean": 0.82684326171875, + "rewards//std": 0.032008517533540726, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.3892, + "grad_norm": 1.9752767086029053, + "kl": 0.9818007573485374, + "learning_rate": 6.799243876539213e-07, + "loss": 0.0832, + "num_tokens": 22524281.0, + "reward": 0.8424072265625, + "reward_std": 0.015611130744218826, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.031698014587163925, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3894, + "grad_norm": 2.2701144218444824, + "kl": 1.1125145517289639, + "learning_rate": 6.796282767521869e-07, + "loss": 0.1113, + "num_tokens": 22535841.0, + "reward": 0.84075927734375, + "reward_std": 0.0177753996104002, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.02644536644220352, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3896, + "grad_norm": 2.3761866092681885, + "kl": 1.3552896901965141, + "learning_rate": 6.793320934962038e-07, + "loss": 0.1355, + "num_tokens": 22547417.0, + "reward": 0.86602783203125, + "reward_std": 0.025428704917430878, + "rewards//mean": 0.86602783203125, + "rewards//std": 0.03355724737048149, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3898, + "grad_norm": 2.0793745517730713, + "kl": 0.9650427252054214, + "learning_rate": 6.790358380052751e-07, + "loss": 0.0965, + "num_tokens": 22558985.0, + "reward": 0.86181640625, + "reward_std": 0.031617991626262665, + "rewards//mean": 0.86181640625, + "rewards//std": 0.03650812432169914, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.39, + "grad_norm": 2.093442678451538, + "kl": 1.1290942281484604, + "learning_rate": 6.787395103987322e-07, + "loss": 0.1129, + "num_tokens": 22570553.0, + "reward": 0.8380126953125, + "reward_std": 0.026497572660446167, + "rewards//mean": 0.8380126953125, + "rewards//std": 0.031179925426840782, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3902, + "grad_norm": 1.8770012855529785, + "kl": 0.9267642796039581, + "learning_rate": 6.784431107959358e-07, + "loss": 0.0927, + "num_tokens": 22582161.0, + "reward": 0.86358642578125, + "reward_std": 0.02558743767440319, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.027868449687957764, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3904, + "grad_norm": 2.513589382171631, + "kl": 1.015716016292572, + "learning_rate": 6.781466393162761e-07, + "loss": 0.1016, + "num_tokens": 22593649.0, + "reward": 0.8314208984375, + "reward_std": 0.01639893651008606, + "rewards//mean": 0.8314208984375, + "rewards//std": 0.035937897861003876, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3906, + "grad_norm": 1.9759652614593506, + "kl": 1.0313180014491081, + "learning_rate": 6.778500960791708e-07, + "loss": 0.1031, + "num_tokens": 22605233.0, + "reward": 0.87890625, + "reward_std": 0.033014338463544846, + "rewards//mean": 0.87890625, + "rewards//std": 0.046771544963121414, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3908, + "grad_norm": 6.055230140686035, + "kl": 1.596811380237341, + "learning_rate": 6.775534812040686e-07, + "loss": 0.1597, + "num_tokens": 22616849.0, + "reward": 0.77783203125, + "reward_std": 0.014649638906121254, + "rewards//mean": 0.77783203125, + "rewards//std": 0.021525204181671143, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.391, + "grad_norm": 2.410954475402832, + "kl": 1.2575043737888336, + "learning_rate": 6.772567948104452e-07, + "loss": 0.1258, + "num_tokens": 22628361.0, + "reward": 0.84930419921875, + "reward_std": 0.02656523510813713, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.03706973418593407, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3912, + "grad_norm": 2.245213508605957, + "kl": 1.2174433693289757, + "learning_rate": 6.769600370178059e-07, + "loss": 0.1217, + "num_tokens": 22639969.0, + "reward": 0.849609375, + "reward_std": 0.030759068205952644, + "rewards//mean": 0.849609375, + "rewards//std": 0.036431752145290375, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3914, + "grad_norm": 2.2575619220733643, + "kl": 0.9332301430404186, + "learning_rate": 6.766632079456851e-07, + "loss": 0.0933, + "num_tokens": 22651569.0, + "reward": 0.84136962890625, + "reward_std": 0.018497997894883156, + "rewards//mean": 0.84136962890625, + "rewards//std": 0.02772303856909275, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3916, + "grad_norm": 2.1614198684692383, + "kl": 1.2470111809670925, + "learning_rate": 6.76366307713645e-07, + "loss": 0.1247, + "num_tokens": 22663089.0, + "reward": 0.83416748046875, + "reward_std": 0.026462372392416, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.03796785697340965, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3918, + "grad_norm": 2.811453104019165, + "kl": 1.361180655658245, + "learning_rate": 6.760693364412775e-07, + "loss": 0.1361, + "num_tokens": 22674681.0, + "reward": 0.867431640625, + "reward_std": 0.03914465010166168, + "rewards//mean": 0.867431640625, + "rewards//std": 0.04713713750243187, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.392, + "grad_norm": 2.0703468322753906, + "kl": 1.0540809743106365, + "learning_rate": 6.757722942482022e-07, + "loss": 0.1054, + "num_tokens": 22686321.0, + "reward": 0.80328369140625, + "reward_std": 0.0160684697329998, + "rewards//mean": 0.80328369140625, + "rewards//std": 0.022950539365410805, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3922, + "grad_norm": 2.616929769515991, + "kl": 1.0274102799594402, + "learning_rate": 6.754751812540679e-07, + "loss": 0.1027, + "num_tokens": 22697921.0, + "reward": 0.837646484375, + "reward_std": 0.025173772126436234, + "rewards//mean": 0.837646484375, + "rewards//std": 0.031310975551605225, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3924, + "grad_norm": 2.3327929973602295, + "kl": 1.5900601744651794, + "learning_rate": 6.751779975785514e-07, + "loss": 0.159, + "num_tokens": 22709497.0, + "reward": 0.84490966796875, + "reward_std": 0.04139019921422005, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.044601574540138245, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3926, + "grad_norm": 2.4221720695495605, + "kl": 1.2969043031334877, + "learning_rate": 6.748807433413586e-07, + "loss": 0.1297, + "num_tokens": 22721049.0, + "reward": 0.86834716796875, + "reward_std": 0.030164353549480438, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.0337945893406868, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3928, + "grad_norm": 2.5988028049468994, + "kl": 1.096285305917263, + "learning_rate": 6.745834186622231e-07, + "loss": 0.1096, + "num_tokens": 22732793.0, + "reward": 0.8302001953125, + "reward_std": 0.02839534915983677, + "rewards//mean": 0.8302001953125, + "rewards//std": 0.04331943765282631, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.393, + "grad_norm": 2.1748151779174805, + "kl": 0.9885699599981308, + "learning_rate": 6.742860236609076e-07, + "loss": 0.0989, + "num_tokens": 22744393.0, + "reward": 0.86102294921875, + "reward_std": 0.024290906265378, + "rewards//mean": 0.86102294921875, + "rewards//std": 0.029641490429639816, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3932, + "grad_norm": 1.8824915885925293, + "kl": 0.8708079382777214, + "learning_rate": 6.739885584572025e-07, + "loss": 0.0871, + "num_tokens": 22755993.0, + "reward": 0.8660888671875, + "reward_std": 0.017575988546013832, + "rewards//mean": 0.8660888671875, + "rewards//std": 0.023973582312464714, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3934, + "grad_norm": 2.019141912460327, + "kl": 1.5073408484458923, + "learning_rate": 6.73691023170927e-07, + "loss": 0.1507, + "num_tokens": 22767545.0, + "reward": 0.85272216796875, + "reward_std": 0.02802724391222, + "rewards//mean": 0.85272216796875, + "rewards//std": 0.03134812042117119, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3936, + "grad_norm": 2.8618412017822266, + "kl": 1.5936407148838043, + "learning_rate": 6.733934179219281e-07, + "loss": 0.1594, + "num_tokens": 22779113.0, + "reward": 0.812255859375, + "reward_std": 0.021878890693187714, + "rewards//mean": 0.812255859375, + "rewards//std": 0.0249673780053854, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3938, + "grad_norm": 1.7636499404907227, + "kl": 1.1951455026865005, + "learning_rate": 6.730957428300811e-07, + "loss": 0.1195, + "num_tokens": 22790569.0, + "reward": 0.861328125, + "reward_std": 0.027995461598038673, + "rewards//mean": 0.861328125, + "rewards//std": 0.03215908631682396, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.394, + "grad_norm": 2.1193032264709473, + "kl": 1.2018609568476677, + "learning_rate": 6.727979980152898e-07, + "loss": 0.1202, + "num_tokens": 22802097.0, + "reward": 0.83123779296875, + "reward_std": 0.017481837421655655, + "rewards//mean": 0.83123779296875, + "rewards//std": 0.02195611596107483, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3942, + "grad_norm": 2.408358097076416, + "kl": 1.0016646422445774, + "learning_rate": 6.725001835974852e-07, + "loss": 0.1002, + "num_tokens": 22813633.0, + "reward": 0.84381103515625, + "reward_std": 0.019560664892196655, + "rewards//mean": 0.84381103515625, + "rewards//std": 0.02184898965060711, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3944, + "grad_norm": 2.3688950538635254, + "kl": 1.1508092433214188, + "learning_rate": 6.722022996966277e-07, + "loss": 0.1151, + "num_tokens": 22825169.0, + "reward": 0.82904052734375, + "reward_std": 0.02441788837313652, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.026763997972011566, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3946, + "grad_norm": 2.6117374897003174, + "kl": 1.3751371204853058, + "learning_rate": 6.719043464327042e-07, + "loss": 0.1375, + "num_tokens": 22836673.0, + "reward": 0.7939453125, + "reward_std": 0.033202048391103745, + "rewards//mean": 0.7939453125, + "rewards//std": 0.03822261840105057, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3948, + "grad_norm": 2.132988452911377, + "kl": 1.6671424508094788, + "learning_rate": 6.716063239257306e-07, + "loss": 0.1667, + "num_tokens": 22848257.0, + "reward": 0.84613037109375, + "reward_std": 0.041371237486600876, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.04565849527716637, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.395, + "grad_norm": 2.103299379348755, + "kl": 0.9542150497436523, + "learning_rate": 6.713082322957502e-07, + "loss": 0.0954, + "num_tokens": 22859793.0, + "reward": 0.88848876953125, + "reward_std": 0.02322862669825554, + "rewards//mean": 0.88848876953125, + "rewards//std": 0.03524865210056305, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3952, + "grad_norm": 2.5745863914489746, + "kl": 1.4595101103186607, + "learning_rate": 6.710100716628344e-07, + "loss": 0.146, + "num_tokens": 22871385.0, + "reward": 0.86572265625, + "reward_std": 0.024765420705080032, + "rewards//mean": 0.86572265625, + "rewards//std": 0.039515603333711624, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3954, + "grad_norm": 2.0096018314361572, + "kl": 1.3218293339014053, + "learning_rate": 6.70711842147082e-07, + "loss": 0.1322, + "num_tokens": 22882881.0, + "reward": 0.85626220703125, + "reward_std": 0.024736080318689346, + "rewards//mean": 0.85626220703125, + "rewards//std": 0.031621214002370834, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3956, + "grad_norm": 3.7121238708496094, + "kl": 1.1884579882025719, + "learning_rate": 6.704135438686203e-07, + "loss": 0.1188, + "num_tokens": 22894449.0, + "reward": 0.81402587890625, + "reward_std": 0.009900924749672413, + "rewards//mean": 0.81402587890625, + "rewards//std": 0.015066551975905895, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3958, + "grad_norm": 2.6520092487335205, + "kl": 1.0126413628458977, + "learning_rate": 6.701151769476032e-07, + "loss": 0.1013, + "num_tokens": 22905977.0, + "reward": 0.8277587890625, + "reward_std": 0.02438177913427353, + "rewards//mean": 0.8277587890625, + "rewards//std": 0.031471800059080124, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.396, + "grad_norm": 1.826638102531433, + "kl": 1.4294871650636196, + "learning_rate": 6.698167415042134e-07, + "loss": 0.1429, + "num_tokens": 22917625.0, + "reward": 0.83697509765625, + "reward_std": 0.019164595752954483, + "rewards//mean": 0.83697509765625, + "rewards//std": 0.04151083528995514, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3962, + "grad_norm": 3.002192258834839, + "kl": 1.8314725831151009, + "learning_rate": 6.695182376586602e-07, + "loss": 0.1831, + "num_tokens": 22929337.0, + "reward": 0.873046875, + "reward_std": 0.032959818840026855, + "rewards//mean": 0.873046875, + "rewards//std": 0.045057136565446854, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.3964, + "grad_norm": 2.1859326362609863, + "kl": 1.707338634878397, + "learning_rate": 6.692196655311814e-07, + "loss": 0.1722, + "num_tokens": 22940825.0, + "reward": 0.8455810546875, + "reward_std": 0.025838615372776985, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.02863711304962635, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3966, + "grad_norm": 1.920676589012146, + "kl": 0.9823421686887741, + "learning_rate": 6.689210252420415e-07, + "loss": 0.0982, + "num_tokens": 22952441.0, + "reward": 0.83807373046875, + "reward_std": 0.01725853607058525, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.021541284397244453, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3968, + "grad_norm": 5.745406150817871, + "kl": 1.1372113525867462, + "learning_rate": 6.686223169115327e-07, + "loss": 0.1137, + "num_tokens": 22964161.0, + "reward": 0.81024169921875, + "reward_std": 0.011824442073702812, + "rewards//mean": 0.81024169921875, + "rewards//std": 0.016215357929468155, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.397, + "grad_norm": 2.310302257537842, + "kl": 1.528627086430788, + "learning_rate": 6.683235406599749e-07, + "loss": 0.1529, + "num_tokens": 22975761.0, + "reward": 0.8409423828125, + "reward_std": 0.024245552718639374, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.02933482825756073, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3972, + "grad_norm": 1.6763873100280762, + "kl": 1.1625756472349167, + "learning_rate": 6.68024696607715e-07, + "loss": 0.1163, + "num_tokens": 22987417.0, + "reward": 0.8477783203125, + "reward_std": 0.020532622933387756, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.025324661284685135, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3974, + "grad_norm": 2.2681965827941895, + "kl": 0.7213934622704983, + "learning_rate": 6.677257848751276e-07, + "loss": 0.0721, + "num_tokens": 22998985.0, + "reward": 0.8507080078125, + "reward_std": 0.015031334012746811, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.019481195136904716, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3976, + "grad_norm": 2.561939239501953, + "kl": 0.7832465544342995, + "learning_rate": 6.674268055826138e-07, + "loss": 0.0783, + "num_tokens": 23010641.0, + "reward": 0.881591796875, + "reward_std": 0.021049916744232178, + "rewards//mean": 0.881591796875, + "rewards//std": 0.034589339047670364, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3978, + "grad_norm": 2.2611231803894043, + "kl": 1.1256866753101349, + "learning_rate": 6.671277588506029e-07, + "loss": 0.1126, + "num_tokens": 23022257.0, + "reward": 0.8214111328125, + "reward_std": 0.026975851505994797, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.039588894695043564, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.398, + "grad_norm": 4.757654666900635, + "kl": 1.5403595566749573, + "learning_rate": 6.668286447995507e-07, + "loss": 0.154, + "num_tokens": 23033817.0, + "reward": 0.8408203125, + "reward_std": 0.016935892403125763, + "rewards//mean": 0.8408203125, + "rewards//std": 0.02173236943781376, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3982, + "grad_norm": 1.9753440618515015, + "kl": 1.5218686237931252, + "learning_rate": 6.665294635499403e-07, + "loss": 0.1522, + "num_tokens": 23045401.0, + "reward": 0.87158203125, + "reward_std": 0.026619650423526764, + "rewards//mean": 0.87158203125, + "rewards//std": 0.03940511867403984, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3984, + "grad_norm": 2.882289171218872, + "kl": 1.3129476457834244, + "learning_rate": 6.66230215222282e-07, + "loss": 0.1313, + "num_tokens": 23056937.0, + "reward": 0.8782958984375, + "reward_std": 0.037924669682979584, + "rewards//mean": 0.8782958984375, + "rewards//std": 0.041714414954185486, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3986, + "grad_norm": 2.55656361579895, + "kl": 1.5950692631304264, + "learning_rate": 6.659308999371129e-07, + "loss": 0.1595, + "num_tokens": 23068537.0, + "reward": 0.80938720703125, + "reward_std": 0.025430018082261086, + "rewards//mean": 0.80938720703125, + "rewards//std": 0.03127463534474373, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3988, + "grad_norm": 2.7317755222320557, + "kl": 1.6130149587988853, + "learning_rate": 6.65631517814997e-07, + "loss": 0.1613, + "num_tokens": 23080105.0, + "reward": 0.8624267578125, + "reward_std": 0.031233320012688637, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.03737986087799072, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.399, + "grad_norm": 2.4232094287872314, + "kl": 1.2709138169884682, + "learning_rate": 6.653320689765256e-07, + "loss": 0.1271, + "num_tokens": 23091665.0, + "reward": 0.8695068359375, + "reward_std": 0.031599655747413635, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.04066479951143265, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3992, + "grad_norm": 2.372927665710449, + "kl": 1.3732558563351631, + "learning_rate": 6.650325535423166e-07, + "loss": 0.1373, + "num_tokens": 23103217.0, + "reward": 0.8580322265625, + "reward_std": 0.03467214107513428, + "rewards//mean": 0.8580322265625, + "rewards//std": 0.0350162498652935, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3994, + "grad_norm": 5.435511589050293, + "kl": 1.7903770208358765, + "learning_rate": 6.647329716330147e-07, + "loss": 0.179, + "num_tokens": 23114769.0, + "reward": 0.797607421875, + "reward_std": 0.016040265560150146, + "rewards//mean": 0.797607421875, + "rewards//std": 0.017619412392377853, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3996, + "grad_norm": 2.4258031845092773, + "kl": 1.0802893340587616, + "learning_rate": 6.644333233692916e-07, + "loss": 0.108, + "num_tokens": 23126513.0, + "reward": 0.82415771484375, + "reward_std": 0.01424010843038559, + "rewards//mean": 0.82415771484375, + "rewards//std": 0.02108171582221985, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3998, + "grad_norm": 1.774843692779541, + "kl": 1.1401104852557182, + "learning_rate": 6.641336088718456e-07, + "loss": 0.114, + "num_tokens": 23138089.0, + "reward": 0.8304443359375, + "reward_std": 0.019031938165426254, + "rewards//mean": 0.8304443359375, + "rewards//std": 0.0290047749876976, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4, + "grad_norm": 2.122948408126831, + "kl": 1.2672018259763718, + "learning_rate": 6.638338282614014e-07, + "loss": 0.1267, + "num_tokens": 23149665.0, + "reward": 0.8341064453125, + "reward_std": 0.028114745393395424, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.03302827849984169, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}