Lansechen committed on
Commit
9876eaa
·
verified ·
1 Parent(s): 8571024

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-3B
3
+ library_name: transformers
4
+ model_name: Qwen2.5-3B-Open-R1-GRPO-math-selected-cosine-v2
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ license: other
10
+ ---
11
+
12
+ # Model Card for Qwen2.5-3B-Open-R1-GRPO-math-selected-cosine-v2
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Lansechen/Qwen2.5-3B-Open-R1-GRPO-math-selected-cosine-v2", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenran1995-the-chinese-university-of-hong-kong/huggingface/runs/3knvoabh)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.16.0
38
+ - Transformers: 4.50.0
39
+ - PyTorch: 2.5.1+cu121
40
+ - Datasets: 3.5.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": -0.009652293185873921,
4
+ "train_runtime": 33755.1654,
5
+ "train_samples": 11040,
6
+ "train_samples_per_second": 0.654,
7
+ "train_steps_per_second": 0.006
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.50.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": -0.009652293185873921,
4
+ "train_runtime": 33755.1654,
5
+ "train_samples": 11040,
6
+ "train_samples_per_second": 0.654,
7
+ "train_steps_per_second": 0.006
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9936628643852978,
6
+ "eval_steps": 100,
7
+ "global_step": 196,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio": 0.0,
14
+ "completion_length": 544.7745819091797,
15
+ "epoch": 0.010139416983523447,
16
+ "grad_norm": 0.2070285975933075,
17
+ "learning_rate": 5e-08,
18
+ "loss": -0.1577,
19
+ "num_tokens": 636118.0,
20
+ "reward": -0.35065557062625885,
21
+ "reward_std": 0.656378798186779,
22
+ "rewards/cosine_scaled_reward": -0.35400376841425896,
23
+ "rewards/format_reward": 0.003348214435391128,
24
+ "step": 1
25
+ },
26
+ {
27
+ "clip_ratio": 0.0,
28
+ "completion_length": 549.2812843322754,
29
+ "epoch": 0.020278833967046894,
30
+ "grad_norm": 0.2276979684829712,
31
+ "learning_rate": 1e-07,
32
+ "loss": -0.1565,
33
+ "num_tokens": 1263330.0,
34
+ "reward": -0.3044062051922083,
35
+ "reward_std": 0.6904742792248726,
36
+ "rewards/cosine_scaled_reward": -0.31221868470311165,
37
+ "rewards/format_reward": 0.007812500349245965,
38
+ "step": 2
39
+ },
40
+ {
41
+ "clip_ratio": 0.0,
42
+ "completion_length": 588.2891006469727,
43
+ "epoch": 0.030418250950570342,
44
+ "grad_norm": 0.19815148413181305,
45
+ "learning_rate": 1.5e-07,
46
+ "loss": -0.1123,
47
+ "num_tokens": 1930557.0,
48
+ "reward": -0.2540861386805773,
49
+ "reward_std": 0.6807773411273956,
50
+ "rewards/cosine_scaled_reward": -0.2618986228480935,
51
+ "rewards/format_reward": 0.007812500232830644,
52
+ "step": 3
53
+ },
54
+ {
55
+ "clip_ratio": 0.0,
56
+ "completion_length": 552.0669898986816,
57
+ "epoch": 0.04055766793409379,
58
+ "grad_norm": 0.20235052704811096,
59
+ "learning_rate": 2e-07,
60
+ "loss": -0.1652,
61
+ "num_tokens": 2562161.0,
62
+ "reward": -0.3022355027496815,
63
+ "reward_std": 0.6755559742450714,
64
+ "rewards/cosine_scaled_reward": -0.3122801296412945,
65
+ "rewards/format_reward": 0.010044643189758062,
66
+ "step": 4
67
+ },
68
+ {
69
+ "clip_ratio": 0.0,
70
+ "completion_length": 571.6172142028809,
71
+ "epoch": 0.050697084917617236,
72
+ "grad_norm": 0.19000709056854248,
73
+ "learning_rate": 2.5e-07,
74
+ "loss": -0.1267,
75
+ "num_tokens": 3201114.0,
76
+ "reward": -0.2917697671800852,
77
+ "reward_std": 0.6964326798915863,
78
+ "rewards/cosine_scaled_reward": -0.2984661813825369,
79
+ "rewards/format_reward": 0.006696428870782256,
80
+ "step": 5
81
+ },
82
+ {
83
+ "clip_ratio": 0.0,
84
+ "completion_length": 561.6384201049805,
85
+ "epoch": 0.060836501901140684,
86
+ "grad_norm": 0.21064706146717072,
87
+ "learning_rate": 3e-07,
88
+ "loss": -0.1771,
89
+ "num_tokens": 3833534.0,
90
+ "reward": -0.28072800021618605,
91
+ "reward_std": 0.6883805021643639,
92
+ "rewards/cosine_scaled_reward": -0.2840762068517506,
93
+ "rewards/format_reward": 0.003348214435391128,
94
+ "step": 6
95
+ },
96
+ {
97
+ "clip_ratio": 0.0,
98
+ "completion_length": 542.1149749755859,
99
+ "epoch": 0.07097591888466413,
100
+ "grad_norm": 0.22860734164714813,
101
+ "learning_rate": 3.5e-07,
102
+ "loss": -0.131,
103
+ "num_tokens": 4443693.0,
104
+ "reward": -0.23171719023957849,
105
+ "reward_std": 0.7431022375822067,
106
+ "rewards/cosine_scaled_reward": -0.24287789640948176,
107
+ "rewards/format_reward": 0.011160714668221772,
108
+ "step": 7
109
+ },
110
+ {
111
+ "clip_ratio": 0.0,
112
+ "completion_length": 563.7890892028809,
113
+ "epoch": 0.08111533586818757,
114
+ "grad_norm": 0.19847826659679413,
115
+ "learning_rate": 4e-07,
116
+ "loss": -0.142,
117
+ "num_tokens": 5076400.0,
118
+ "reward": -0.3053848221898079,
119
+ "reward_std": 0.6786770969629288,
120
+ "rewards/cosine_scaled_reward": -0.3109651692211628,
121
+ "rewards/format_reward": 0.005580357392318547,
122
+ "step": 8
123
+ },
124
+ {
125
+ "clip_ratio": 0.0,
126
+ "completion_length": 585.7589569091797,
127
+ "epoch": 0.09125475285171103,
128
+ "grad_norm": 0.1838308870792389,
129
+ "learning_rate": 4.5e-07,
130
+ "loss": -0.1486,
131
+ "num_tokens": 5724568.0,
132
+ "reward": -0.26803811825811863,
133
+ "reward_std": 0.6804082244634628,
134
+ "rewards/cosine_scaled_reward": -0.26915417425334454,
135
+ "rewards/format_reward": 0.0011160714784637094,
136
+ "step": 9
137
+ },
138
+ {
139
+ "clip_ratio": 0.0,
140
+ "completion_length": 516.0636291503906,
141
+ "epoch": 0.10139416983523447,
142
+ "grad_norm": 0.6098268032073975,
143
+ "learning_rate": 5e-07,
144
+ "loss": -0.1271,
145
+ "num_tokens": 6313689.0,
146
+ "reward": -0.2813316825777292,
147
+ "reward_std": 0.7330739945173264,
148
+ "rewards/cosine_scaled_reward": -0.2880280986428261,
149
+ "rewards/format_reward": 0.006696428870782256,
150
+ "step": 10
151
+ },
152
+ {
153
+ "clip_ratio": 0.0,
154
+ "completion_length": 568.8984718322754,
155
+ "epoch": 0.11153358681875793,
156
+ "grad_norm": 0.5140683054924011,
157
+ "learning_rate": 5.5e-07,
158
+ "loss": -0.1441,
159
+ "num_tokens": 6960958.0,
160
+ "reward": -0.2723624687641859,
161
+ "reward_std": 0.6729818060994148,
162
+ "rewards/cosine_scaled_reward": -0.275710666552186,
163
+ "rewards/format_reward": 0.003348214435391128,
164
+ "step": 11
165
+ },
166
+ {
167
+ "clip_ratio": 0.0,
168
+ "completion_length": 584.2366409301758,
169
+ "epoch": 0.12167300380228137,
170
+ "grad_norm": 0.20556923747062683,
171
+ "learning_rate": 6e-07,
172
+ "loss": -0.1173,
173
+ "num_tokens": 7615746.0,
174
+ "reward": -0.21572664007544518,
175
+ "reward_std": 0.7041841298341751,
176
+ "rewards/cosine_scaled_reward": -0.2201909152790904,
177
+ "rewards/format_reward": 0.004464285913854837,
178
+ "step": 12
179
+ },
180
+ {
181
+ "clip_ratio": 0.0,
182
+ "completion_length": 598.1797065734863,
183
+ "epoch": 0.13181242078580482,
184
+ "grad_norm": 0.19161486625671387,
185
+ "learning_rate": 6.5e-07,
186
+ "loss": -0.1216,
187
+ "num_tokens": 8279899.0,
188
+ "reward": -0.23263627663254738,
189
+ "reward_std": 0.7194378077983856,
190
+ "rewards/cosine_scaled_reward": -0.23933268897235394,
191
+ "rewards/format_reward": 0.006696428870782256,
192
+ "step": 13
193
+ },
194
+ {
195
+ "clip_ratio": 0.0,
196
+ "completion_length": 539.4096183776855,
197
+ "epoch": 0.14195183776932827,
198
+ "grad_norm": 0.20079070329666138,
199
+ "learning_rate": 7e-07,
200
+ "loss": -0.1091,
201
+ "num_tokens": 8895386.0,
202
+ "reward": -0.21676091896370053,
203
+ "reward_std": 0.7302467301487923,
204
+ "rewards/cosine_scaled_reward": -0.22209045942872763,
205
+ "rewards/format_reward": 0.0011160714784637094,
206
+ "step": 14
207
+ },
208
+ {
209
+ "clip_ratio": 0.0,
210
+ "completion_length": 581.7645378112793,
211
+ "epoch": 0.1520912547528517,
212
+ "grad_norm": 0.21947209537029266,
213
+ "learning_rate": 7.5e-07,
214
+ "loss": -0.097,
215
+ "num_tokens": 9545087.0,
216
+ "reward": -0.10511145042255521,
217
+ "reward_std": 0.7483287081122398,
218
+ "rewards/cosine_scaled_reward": -0.10957573121413589,
219
+ "rewards/format_reward": 0.004464285797439516,
220
+ "step": 15
221
+ },
222
+ {
223
+ "clip_ratio": 0.0,
224
+ "completion_length": 587.3013725280762,
225
+ "epoch": 0.16223067173637515,
226
+ "grad_norm": 0.20107129216194153,
227
+ "learning_rate": 8e-07,
228
+ "loss": -0.0894,
229
+ "num_tokens": 10198237.0,
230
+ "reward": -0.14020115474704653,
231
+ "reward_std": 0.7795401588082314,
232
+ "rewards/cosine_scaled_reward": -0.14689757686574012,
233
+ "rewards/format_reward": 0.006696428870782256,
234
+ "step": 16
235
+ },
236
+ {
237
+ "clip_ratio": 0.0,
238
+ "completion_length": 616.6361923217773,
239
+ "epoch": 0.17237008871989862,
240
+ "grad_norm": 0.17595961689949036,
241
+ "learning_rate": 8.499999999999999e-07,
242
+ "loss": -0.1258,
243
+ "num_tokens": 10885255.0,
244
+ "reward": -0.16032503126189113,
245
+ "reward_std": 0.7151542007923126,
246
+ "rewards/cosine_scaled_reward": -0.16478930541779846,
247
+ "rewards/format_reward": 0.004464285913854837,
248
+ "step": 17
249
+ },
250
+ {
251
+ "clip_ratio": 0.0,
252
+ "completion_length": 614.3348388671875,
253
+ "epoch": 0.18250950570342206,
254
+ "grad_norm": 0.17326559126377106,
255
+ "learning_rate": 9e-07,
256
+ "loss": -0.1223,
257
+ "num_tokens": 11559043.0,
258
+ "reward": -0.05317553365603089,
259
+ "reward_std": 0.8043533340096474,
260
+ "rewards/cosine_scaled_reward": -0.055407675448805094,
261
+ "rewards/format_reward": 0.0022321429569274187,
262
+ "step": 18
263
+ },
264
+ {
265
+ "clip_ratio": 0.0,
266
+ "completion_length": 576.6774749755859,
267
+ "epoch": 0.1926489226869455,
268
+ "grad_norm": 0.19331927597522736,
269
+ "learning_rate": 9.499999999999999e-07,
270
+ "loss": -0.0965,
271
+ "num_tokens": 12207730.0,
272
+ "reward": -0.01400054944679141,
273
+ "reward_std": 0.8140848129987717,
274
+ "rewards/cosine_scaled_reward": -0.017348763067275286,
275
+ "rewards/format_reward": 0.003348214435391128,
276
+ "step": 19
277
+ },
278
+ {
279
+ "clip_ratio": 0.0,
280
+ "completion_length": 659.9911117553711,
281
+ "epoch": 0.20278833967046894,
282
+ "grad_norm": 0.16300565004348755,
283
+ "learning_rate": 1e-06,
284
+ "loss": -0.0491,
285
+ "num_tokens": 12919506.0,
286
+ "reward": 0.12035302398726344,
287
+ "reward_std": 0.7825085148215294,
288
+ "rewards/cosine_scaled_reward": 0.11588872922584414,
289
+ "rewards/format_reward": 0.004464285797439516,
290
+ "step": 20
291
+ },
292
+ {
293
+ "clip_ratio": 0.0,
294
+ "completion_length": 641.3013687133789,
295
+ "epoch": 0.21292775665399238,
296
+ "grad_norm": 0.24253493547439575,
297
+ "learning_rate": 9.999203468625015e-07,
298
+ "loss": -0.0523,
299
+ "num_tokens": 13615384.0,
300
+ "reward": 0.19580535404384136,
301
+ "reward_std": 0.8355192169547081,
302
+ "rewards/cosine_scaled_reward": 0.19357320480048656,
303
+ "rewards/format_reward": 0.0022321429569274187,
304
+ "step": 21
305
+ },
306
+ {
307
+ "clip_ratio": 0.0,
308
+ "completion_length": 694.412971496582,
309
+ "epoch": 0.22306717363751585,
310
+ "grad_norm": 0.22512537240982056,
311
+ "learning_rate": 9.99681412828496e-07,
312
+ "loss": -0.0494,
313
+ "num_tokens": 14370738.0,
314
+ "reward": 0.2461150388699025,
315
+ "reward_std": 0.7426658719778061,
316
+ "rewards/cosine_scaled_reward": 0.24388288520276546,
317
+ "rewards/format_reward": 0.0022321429569274187,
318
+ "step": 22
319
+ },
320
+ {
321
+ "clip_ratio": 0.0,
322
+ "completion_length": 675.4542617797852,
323
+ "epoch": 0.2332065906210393,
324
+ "grad_norm": 0.16492308676242828,
325
+ "learning_rate": 9.992832740253644e-07,
326
+ "loss": -0.0398,
327
+ "num_tokens": 15114193.0,
328
+ "reward": 0.2721172561869025,
329
+ "reward_std": 0.8025888651609421,
330
+ "rewards/cosine_scaled_reward": 0.2710011739982292,
331
+ "rewards/format_reward": 0.0011160714784637094,
332
+ "step": 23
333
+ },
334
+ {
335
+ "clip_ratio": 0.0,
336
+ "completion_length": 719.147346496582,
337
+ "epoch": 0.24334600760456274,
338
+ "grad_norm": 0.1536169797182083,
339
+ "learning_rate": 9.987260573051267e-07,
340
+ "loss": -0.0321,
341
+ "num_tokens": 15888093.0,
342
+ "reward": 0.30394730158150196,
343
+ "reward_std": 0.7235733941197395,
344
+ "rewards/cosine_scaled_reward": 0.30394728668034077,
345
+ "rewards/format_reward": 0.0,
346
+ "step": 24
347
+ },
348
+ {
349
+ "clip_ratio": 0.0,
350
+ "completion_length": 665.2589645385742,
351
+ "epoch": 0.2534854245880862,
352
+ "grad_norm": 0.15167948603630066,
353
+ "learning_rate": 9.98009940204023e-07,
354
+ "loss": -0.0196,
355
+ "num_tokens": 16615493.0,
356
+ "reward": 0.4243004210293293,
357
+ "reward_std": 0.784424401819706,
358
+ "rewards/cosine_scaled_reward": 0.42318433709442616,
359
+ "rewards/format_reward": 0.0011160714784637094,
360
+ "step": 25
361
+ },
362
+ {
363
+ "clip_ratio": 0.0,
364
+ "completion_length": 680.0814971923828,
365
+ "epoch": 0.26362484157160965,
366
+ "grad_norm": 0.1787332445383072,
367
+ "learning_rate": 9.971351508859486e-07,
368
+ "loss": -0.0554,
369
+ "num_tokens": 17343678.0,
370
+ "reward": 0.5126287564635277,
371
+ "reward_std": 0.7500302940607071,
372
+ "rewards/cosine_scaled_reward": 0.5126287154853344,
373
+ "rewards/format_reward": 0.0,
374
+ "step": 26
375
+ },
376
+ {
377
+ "clip_ratio": 0.0,
378
+ "completion_length": 691.2957992553711,
379
+ "epoch": 0.2737642585551331,
380
+ "grad_norm": 0.15422694385051727,
381
+ "learning_rate": 9.961019680697591e-07,
382
+ "loss": 0.0019,
383
+ "num_tokens": 18091255.0,
384
+ "reward": 0.478223854675889,
385
+ "reward_std": 0.7884277924895287,
386
+ "rewards/cosine_scaled_reward": 0.4771077586337924,
387
+ "rewards/format_reward": 0.0011160714784637094,
388
+ "step": 27
389
+ },
390
+ {
391
+ "clip_ratio": 0.0,
392
+ "completion_length": 724.2433319091797,
393
+ "epoch": 0.28390367553865653,
394
+ "grad_norm": 0.14453695714473724,
395
+ "learning_rate": 9.949107209404663e-07,
396
+ "loss": 0.0015,
397
+ "num_tokens": 18873817.0,
398
+ "reward": 0.6080499514937401,
399
+ "reward_std": 0.7710720151662827,
400
+ "rewards/cosine_scaled_reward": 0.6080499142408371,
401
+ "rewards/format_reward": 0.0,
402
+ "step": 28
403
+ },
404
+ {
405
+ "clip_ratio": 0.0,
406
+ "completion_length": 719.5837326049805,
407
+ "epoch": 0.29404309252217997,
408
+ "grad_norm": 0.14530107378959656,
409
+ "learning_rate": 9.935617890443554e-07,
410
+ "loss": 0.0026,
411
+ "num_tokens": 19659300.0,
412
+ "reward": 0.6268112398684025,
413
+ "reward_std": 0.6641118600964546,
414
+ "rewards/cosine_scaled_reward": 0.6268112063407898,
415
+ "rewards/format_reward": 0.0,
416
+ "step": 29
417
+ },
418
+ {
419
+ "clip_ratio": 0.0,
420
+ "completion_length": 793.0312805175781,
421
+ "epoch": 0.3041825095057034,
422
+ "grad_norm": 0.1358717679977417,
423
+ "learning_rate": 9.92055602168058e-07,
424
+ "loss": -0.0059,
425
+ "num_tokens": 20515344.0,
426
+ "reward": 0.6840809024870396,
427
+ "reward_std": 0.6523778513073921,
428
+ "rewards/cosine_scaled_reward": 0.6840808726847172,
429
+ "rewards/format_reward": 0.0,
430
+ "step": 30
431
+ },
432
+ {
433
+ "clip_ratio": 0.0,
434
+ "completion_length": 766.2176818847656,
435
+ "epoch": 0.31432192648922685,
436
+ "grad_norm": 0.13609299063682556,
437
+ "learning_rate": 9.90392640201615e-07,
438
+ "loss": 0.0048,
439
+ "num_tokens": 21329483.0,
440
+ "reward": 0.7440241314470768,
441
+ "reward_std": 0.6845037117600441,
442
+ "rewards/cosine_scaled_reward": 0.7440241128206253,
443
+ "rewards/format_reward": 0.0,
444
+ "step": 31
445
+ },
446
+ {
447
+ "clip_ratio": 0.0,
448
+ "completion_length": 760.3225784301758,
449
+ "epoch": 0.3244613434727503,
450
+ "grad_norm": 0.1422865241765976,
451
+ "learning_rate": 9.885734329855797e-07,
452
+ "loss": 0.0076,
453
+ "num_tokens": 22130476.0,
454
+ "reward": 0.8551607951521873,
455
+ "reward_std": 0.6532174497842789,
456
+ "rewards/cosine_scaled_reward": 0.8551607728004456,
457
+ "rewards/format_reward": 0.0,
458
+ "step": 32
459
+ },
460
+ {
461
+ "clip_ratio": 0.0,
462
+ "completion_length": 788.7120895385742,
463
+ "epoch": 0.33460076045627374,
464
+ "grad_norm": 0.13245096802711487,
465
+ "learning_rate": 9.865985601422017e-07,
466
+ "loss": -0.0077,
467
+ "num_tokens": 22971218.0,
468
+ "reward": 0.7257559858262539,
469
+ "reward_std": 0.6099549047648907,
470
+ "rewards/cosine_scaled_reward": 0.7302779853343964,
471
+ "rewards/format_reward": 0.0,
472
+ "step": 33
473
+ },
474
+ {
475
+ "clip_ratio": 0.0,
476
+ "completion_length": 802.2344055175781,
477
+ "epoch": 0.34474017743979724,
478
+ "grad_norm": 0.13606438040733337,
479
+ "learning_rate": 9.844686508907537e-07,
480
+ "loss": -0.0022,
481
+ "num_tokens": 23817508.0,
482
+ "reward": 0.9126424044370651,
483
+ "reward_std": 0.6410668790340424,
484
+ "rewards/cosine_scaled_reward": 0.9126423671841621,
485
+ "rewards/format_reward": 0.0,
486
+ "step": 34
487
+ },
488
+ {
489
+ "clip_ratio": 0.0,
490
+ "completion_length": 779.1975784301758,
491
+ "epoch": 0.3548795944233207,
492
+ "grad_norm": 0.13803985714912415,
493
+ "learning_rate": 9.821843838470534e-07,
494
+ "loss": -0.0014,
495
+ "num_tokens": 24658397.0,
496
+ "reward": 0.951663188636303,
497
+ "reward_std": 0.6128395721316338,
498
+ "rewards/cosine_scaled_reward": 0.9516631364822388,
499
+ "rewards/format_reward": 0.0,
500
+ "step": 35
501
+ },
502
+ {
503
+ "clip_ratio": 0.0,
504
+ "completion_length": 790.4141006469727,
505
+ "epoch": 0.3650190114068441,
506
+ "grad_norm": 0.15671393275260925,
507
+ "learning_rate": 9.797464868072486e-07,
508
+ "loss": 0.0435,
509
+ "num_tokens": 25494720.0,
510
+ "reward": 0.952189639210701,
511
+ "reward_std": 0.6134774163365364,
512
+ "rewards/cosine_scaled_reward": 0.9521896243095398,
513
+ "rewards/format_reward": 0.0,
514
+ "step": 36
515
+ },
516
+ {
517
+ "clip_ratio": 0.0,
518
+ "completion_length": 780.6808395385742,
519
+ "epoch": 0.37515842839036756,
520
+ "grad_norm": 0.13327902555465698,
521
+ "learning_rate": 9.771557365159319e-07,
522
+ "loss": -0.0051,
523
+ "num_tokens": 26319738.0,
524
+ "reward": 0.9673999920487404,
525
+ "reward_std": 0.5673549436032772,
526
+ "rewards/cosine_scaled_reward": 0.9673999547958374,
527
+ "rewards/format_reward": 0.0,
528
+ "step": 37
529
+ },
530
+ {
531
+ "clip_ratio": 0.0,
532
+ "completion_length": 797.1105194091797,
533
+ "epoch": 0.385297845373891,
534
+ "grad_norm": 0.14108316600322723,
535
+ "learning_rate": 9.744129584186597e-07,
536
+ "loss": -0.0108,
537
+ "num_tokens": 27160557.0,
538
+ "reward": 0.9532012119889259,
539
+ "reward_std": 0.5637913830578327,
540
+ "rewards/cosine_scaled_reward": 0.9532012045383453,
541
+ "rewards/format_reward": 0.0,
542
+ "step": 38
543
+ },
544
+ {
545
+ "clip_ratio": 0.0,
546
+ "completion_length": 857.9230270385742,
547
+ "epoch": 0.39543726235741444,
548
+ "grad_norm": 0.13035330176353455,
549
+ "learning_rate": 9.71519026398956e-07,
550
+ "loss": 0.0268,
551
+ "num_tokens": 28067040.0,
552
+ "reward": 0.9902723953127861,
553
+ "reward_std": 0.5246665701270103,
554
+ "rewards/cosine_scaled_reward": 0.9902723580598831,
555
+ "rewards/format_reward": 0.0,
556
+ "step": 39
557
+ },
558
+ {
559
+ "clip_ratio": 0.0,
560
+ "completion_length": 793.2377471923828,
561
+ "epoch": 0.4055766793409379,
562
+ "grad_norm": 0.13015364110469818,
563
+ "learning_rate": 9.68474862499881e-07,
564
+ "loss": 0.0268,
565
+ "num_tokens": 28897045.0,
566
+ "reward": 1.063771203160286,
567
+ "reward_std": 0.49807993322610855,
568
+ "rewards/cosine_scaled_reward": 1.0637711435556412,
569
+ "rewards/format_reward": 0.0,
570
+ "step": 40
571
+ },
572
+ {
573
+ "clip_ratio": 0.0,
574
+ "completion_length": 776.3783798217773,
575
+ "epoch": 0.4157160963244613,
576
+ "grad_norm": 0.12312573939561844,
577
+ "learning_rate": 9.652814366302568e-07,
578
+ "loss": 0.0136,
579
+ "num_tokens": 29718656.0,
580
+ "reward": 1.1480179950594902,
581
+ "reward_std": 0.5026725828647614,
582
+ "rewards/cosine_scaled_reward": 1.1480179578065872,
583
+ "rewards/format_reward": 0.0,
584
+ "step": 41
585
+ },
586
+ {
587
+ "clip_ratio": 0.0,
588
+ "completion_length": 782.9832916259766,
589
+ "epoch": 0.42585551330798477,
590
+ "grad_norm": 0.12757915258407593,
591
+ "learning_rate": 9.619397662556433e-07,
592
+ "loss": 0.0089,
593
+ "num_tokens": 30549177.0,
594
+ "reward": 1.153884269297123,
595
+ "reward_std": 0.43890631198883057,
596
+ "rewards/cosine_scaled_reward": 1.1538842469453812,
597
+ "rewards/format_reward": 0.0,
598
+ "step": 42
599
+ },
600
+ {
601
+ "clip_ratio": 0.0,
602
+ "completion_length": 855.8906631469727,
603
+ "epoch": 0.43599493029150826,
604
+ "grad_norm": 0.11601021140813828,
605
+ "learning_rate": 9.5845091607416e-07,
606
+ "loss": -0.0027,
607
+ "num_tokens": 31434759.0,
608
+ "reward": 1.1496833115816116,
609
+ "reward_std": 0.44448646157979965,
610
+ "rewards/cosine_scaled_reward": 1.1496832445263863,
611
+ "rewards/format_reward": 0.0,
612
+ "step": 43
613
+ },
614
+ {
615
+ "clip_ratio": 0.0,
616
+ "completion_length": 862.2332916259766,
617
+ "epoch": 0.4461343472750317,
618
+ "grad_norm": 0.11746807396411896,
619
+ "learning_rate": 9.548159976772592e-07,
620
+ "loss": 0.0197,
621
+ "num_tokens": 32337368.0,
622
+ "reward": 1.1307310312986374,
623
+ "reward_std": 0.4776446260511875,
624
+ "rewards/cosine_scaled_reward": 1.130730964243412,
625
+ "rewards/format_reward": 0.0,
626
+ "step": 44
627
+ },
628
+ {
629
+ "clip_ratio": 0.0,
630
+ "completion_length": 900.1975860595703,
631
+ "epoch": 0.45627376425855515,
632
+ "grad_norm": 0.1248885914683342,
633
+ "learning_rate": 9.510361691955606e-07,
634
+ "loss": 0.0165,
635
+ "num_tokens": 33266393.0,
636
+ "reward": 1.060933604836464,
637
+ "reward_std": 0.46989936381578445,
638
+ "rewards/cosine_scaled_reward": 1.060933567583561,
639
+ "rewards/format_reward": 0.0,
640
+ "step": 45
641
+ },
642
+ {
643
+ "clip_ratio": 0.0,
644
+ "completion_length": 815.2812881469727,
645
+ "epoch": 0.4664131812420786,
646
+ "grad_norm": 0.9569998979568481,
647
+ "learning_rate": 9.471126349298556e-07,
648
+ "loss": 0.0354,
649
+ "num_tokens": 34127101.0,
650
+ "reward": 1.151917465031147,
651
+ "reward_std": 0.5036191828548908,
652
+ "rewards/cosine_scaled_reward": 1.1519174352288246,
653
+ "rewards/format_reward": 0.0,
654
+ "step": 46
655
+ },
656
+ {
657
+ "clip_ratio": 0.0,
658
+ "completion_length": 833.6239242553711,
659
+ "epoch": 0.47655259822560203,
660
+ "grad_norm": 0.15996558964252472,
661
+ "learning_rate": 9.430466449674013e-07,
662
+ "loss": -0.0196,
663
+ "num_tokens": 35022380.0,
664
+ "reward": 1.0830270573496819,
665
+ "reward_std": 0.4072240814566612,
666
+ "rewards/cosine_scaled_reward": 1.0830269828438759,
667
+ "rewards/format_reward": 0.0,
668
+ "step": 47
669
+ },
670
+ {
671
+ "clip_ratio": 0.0,
672
+ "completion_length": 892.449821472168,
673
+ "epoch": 0.4866920152091255,
674
+ "grad_norm": 0.11413749307394028,
675
+ "learning_rate": 9.388394947836278e-07,
676
+ "loss": -0.0125,
677
+ "num_tokens": 35948751.0,
678
+ "reward": 1.0179572254419327,
679
+ "reward_std": 0.45433366298675537,
680
+ "rewards/cosine_scaled_reward": 1.0179571583867073,
681
+ "rewards/format_reward": 0.0,
682
+ "step": 48
683
+ },
684
+ {
685
+ "clip_ratio": 0.0,
686
+ "completion_length": 912.2645492553711,
687
+ "epoch": 0.4968314321926489,
688
+ "grad_norm": 0.11429373919963837,
689
+ "learning_rate": 9.344925248293835e-07,
690
+ "loss": -0.001,
691
+ "num_tokens": 36887148.0,
692
+ "reward": 1.1400989294052124,
693
+ "reward_std": 0.4401072785258293,
694
+ "rewards/cosine_scaled_reward": 1.1400988548994064,
695
+ "rewards/format_reward": 0.0,
696
+ "step": 49
697
+ },
698
+ {
699
+ "clip_ratio": 0.0,
700
+ "completion_length": 866.2511596679688,
701
+ "epoch": 0.5069708491761724,
702
+ "grad_norm": 0.11980035156011581,
703
+ "learning_rate": 9.300071201038501e-07,
704
+ "loss": 0.0215,
705
+ "num_tokens": 37789485.0,
706
+ "reward": 1.13294617831707,
707
+ "reward_std": 0.4442785568535328,
708
+ "rewards/cosine_scaled_reward": 1.1329461187124252,
709
+ "rewards/format_reward": 0.0,
710
+ "step": 50
711
+ },
712
+ {
713
+ "clip_ratio": 0.0,
714
+ "completion_length": 799.482177734375,
715
+ "epoch": 0.5171102661596958,
716
+ "grad_norm": 0.122144915163517,
717
+ "learning_rate": 9.253847097132655e-07,
718
+ "loss": 0.0044,
719
+ "num_tokens": 38636645.0,
720
+ "reward": 1.2101461067795753,
721
+ "reward_std": 0.41771573945879936,
722
+ "rewards/cosine_scaled_reward": 1.210146002471447,
723
+ "rewards/format_reward": 0.0,
724
+ "step": 51
725
+ },
726
+ {
727
+ "clip_ratio": 0.0,
728
+ "completion_length": 847.4330749511719,
729
+ "epoch": 0.5272496831432193,
730
+ "grad_norm": 0.1199701800942421,
731
+ "learning_rate": 9.206267664155906e-07,
732
+ "loss": 0.0082,
733
+ "num_tokens": 39516321.0,
734
+ "reward": 1.1085221618413925,
735
+ "reward_std": 0.4500999916344881,
736
+ "rewards/cosine_scaled_reward": 1.1085221245884895,
737
+ "rewards/format_reward": 0.0,
738
+ "step": 52
739
+ },
740
+ {
741
+ "clip_ratio": 0.0,
742
+ "completion_length": 873.8102951049805,
743
+ "epoch": 0.5373891001267427,
744
+ "grad_norm": 0.14303860068321228,
745
+ "learning_rate": 9.157348061512726e-07,
746
+ "loss": -0.0033,
747
+ "num_tokens": 40435335.0,
748
+ "reward": 1.1145585402846336,
749
+ "reward_std": 0.47080911323428154,
750
+ "rewards/cosine_scaled_reward": 1.1145584508776665,
751
+ "rewards/format_reward": 0.0,
752
+ "step": 53
753
+ },
754
+ {
755
+ "clip_ratio": 0.0,
756
+ "completion_length": 961.9029312133789,
757
+ "epoch": 0.5475285171102662,
758
+ "grad_norm": 0.11308898031711578,
759
+ "learning_rate": 9.107103875602458e-07,
760
+ "loss": -0.0123,
761
+ "num_tokens": 41415752.0,
762
+ "reward": 1.0546733513474464,
763
+ "reward_std": 0.41840433329343796,
764
+ "rewards/cosine_scaled_reward": 1.054673284292221,
765
+ "rewards/format_reward": 0.0,
766
+ "step": 54
767
+ },
768
+ {
769
+ "clip_ratio": 0.0,
770
+ "completion_length": 892.6484680175781,
771
+ "epoch": 0.5576679340937896,
772
+ "grad_norm": 0.12083946913480759,
773
+ "learning_rate": 9.055551114853295e-07,
774
+ "loss": 0.0043,
775
+ "num_tokens": 42345957.0,
776
+ "reward": 1.0607689917087555,
777
+ "reward_std": 0.4757756106555462,
778
+ "rewards/cosine_scaled_reward": 1.0607689544558525,
779
+ "rewards/format_reward": 0.0,
780
+ "step": 55
781
+ },
782
+ {
783
+ "clip_ratio": 0.0,
784
+ "completion_length": 823.7578506469727,
785
+ "epoch": 0.5678073510773131,
786
+ "grad_norm": 0.21660108864307404,
787
+ "learning_rate": 9.002706204621802e-07,
788
+ "loss": 0.0311,
789
+ "num_tokens": 43207724.0,
790
+ "reward": 1.1668334901332855,
791
+ "reward_std": 0.482263408601284,
792
+ "rewards/cosine_scaled_reward": 1.1668334528803825,
793
+ "rewards/format_reward": 0.0,
794
+ "step": 56
795
+ },
796
+ {
797
+ "clip_ratio": 0.0,
798
+ "completion_length": 844.7210159301758,
799
+ "epoch": 0.5779467680608364,
800
+ "grad_norm": 0.4439857602119446,
801
+ "learning_rate": 8.948585981959578e-07,
802
+ "loss": 0.0112,
803
+ "num_tokens": 44096178.0,
804
+ "reward": 1.1872114390134811,
805
+ "reward_std": 0.44185368344187737,
806
+ "rewards/cosine_scaled_reward": 1.1872113794088364,
807
+ "rewards/format_reward": 0.0,
808
+ "step": 57
809
+ },
810
+ {
811
+ "clip_ratio": 0.0,
812
+ "completion_length": 832.4553909301758,
813
+ "epoch": 0.5880861850443599,
814
+ "grad_norm": 0.12814383208751678,
815
+ "learning_rate": 8.893207690248775e-07,
816
+ "loss": -0.0022,
817
+ "num_tokens": 44963122.0,
818
+ "reward": 1.1495825573801994,
819
+ "reward_std": 0.43896420300006866,
820
+ "rewards/cosine_scaled_reward": 1.149582527577877,
821
+ "rewards/format_reward": 0.0,
822
+ "step": 58
823
+ },
824
+ {
825
+ "clip_ratio": 0.0,
826
+ "completion_length": 819.3661117553711,
827
+ "epoch": 0.5982256020278834,
828
+ "grad_norm": 0.12473784387111664,
829
+ "learning_rate": 8.836588973708128e-07,
830
+ "loss": -0.0152,
831
+ "num_tokens": 45838866.0,
832
+ "reward": 1.101597860455513,
833
+ "reward_std": 0.44026929885149,
834
+ "rewards/cosine_scaled_reward": 1.1015978381037712,
835
+ "rewards/format_reward": 0.0,
836
+ "step": 59
837
+ },
838
+ {
839
+ "clip_ratio": 0.0,
840
+ "completion_length": 883.5223617553711,
841
+ "epoch": 0.6083650190114068,
842
+ "grad_norm": 0.12360174208879471,
843
+ "learning_rate": 8.778747871771291e-07,
844
+ "loss": 0.0119,
845
+ "num_tokens": 46767750.0,
846
+ "reward": 1.1120553016662598,
847
+ "reward_std": 0.43890899419784546,
848
+ "rewards/cosine_scaled_reward": 1.1120552495121956,
849
+ "rewards/format_reward": 0.0,
850
+ "step": 60
851
+ },
852
+ {
853
+ "clip_ratio": 0.0,
854
+ "completion_length": 830.4174499511719,
855
+ "epoch": 0.6185044359949303,
856
+ "grad_norm": 0.12516207993030548,
857
+ "learning_rate": 8.719702813339247e-07,
858
+ "loss": 0.007,
859
+ "num_tokens": 47634148.0,
860
+ "reward": 1.1350341737270355,
861
+ "reward_std": 0.3932863809168339,
862
+ "rewards/cosine_scaled_reward": 1.1350341215729713,
863
+ "rewards/format_reward": 0.0,
864
+ "step": 61
865
+ },
866
+ {
867
+ "clip_ratio": 0.0,
868
+ "completion_length": 860.8895492553711,
869
+ "epoch": 0.6286438529784537,
870
+ "grad_norm": 0.11765479296445847,
871
+ "learning_rate": 8.659472610908627e-07,
872
+ "loss": -0.0127,
873
+ "num_tokens": 48534897.0,
874
+ "reward": 1.0959327295422554,
875
+ "reward_std": 0.43786268681287766,
876
+ "rewards/cosine_scaled_reward": 1.09593266248703,
877
+ "rewards/format_reward": 0.0,
878
+ "step": 62
879
+ },
880
+ {
881
+ "clip_ratio": 0.0,
882
+ "completion_length": 789.957633972168,
883
+ "epoch": 0.6387832699619772,
884
+ "grad_norm": 0.12538108229637146,
885
+ "learning_rate": 8.598076454577814e-07,
886
+ "loss": 0.0043,
887
+ "num_tokens": 49366995.0,
888
+ "reward": 1.2128012925386429,
889
+ "reward_std": 0.5132176093757153,
890
+ "rewards/cosine_scaled_reward": 1.2128012254834175,
891
+ "rewards/format_reward": 0.0,
892
+ "step": 63
893
+ },
894
+ {
895
+ "clip_ratio": 0.0,
896
+ "completion_length": 814.1540603637695,
897
+ "epoch": 0.6489226869455006,
898
+ "grad_norm": 0.12521201372146606,
899
+ "learning_rate": 8.535533905932737e-07,
900
+ "loss": 0.031,
901
+ "num_tokens": 50244605.0,
902
+ "reward": 1.2290623039007187,
903
+ "reward_std": 0.35417212545871735,
904
+ "rewards/cosine_scaled_reward": 1.2290622666478157,
905
+ "rewards/format_reward": 0.0,
906
+ "step": 64
907
+ },
908
+ {
909
+ "clip_ratio": 0.0,
910
+ "completion_length": 826.7444610595703,
911
+ "epoch": 0.6590621039290241,
912
+ "grad_norm": 0.1192706748843193,
913
+ "learning_rate": 8.471864891814304e-07,
914
+ "loss": 0.0187,
915
+ "num_tokens": 51104472.0,
916
+ "reward": 1.2224428355693817,
917
+ "reward_std": 0.4238445721566677,
918
+ "rewards/cosine_scaled_reward": 1.2224428057670593,
919
+ "rewards/format_reward": 0.0,
920
+ "step": 65
921
+ },
922
+ {
923
+ "clip_ratio": 0.0,
924
+ "completion_length": 890.8181228637695,
925
+ "epoch": 0.6692015209125475,
926
+ "grad_norm": 0.11663277447223663,
927
+ "learning_rate": 8.407089697969456e-07,
928
+ "loss": -0.0075,
929
+ "num_tokens": 52027509.0,
930
+ "reward": 1.1307330876588821,
931
+ "reward_std": 0.43055112659931183,
932
+ "rewards/cosine_scaled_reward": 1.130733035504818,
933
+ "rewards/format_reward": 0.0,
934
+ "step": 66
935
+ },
936
+ {
937
+ "clip_ratio": 0.0,
938
+ "completion_length": 896.5524978637695,
939
+ "epoch": 0.679340937896071,
940
+ "grad_norm": 0.11125318706035614,
941
+ "learning_rate": 8.341228962587881e-07,
942
+ "loss": -0.0128,
943
+ "num_tokens": 52966948.0,
944
+ "reward": 1.1304329186677933,
945
+ "reward_std": 0.38643651083111763,
946
+ "rewards/cosine_scaled_reward": 1.1304328739643097,
947
+ "rewards/format_reward": 0.0,
948
+ "step": 67
949
+ },
950
+ {
951
+ "clip_ratio": 0.0,
952
+ "completion_length": 805.1919937133789,
953
+ "epoch": 0.6894803548795945,
954
+ "grad_norm": 0.1318301111459732,
955
+ "learning_rate": 8.274303669726426e-07,
956
+ "loss": 0.0267,
957
+ "num_tokens": 53809720.0,
958
+ "reward": 1.2848119288682938,
959
+ "reward_std": 0.4024609997868538,
960
+ "rewards/cosine_scaled_reward": 1.2848118543624878,
961
+ "rewards/format_reward": 0.0,
962
+ "step": 68
963
+ },
964
+ {
965
+ "clip_ratio": 0.0,
966
+ "completion_length": 897.3783950805664,
967
+ "epoch": 0.6996197718631179,
968
+ "grad_norm": 0.11534889042377472,
969
+ "learning_rate": 8.206335142623304e-07,
970
+ "loss": -0.0213,
971
+ "num_tokens": 54743531.0,
972
+ "reward": 1.0870068296790123,
973
+ "reward_std": 0.396763913333416,
974
+ "rewards/cosine_scaled_reward": 1.0870067551732063,
975
+ "rewards/format_reward": 0.0,
976
+ "step": 69
977
+ },
978
+ {
979
+ "clip_ratio": 0.0,
980
+ "completion_length": 849.7935562133789,
981
+ "epoch": 0.7097591888466414,
982
+ "grad_norm": 0.11737702041864395,
983
+ "learning_rate": 8.137345036904259e-07,
984
+ "loss": 0.0049,
985
+ "num_tokens": 55624322.0,
986
+ "reward": 1.1217299401760101,
987
+ "reward_std": 0.4600865840911865,
988
+ "rewards/cosine_scaled_reward": 1.1217298731207848,
989
+ "rewards/format_reward": 0.0,
990
+ "step": 70
991
+ },
992
+ {
993
+ "clip_ratio": 0.0,
994
+ "completion_length": 843.4699020385742,
995
+ "epoch": 0.7198986058301647,
996
+ "grad_norm": 0.12105900794267654,
997
+ "learning_rate": 8.067355333682797e-07,
998
+ "loss": 0.0058,
999
+ "num_tokens": 56518839.0,
1000
+ "reward": 1.128207340836525,
1001
+ "reward_std": 0.39387310668826103,
1002
+ "rewards/cosine_scaled_reward": 1.1282072588801384,
1003
+ "rewards/format_reward": 0.0,
1004
+ "step": 71
1005
+ },
1006
+ {
1007
+ "clip_ratio": 0.0,
1008
+ "completion_length": 854.9007110595703,
1009
+ "epoch": 0.7300380228136882,
1010
+ "grad_norm": 0.11754778772592545,
1011
+ "learning_rate": 7.996388332556734e-07,
1012
+ "loss": 0.0184,
1013
+ "num_tokens": 57413550.0,
1014
+ "reward": 1.1404274106025696,
1015
+ "reward_std": 0.45089153945446014,
1016
+ "rewards/cosine_scaled_reward": 1.1404273584485054,
1017
+ "rewards/format_reward": 0.0,
1018
+ "step": 72
1019
+ },
1020
+ {
1021
+ "clip_ratio": 0.0,
1022
+ "completion_length": 900.8973541259766,
1023
+ "epoch": 0.7401774397972116,
1024
+ "grad_norm": 0.11590257287025452,
1025
+ "learning_rate": 7.924466644503264e-07,
1026
+ "loss": -0.0059,
1027
+ "num_tokens": 58353250.0,
1028
+ "reward": 1.1319697871804237,
1029
+ "reward_std": 0.4219101257622242,
1030
+ "rewards/cosine_scaled_reward": 1.131969727575779,
1031
+ "rewards/format_reward": 0.0,
1032
+ "step": 73
1033
+ },
1034
+ {
1035
+ "clip_ratio": 0.0,
1036
+ "completion_length": 806.4665603637695,
1037
+ "epoch": 0.7503168567807351,
1038
+ "grad_norm": 0.12108156830072403,
1039
+ "learning_rate": 7.85161318467482e-07,
1040
+ "loss": -0.0001,
1041
+ "num_tokens": 59211228.0,
1042
+ "reward": 1.1962338984012604,
1043
+ "reward_std": 0.45812737941741943,
1044
+ "rewards/cosine_scaled_reward": 1.196233868598938,
1045
+ "rewards/format_reward": 0.0,
1046
+ "step": 74
1047
+ },
1048
+ {
1049
+ "clip_ratio": 0.0,
1050
+ "completion_length": 812.6629791259766,
1051
+ "epoch": 0.7604562737642585,
1052
+ "grad_norm": 0.12231780588626862,
1053
+ "learning_rate": 7.777851165098011e-07,
1054
+ "loss": 0.0223,
1055
+ "num_tokens": 60070054.0,
1056
+ "reward": 1.2705483883619308,
1057
+ "reward_std": 0.4315441697835922,
1058
+ "rewards/cosine_scaled_reward": 1.2705483138561249,
1059
+ "rewards/format_reward": 0.0,
1060
+ "step": 75
1061
+ },
1062
+ {
1063
+ "clip_ratio": 0.0,
1064
+ "completion_length": 792.5323944091797,
1065
+ "epoch": 0.770595690747782,
1066
+ "grad_norm": 0.13156548142433167,
1067
+ "learning_rate": 7.703204087277988e-07,
1068
+ "loss": 0.0054,
1069
+ "num_tokens": 60907403.0,
1070
+ "reward": 1.0341035649180412,
1071
+ "reward_std": 0.4903283417224884,
1072
+ "rewards/cosine_scaled_reward": 1.0341035276651382,
1073
+ "rewards/format_reward": 0.0,
1074
+ "step": 76
1075
+ },
1076
+ {
1077
+ "clip_ratio": 0.0,
1078
+ "completion_length": 843.8315200805664,
1079
+ "epoch": 0.7807351077313055,
1080
+ "grad_norm": 0.12101699411869049,
1081
+ "learning_rate": 7.627695734710564e-07,
1082
+ "loss": 0.0109,
1083
+ "num_tokens": 61785700.0,
1084
+ "reward": 1.226922646164894,
1085
+ "reward_std": 0.38846197351813316,
1086
+ "rewards/cosine_scaled_reward": 1.2269226014614105,
1087
+ "rewards/format_reward": 0.0,
1088
+ "step": 77
1089
+ },
1090
+ {
1091
+ "clip_ratio": 0.0,
1092
+ "completion_length": 859.2433395385742,
1093
+ "epoch": 0.7908745247148289,
1094
+ "grad_norm": 0.1146969199180603,
1095
+ "learning_rate": 7.551350165304499e-07,
1096
+ "loss": 0.0219,
1097
+ "num_tokens": 62673358.0,
1098
+ "reward": 1.2758738100528717,
1099
+ "reward_std": 0.4126880206167698,
1100
+ "rewards/cosine_scaled_reward": 1.275873750448227,
1101
+ "rewards/format_reward": 0.0,
1102
+ "step": 78
1103
+ },
1104
+ {
1105
+ "clip_ratio": 0.0,
1106
+ "completion_length": 847.4732513427734,
1107
+ "epoch": 0.8010139416983524,
1108
+ "grad_norm": 0.11725428700447083,
1109
+ "learning_rate": 7.474191703716338e-07,
1110
+ "loss": -0.0134,
1111
+ "num_tokens": 63563254.0,
1112
+ "reward": 1.1727145612239838,
1113
+ "reward_std": 0.3684752322733402,
1114
+ "rewards/cosine_scaled_reward": 1.1727144792675972,
1115
+ "rewards/format_reward": 0.0,
1116
+ "step": 79
1117
+ },
1118
+ {
1119
+ "clip_ratio": 0.0,
1120
+ "completion_length": 831.0089721679688,
1121
+ "epoch": 0.8111533586818758,
1122
+ "grad_norm": 0.1215280145406723,
1123
+ "learning_rate": 7.396244933600284e-07,
1124
+ "loss": 0.0269,
1125
+ "num_tokens": 64432126.0,
1126
+ "reward": 1.1777353882789612,
1127
+ "reward_std": 0.420895554125309,
1128
+ "rewards/cosine_scaled_reward": 1.1777353659272194,
1129
+ "rewards/format_reward": 0.0,
1130
+ "step": 80
1131
+ },
1132
+ {
1133
+ "clip_ratio": 0.0,
1134
+ "completion_length": 848.4598617553711,
1135
+ "epoch": 0.8212927756653993,
1136
+ "grad_norm": 0.13223014771938324,
1137
+ "learning_rate": 7.317534689775527e-07,
1138
+ "loss": 0.0072,
1139
+ "num_tokens": 65312050.0,
1140
+ "reward": 1.1698129922151566,
1141
+ "reward_std": 0.4710509404540062,
1142
+ "rewards/cosine_scaled_reward": 1.1698129400610924,
1143
+ "rewards/format_reward": 0.0,
1144
+ "step": 81
1145
+ },
1146
+ {
1147
+ "clip_ratio": 0.0,
1148
+ "completion_length": 811.4230346679688,
1149
+ "epoch": 0.8314321926489227,
1150
+ "grad_norm": 0.12015990167856216,
1151
+ "learning_rate": 7.238086050313562e-07,
1152
+ "loss": -0.0264,
1153
+ "num_tokens": 66174341.0,
1154
+ "reward": 1.124290645122528,
1155
+ "reward_std": 0.40254683420062065,
1156
+ "rewards/cosine_scaled_reward": 1.1242906004190445,
1157
+ "rewards/format_reward": 0.0,
1158
+ "step": 82
1159
+ },
1160
+ {
1161
+ "clip_ratio": 0.0,
1162
+ "completion_length": 819.8382110595703,
1163
+ "epoch": 0.8415716096324461,
1164
+ "grad_norm": 0.14578643441200256,
1165
+ "learning_rate": 7.157924328548002e-07,
1166
+ "loss": 0.026,
1167
+ "num_tokens": 67042508.0,
1168
+ "reward": 1.2035124003887177,
1169
+ "reward_std": 0.4501505568623543,
1170
+ "rewards/cosine_scaled_reward": 1.2035123333334923,
1171
+ "rewards/format_reward": 0.0,
1172
+ "step": 83
1173
+ },
1174
+ {
1175
+ "clip_ratio": 0.0,
1176
+ "completion_length": 832.864990234375,
1177
+ "epoch": 0.8517110266159695,
1178
+ "grad_norm": 0.11915823817253113,
1179
+ "learning_rate": 7.077075065009433e-07,
1180
+ "loss": 0.0419,
1181
+ "num_tokens": 67922067.0,
1182
+ "reward": 1.2239351123571396,
1183
+ "reward_std": 0.4415438733994961,
1184
+ "rewards/cosine_scaled_reward": 1.2239350602030754,
1185
+ "rewards/format_reward": 0.0,
1186
+ "step": 84
1187
+ },
1188
+ {
1189
+ "clip_ratio": 0.0,
1190
+ "completion_length": 794.1897659301758,
1191
+ "epoch": 0.861850443599493,
1192
+ "grad_norm": 0.12470311671495438,
1193
+ "learning_rate": 6.995564019287869e-07,
1194
+ "loss": 0.0085,
1195
+ "num_tokens": 68763477.0,
1196
+ "reward": 1.1386212185025215,
1197
+ "reward_std": 0.4416712336242199,
1198
+ "rewards/cosine_scaled_reward": 1.1386211588978767,
1199
+ "rewards/format_reward": 0.0,
1200
+ "step": 85
1201
+ },
1202
+ {
1203
+ "clip_ratio": 0.0,
1204
+ "completion_length": 865.2344207763672,
1205
+ "epoch": 0.8719898605830165,
1206
+ "grad_norm": 0.1167558878660202,
1207
+ "learning_rate": 6.913417161825449e-07,
1208
+ "loss": -0.0029,
1209
+ "num_tokens": 69673639.0,
1210
+ "reward": 1.1339842602610588,
1211
+ "reward_std": 0.44360488280653954,
1212
+ "rewards/cosine_scaled_reward": 1.1339841783046722,
1213
+ "rewards/format_reward": 0.0,
1214
+ "step": 86
1215
+ },
1216
+ {
1217
+ "clip_ratio": 0.0,
1218
+ "completion_length": 807.192008972168,
1219
+ "epoch": 0.8821292775665399,
1220
+ "grad_norm": 0.22383332252502441,
1221
+ "learning_rate": 6.830660665641897e-07,
1222
+ "loss": 0.0167,
1223
+ "num_tokens": 70526603.0,
1224
+ "reward": 1.1783367395401,
1225
+ "reward_std": 0.44941645860671997,
1226
+ "rewards/cosine_scaled_reward": 1.1783367022871971,
1227
+ "rewards/format_reward": 0.0,
1228
+ "step": 87
1229
+ },
1230
+ {
1231
+ "clip_ratio": 0.0,
1232
+ "completion_length": 842.9832916259766,
1233
+ "epoch": 0.8922686945500634,
1234
+ "grad_norm": 0.12214665859937668,
1235
+ "learning_rate": 6.747320897995492e-07,
1236
+ "loss": 0.0028,
1237
+ "num_tokens": 71410908.0,
1238
+ "reward": 1.1979937851428986,
1239
+ "reward_std": 0.41686391085386276,
1240
+ "rewards/cosine_scaled_reward": 1.1979937255382538,
1241
+ "rewards/format_reward": 0.0,
1242
+ "step": 88
1243
+ },
1244
+ {
1245
+ "clip_ratio": 0.0,
1246
+ "completion_length": 752.7991409301758,
1247
+ "epoch": 0.9024081115335868,
1248
+ "grad_norm": 0.12918852269649506,
1249
+ "learning_rate": 6.66342441198212e-07,
1250
+ "loss": -0.0199,
1251
+ "num_tokens": 72209320.0,
1252
+ "reward": 1.1556047648191452,
1253
+ "reward_std": 0.39559851959347725,
1254
+ "rewards/cosine_scaled_reward": 1.1556047424674034,
1255
+ "rewards/format_reward": 0.0,
1256
+ "step": 89
1257
+ },
1258
+ {
1259
+ "clip_ratio": 0.0,
1260
+ "completion_length": 788.5815048217773,
1261
+ "epoch": 0.9125475285171103,
1262
+ "grad_norm": 0.2962648272514343,
1263
+ "learning_rate": 6.578997938075125e-07,
1264
+ "loss": 0.0258,
1265
+ "num_tokens": 73040361.0,
1266
+ "reward": 1.1919841021299362,
1267
+ "reward_std": 0.40032988227903843,
1268
+ "rewards/cosine_scaled_reward": 1.1919840648770332,
1269
+ "rewards/format_reward": 0.0,
1270
+ "step": 90
1271
+ },
1272
+ {
1273
+ "clip_ratio": 0.0,
1274
+ "completion_length": 787.3214645385742,
1275
+ "epoch": 0.9226869455006337,
1276
+ "grad_norm": 0.15694542229175568,
1277
+ "learning_rate": 6.494068375608646e-07,
1278
+ "loss": -0.0147,
1279
+ "num_tokens": 73880425.0,
1280
+ "reward": 1.1829868927598,
1281
+ "reward_std": 0.3718248065561056,
1282
+ "rewards/cosine_scaled_reward": 1.1829868257045746,
1283
+ "rewards/format_reward": 0.0,
1284
+ "step": 91
1285
+ },
1286
+ {
1287
+ "clip_ratio": 0.0,
1288
+ "completion_length": 821.3850708007812,
1289
+ "epoch": 0.9328263624841572,
1290
+ "grad_norm": 0.11674854159355164,
1291
+ "learning_rate": 6.408662784207149e-07,
1292
+ "loss": 0.0095,
1293
+ "num_tokens": 74755762.0,
1294
+ "reward": 1.2243514209985733,
1295
+ "reward_std": 0.4170701913535595,
1296
+ "rewards/cosine_scaled_reward": 1.2243513762950897,
1297
+ "rewards/format_reward": 0.0,
1298
+ "step": 92
1299
+ },
1300
+ {
1301
+ "clip_ratio": 0.0,
1302
+ "completion_length": 761.3248138427734,
1303
+ "epoch": 0.9429657794676806,
1304
+ "grad_norm": 0.12954537570476532,
1305
+ "learning_rate": 6.322808375163895e-07,
1306
+ "loss": 0.0386,
1307
+ "num_tokens": 75564789.0,
1308
+ "reward": 1.1891243010759354,
1309
+ "reward_std": 0.45389125496149063,
1310
+ "rewards/cosine_scaled_reward": 1.1891242563724518,
1311
+ "rewards/format_reward": 0.0,
1312
+ "step": 93
1313
+ },
1314
+ {
1315
+ "clip_ratio": 0.0,
1316
+ "completion_length": 793.9643249511719,
1317
+ "epoch": 0.9531051964512041,
1318
+ "grad_norm": 0.12744048237800598,
1319
+ "learning_rate": 6.236532502771077e-07,
1320
+ "loss": 0.0271,
1321
+ "num_tokens": 76409781.0,
1322
+ "reward": 1.1992642730474472,
1323
+ "reward_std": 0.4242793843150139,
1324
+ "rewards/cosine_scaled_reward": 1.1992641612887383,
1325
+ "rewards/format_reward": 0.0,
1326
+ "step": 94
1327
+ },
1328
+ {
1329
+ "clip_ratio": 0.0,
1330
+ "completion_length": 789.4219207763672,
1331
+ "epoch": 0.9632446134347274,
1332
+ "grad_norm": 0.12654094398021698,
1333
+ "learning_rate": 6.149862655604403e-07,
1334
+ "loss": 0.0339,
1335
+ "num_tokens": 77242791.0,
1336
+ "reward": 1.2284981906414032,
1337
+ "reward_std": 0.42066042125225067,
1338
+ "rewards/cosine_scaled_reward": 1.2284981459379196,
1339
+ "rewards/format_reward": 0.0,
1340
+ "step": 95
1341
+ },
1342
+ {
1343
+ "clip_ratio": 0.0,
1344
+ "completion_length": 795.716552734375,
1345
+ "epoch": 0.973384030418251,
1346
+ "grad_norm": 0.12548381090164185,
1347
+ "learning_rate": 6.062826447764883e-07,
1348
+ "loss": 0.008,
1349
+ "num_tokens": 78078497.0,
1350
+ "reward": 1.2442717999219894,
1351
+ "reward_std": 0.35396804101765156,
1352
+ "rewards/cosine_scaled_reward": 1.2442717254161835,
1353
+ "rewards/format_reward": 0.0,
1354
+ "step": 96
1355
+ },
1356
+ {
1357
+ "clip_ratio": 0.0,
1358
+ "completion_length": 793.0692291259766,
1359
+ "epoch": 0.9835234474017744,
1360
+ "grad_norm": 0.12744414806365967,
1361
+ "learning_rate": 5.975451610080642e-07,
1362
+ "loss": -0.0194,
1363
+ "num_tokens": 78916895.0,
1364
+ "reward": 1.1445267572999,
1365
+ "reward_std": 0.37857919186353683,
1366
+ "rewards/cosine_scaled_reward": 1.1445267051458359,
1367
+ "rewards/format_reward": 0.0,
1368
+ "step": 97
1369
+ },
1370
+ {
1371
+ "clip_ratio": 0.0,
1372
+ "completion_length": 782.1171875,
1373
+ "epoch": 0.9936628643852978,
1374
+ "grad_norm": 0.12226420640945435,
1375
+ "learning_rate": 5.887765981271517e-07,
1376
+ "loss": 0.0066,
1377
+ "num_tokens": 79784779.0,
1378
+ "reward": 1.168272152543068,
1379
+ "reward_std": 0.39856788516044617,
1380
+ "rewards/cosine_scaled_reward": 1.1682721078395844,
1381
+ "rewards/format_reward": 0.0,
1382
+ "step": 98
1383
+ },
1384
+ {
1385
+ "clip_ratio": 0.0,
1386
+ "completion_length": 783.4386367797852,
1387
+ "epoch": 1.0101394169835234,
1388
+ "grad_norm": 0.13228166103363037,
1389
+ "learning_rate": 5.7997974990793e-07,
1390
+ "loss": 0.0145,
1391
+ "num_tokens": 80615732.0,
1392
+ "reward": 1.2336678951978683,
1393
+ "reward_std": 0.38141428492963314,
1394
+ "rewards/cosine_scaled_reward": 1.233667865395546,
1395
+ "rewards/format_reward": 0.0,
1396
+ "step": 99
1397
+ },
1398
+ {
1399
+ "clip_ratio": 0.0,
1400
+ "completion_length": 806.2355346679688,
1401
+ "epoch": 1.020278833967047,
1402
+ "grad_norm": 0.12670038640499115,
1403
+ "learning_rate": 5.711574191366427e-07,
1404
+ "loss": 0.0028,
1405
+ "num_tokens": 81465495.0,
1406
+ "reward": 1.1623064577579498,
1407
+ "reward_std": 0.3932801876217127,
1408
+ "rewards/cosine_scaled_reward": 1.1623064130544662,
1409
+ "rewards/format_reward": 0.0,
1410
+ "step": 100
1411
+ },
1412
+ {
1413
+ "clip_ratio": 0.0,
1414
+ "completion_length": 790.1205749511719,
1415
+ "epoch": 1.0304182509505704,
1416
+ "grad_norm": 0.12086564302444458,
1417
+ "learning_rate": 5.623124167185929e-07,
1418
+ "loss": 0.0063,
1419
+ "num_tokens": 82295971.0,
1420
+ "reward": 1.2574269622564316,
1421
+ "reward_std": 0.3898525554686785,
1422
+ "rewards/cosine_scaled_reward": 1.257426917552948,
1423
+ "rewards/format_reward": 0.0,
1424
+ "step": 101
1425
+ },
1426
+ {
1427
+ "clip_ratio": 0.0,
1428
+ "completion_length": 758.897346496582,
1429
+ "epoch": 1.0405576679340938,
1430
+ "grad_norm": 0.14097033441066742,
1431
+ "learning_rate": 5.534475607825565e-07,
1432
+ "loss": -0.0126,
1433
+ "num_tokens": 83103231.0,
1434
+ "reward": 1.2127035036683083,
1435
+ "reward_std": 0.3951477538794279,
1436
+ "rewards/cosine_scaled_reward": 1.2127034589648247,
1437
+ "rewards/format_reward": 0.0,
1438
+ "step": 102
1439
+ },
1440
+ {
1441
+ "clip_ratio": 0.0,
1442
+ "completion_length": 823.2377624511719,
1443
+ "epoch": 1.0506970849176172,
1444
+ "grad_norm": 0.11900278925895691,
1445
+ "learning_rate": 5.445656757828879e-07,
1446
+ "loss": -0.0341,
1447
+ "num_tokens": 83972156.0,
1448
+ "reward": 1.0786512047052383,
1449
+ "reward_std": 0.386748855933547,
1450
+ "rewards/cosine_scaled_reward": 1.078651174902916,
1451
+ "rewards/format_reward": 0.0,
1452
+ "step": 103
1453
+ },
1454
+ {
1455
+ "clip_ratio": 0.0,
1456
+ "completion_length": 778.1082992553711,
1457
+ "epoch": 1.0608365019011408,
1458
+ "grad_norm": 0.12485373765230179,
1459
+ "learning_rate": 5.356695915996161e-07,
1460
+ "loss": -0.0152,
1461
+ "num_tokens": 84803701.0,
1462
+ "reward": 1.1336441561579704,
1463
+ "reward_std": 0.43268290534615517,
1464
+ "rewards/cosine_scaled_reward": 1.1336440965533257,
1465
+ "rewards/format_reward": 0.0,
1466
+ "step": 104
1467
+ },
1468
+ {
1469
+ "clip_ratio": 0.0,
1470
+ "completion_length": 747.7790451049805,
1471
+ "epoch": 1.0709759188846641,
1472
+ "grad_norm": 0.14275644719600677,
1473
+ "learning_rate": 5.267621426368075e-07,
1474
+ "loss": 0.0402,
1475
+ "num_tokens": 85592927.0,
1476
+ "reward": 1.2611572295427322,
1477
+ "reward_std": 0.4418743886053562,
1478
+ "rewards/cosine_scaled_reward": 1.2611571997404099,
1479
+ "rewards/format_reward": 0.0,
1480
+ "step": 105
1481
+ },
1482
+ {
1483
+ "clip_ratio": 0.0,
1484
+ "completion_length": 686.8861770629883,
1485
+ "epoch": 1.0811153358681875,
1486
+ "grad_norm": 0.2392619252204895,
1487
+ "learning_rate": 5.178461669194903e-07,
1488
+ "loss": 0.0233,
1489
+ "num_tokens": 86335585.0,
1490
+ "reward": 1.2902618646621704,
1491
+ "reward_std": 0.4318929500877857,
1492
+ "rewards/cosine_scaled_reward": 1.2902617901563644,
1493
+ "rewards/format_reward": 0.0,
1494
+ "step": 106
1495
+ },
1496
+ {
1497
+ "clip_ratio": 0.0,
1498
+ "completion_length": 765.6752548217773,
1499
+ "epoch": 1.091254752851711,
1500
+ "grad_norm": 0.12888778746128082,
1501
+ "learning_rate": 5.08924505189423e-07,
1502
+ "loss": -0.0053,
1503
+ "num_tokens": 87147590.0,
1504
+ "reward": 1.2406404912471771,
1505
+ "reward_std": 0.3950204625725746,
1506
+ "rewards/cosine_scaled_reward": 1.2406404465436935,
1507
+ "rewards/format_reward": 0.0,
1508
+ "step": 107
1509
+ },
1510
+ {
1511
+ "clip_ratio": 0.0,
1512
+ "completion_length": 774.6596298217773,
1513
+ "epoch": 1.1013941698352345,
1514
+ "grad_norm": 0.16063496470451355,
1515
+ "learning_rate": 5e-07,
1516
+ "loss": 0.0407,
1517
+ "num_tokens": 87965805.0,
1518
+ "reward": 1.2785077691078186,
1519
+ "reward_std": 0.39564304798841476,
1520
+ "rewards/cosine_scaled_reward": 1.2785076946020126,
1521
+ "rewards/format_reward": 0.0,
1522
+ "step": 108
1523
+ },
1524
+ {
1525
+ "clip_ratio": 0.0,
1526
+ "completion_length": 800.3393249511719,
1527
+ "epoch": 1.111533586818758,
1528
+ "grad_norm": 0.23322370648384094,
1529
+ "learning_rate": 4.91075494810577e-07,
1530
+ "loss": 0.0181,
1531
+ "num_tokens": 88800901.0,
1532
+ "reward": 1.2374378964304924,
1533
+ "reward_std": 0.39184754714369774,
1534
+ "rewards/cosine_scaled_reward": 1.2374378442764282,
1535
+ "rewards/format_reward": 0.0,
1536
+ "step": 109
1537
+ },
1538
+ {
1539
+ "clip_ratio": 0.0,
1540
+ "completion_length": 802.3839721679688,
1541
+ "epoch": 1.1216730038022813,
1542
+ "grad_norm": 0.12637439370155334,
1543
+ "learning_rate": 4.821538330805098e-07,
1544
+ "loss": -0.0142,
1545
+ "num_tokens": 89646357.0,
1546
+ "reward": 1.2594422698020935,
1547
+ "reward_std": 0.3456545788794756,
1548
+ "rewards/cosine_scaled_reward": 1.2594422399997711,
1549
+ "rewards/format_reward": 0.0,
1550
+ "step": 110
1551
+ },
1552
+ {
1553
+ "clip_ratio": 0.0,
1554
+ "completion_length": 804.2109680175781,
1555
+ "epoch": 1.131812420785805,
1556
+ "grad_norm": 0.4338585138320923,
1557
+ "learning_rate": 4.732378573631924e-07,
1558
+ "loss": -0.0124,
1559
+ "num_tokens": 90489658.0,
1560
+ "reward": 1.204567864537239,
1561
+ "reward_std": 0.44952966272830963,
1562
+ "rewards/cosine_scaled_reward": 1.2045678049325943,
1563
+ "rewards/format_reward": 0.0,
1564
+ "step": 111
1565
+ },
1566
+ {
1567
+ "clip_ratio": 0.0,
1568
+ "completion_length": 814.7243728637695,
1569
+ "epoch": 1.1419518377693283,
1570
+ "grad_norm": 0.12191635370254517,
1571
+ "learning_rate": 4.643304084003838e-07,
1572
+ "loss": 0.0138,
1573
+ "num_tokens": 91347099.0,
1574
+ "reward": 1.1750361323356628,
1575
+ "reward_std": 0.4308737814426422,
1576
+ "rewards/cosine_scaled_reward": 1.1750360652804375,
1577
+ "rewards/format_reward": 0.0,
1578
+ "step": 112
1579
+ },
1580
+ {
1581
+ "clip_ratio": 0.0,
1582
+ "completion_length": 767.3794937133789,
1583
+ "epoch": 1.1520912547528517,
1584
+ "grad_norm": 0.13367751240730286,
1585
+ "learning_rate": 4.55434324217112e-07,
1586
+ "loss": -0.026,
1587
+ "num_tokens": 92163911.0,
1588
+ "reward": 1.0875676944851875,
1589
+ "reward_std": 0.4285357743501663,
1590
+ "rewards/cosine_scaled_reward": 1.0875676572322845,
1591
+ "rewards/format_reward": 0.0,
1592
+ "step": 113
1593
+ },
1594
+ {
1595
+ "clip_ratio": 0.0,
1596
+ "completion_length": 759.1138687133789,
1597
+ "epoch": 1.162230671736375,
1598
+ "grad_norm": 0.6044357419013977,
1599
+ "learning_rate": 4.4655243921744367e-07,
1600
+ "loss": -0.0034,
1601
+ "num_tokens": 92969037.0,
1602
+ "reward": 1.1716800779104233,
1603
+ "reward_std": 0.40124501287937164,
1604
+ "rewards/cosine_scaled_reward": 1.1716800406575203,
1605
+ "rewards/format_reward": 0.0,
1606
+ "step": 114
1607
+ },
1608
+ {
1609
+ "clip_ratio": 0.0,
1610
+ "completion_length": 742.3248138427734,
1611
+ "epoch": 1.1723700887198987,
1612
+ "grad_norm": 0.12603311240673065,
1613
+ "learning_rate": 4.37687583281407e-07,
1614
+ "loss": -0.0106,
1615
+ "num_tokens": 93761376.0,
1616
+ "reward": 1.2006227523088455,
1617
+ "reward_std": 0.3855649037286639,
1618
+ "rewards/cosine_scaled_reward": 1.2006226852536201,
1619
+ "rewards/format_reward": 0.0,
1620
+ "step": 115
1621
+ },
1622
+ {
1623
+ "clip_ratio": 0.0,
1624
+ "completion_length": 750.4765930175781,
1625
+ "epoch": 1.182509505703422,
1626
+ "grad_norm": 0.13189645111560822,
1627
+ "learning_rate": 4.2884258086335745e-07,
1628
+ "loss": 0.0114,
1629
+ "num_tokens": 94573043.0,
1630
+ "reward": 1.2511205673217773,
1631
+ "reward_std": 0.408273559063673,
1632
+ "rewards/cosine_scaled_reward": 1.2511205524206161,
1633
+ "rewards/format_reward": 0.0,
1634
+ "step": 116
1635
+ },
1636
+ {
1637
+ "clip_ratio": 0.0,
1638
+ "completion_length": 748.3750305175781,
1639
+ "epoch": 1.1926489226869454,
1640
+ "grad_norm": 0.13386943936347961,
1641
+ "learning_rate": 4.2002025009206987e-07,
1642
+ "loss": 0.0081,
1643
+ "num_tokens": 95375539.0,
1644
+ "reward": 1.1935091465711594,
1645
+ "reward_std": 0.39292260631918907,
1646
+ "rewards/cosine_scaled_reward": 1.1935090869665146,
1647
+ "rewards/format_reward": 0.0,
1648
+ "step": 117
1649
+ },
1650
+ {
1651
+ "clip_ratio": 0.0,
1652
+ "completion_length": 752.021240234375,
1653
+ "epoch": 1.202788339670469,
1654
+ "grad_norm": 0.13410834968090057,
1655
+ "learning_rate": 4.1122340187284845e-07,
1656
+ "loss": -0.0055,
1657
+ "num_tokens": 96171638.0,
1658
+ "reward": 1.215317651629448,
1659
+ "reward_std": 0.4204988572746515,
1660
+ "rewards/cosine_scaled_reward": 1.2153176292777061,
1661
+ "rewards/format_reward": 0.0,
1662
+ "step": 118
1663
+ },
1664
+ {
1665
+ "clip_ratio": 0.0,
1666
+ "completion_length": 741.1294937133789,
1667
+ "epoch": 1.2129277566539924,
1668
+ "grad_norm": 0.14352215826511383,
1669
+ "learning_rate": 4.0245483899193586e-07,
1670
+ "loss": -0.0021,
1671
+ "num_tokens": 96963066.0,
1672
+ "reward": 1.171108528971672,
1673
+ "reward_std": 0.390108622610569,
1674
+ "rewards/cosine_scaled_reward": 1.171108491718769,
1675
+ "rewards/format_reward": 0.0,
1676
+ "step": 119
1677
+ },
1678
+ {
1679
+ "clip_ratio": 0.0,
1680
+ "completion_length": 731.9676666259766,
1681
+ "epoch": 1.2230671736375158,
1682
+ "grad_norm": 0.3331538140773773,
1683
+ "learning_rate": 3.937173552235116e-07,
1684
+ "loss": -0.0142,
1685
+ "num_tokens": 97751717.0,
1686
+ "reward": 1.1743304282426834,
1687
+ "reward_std": 0.3502536006271839,
1688
+ "rewards/cosine_scaled_reward": 1.174330398440361,
1689
+ "rewards/format_reward": 0.0,
1690
+ "step": 120
1691
+ },
1692
+ {
1693
+ "clip_ratio": 0.0,
1694
+ "completion_length": 722.037971496582,
1695
+ "epoch": 1.2332065906210392,
1696
+ "grad_norm": 0.5637516975402832,
1697
+ "learning_rate": 3.850137344395598e-07,
1698
+ "loss": 0.0232,
1699
+ "num_tokens": 98521879.0,
1700
+ "reward": 1.261700451374054,
1701
+ "reward_std": 0.34792254120111465,
1702
+ "rewards/cosine_scaled_reward": 1.2617004364728928,
1703
+ "rewards/format_reward": 0.0,
1704
+ "step": 121
1705
+ },
1706
+ {
1707
+ "clip_ratio": 0.0,
1708
+ "completion_length": 810.3984832763672,
1709
+ "epoch": 1.2433460076045628,
1710
+ "grad_norm": 0.12412155419588089,
1711
+ "learning_rate": 3.763467497228922e-07,
1712
+ "loss": -0.0175,
1713
+ "num_tokens": 99386028.0,
1714
+ "reward": 1.1121523007750511,
1715
+ "reward_std": 0.4185631051659584,
1716
+ "rewards/cosine_scaled_reward": 1.11215228587389,
1717
+ "rewards/format_reward": 0.0,
1718
+ "step": 122
1719
+ },
1720
+ {
1721
+ "clip_ratio": 0.0,
1722
+ "completion_length": 762.1272659301758,
1723
+ "epoch": 1.2534854245880862,
1724
+ "grad_norm": 0.5518940091133118,
1725
+ "learning_rate": 3.677191624836106e-07,
1726
+ "loss": -0.0156,
1727
+ "num_tokens": 100190606.0,
1728
+ "reward": 1.2098966389894485,
1729
+ "reward_std": 0.42287516221404076,
1730
+ "rewards/cosine_scaled_reward": 1.2098965868353844,
1731
+ "rewards/format_reward": 0.0,
1732
+ "step": 123
1733
+ },
1734
+ {
1735
+ "clip_ratio": 0.0,
1736
+ "completion_length": 797.1741409301758,
1737
+ "epoch": 1.2636248415716096,
1738
+ "grad_norm": 0.13090519607067108,
1739
+ "learning_rate": 3.591337215792851e-07,
1740
+ "loss": -0.0049,
1741
+ "num_tokens": 101033042.0,
1742
+ "reward": 1.2026162892580032,
1743
+ "reward_std": 0.38727976381778717,
1744
+ "rewards/cosine_scaled_reward": 1.2026162147521973,
1745
+ "rewards/format_reward": 0.0,
1746
+ "step": 124
1747
+ },
1748
+ {
1749
+ "clip_ratio": 0.0,
1750
+ "completion_length": 755.0368728637695,
1751
+ "epoch": 1.2737642585551332,
1752
+ "grad_norm": 0.13797762989997864,
1753
+ "learning_rate": 3.505931624391355e-07,
1754
+ "loss": 0.0129,
1755
+ "num_tokens": 101827819.0,
1756
+ "reward": 1.1926257461309433,
1757
+ "reward_std": 0.44687049090862274,
1758
+ "rewards/cosine_scaled_reward": 1.1926256865262985,
1759
+ "rewards/format_reward": 0.0,
1760
+ "step": 125
1761
+ },
1762
+ {
1763
+ "clip_ratio": 0.0,
1764
+ "completion_length": 741.3024978637695,
1765
+ "epoch": 1.2839036755386566,
1766
+ "grad_norm": 0.18938252329826355,
1767
+ "learning_rate": 3.421002061924876e-07,
1768
+ "loss": 0.0201,
1769
+ "num_tokens": 102618890.0,
1770
+ "reward": 1.263178527355194,
1771
+ "reward_std": 0.3987303748726845,
1772
+ "rewards/cosine_scaled_reward": 1.2631784602999687,
1773
+ "rewards/format_reward": 0.0,
1774
+ "step": 126
1775
+ },
1776
+ {
1777
+ "clip_ratio": 0.0,
1778
+ "completion_length": 747.1696701049805,
1779
+ "epoch": 1.29404309252218,
1780
+ "grad_norm": 0.13063876330852509,
1781
+ "learning_rate": 3.3365755880178807e-07,
1782
+ "loss": 0.0312,
1783
+ "num_tokens": 103410058.0,
1784
+ "reward": 1.2827118784189224,
1785
+ "reward_std": 0.3673780020326376,
1786
+ "rewards/cosine_scaled_reward": 1.2827118039131165,
1787
+ "rewards/format_reward": 0.0,
1788
+ "step": 127
1789
+ },
1790
+ {
1791
+ "clip_ratio": 0.0,
1792
+ "completion_length": 796.0893173217773,
1793
+ "epoch": 1.3041825095057034,
1794
+ "grad_norm": 0.12410367280244827,
1795
+ "learning_rate": 3.2526791020045087e-07,
1796
+ "loss": 0.0153,
1797
+ "num_tokens": 104255954.0,
1798
+ "reward": 1.13737141340971,
1799
+ "reward_std": 0.43270402774214745,
1800
+ "rewards/cosine_scaled_reward": 1.1373713836073875,
1801
+ "rewards/format_reward": 0.0,
1802
+ "step": 128
1803
+ },
1804
+ {
1805
+ "clip_ratio": 0.0,
1806
+ "completion_length": 709.1953353881836,
1807
+ "epoch": 1.3143219264892267,
1808
+ "grad_norm": 0.13791009783744812,
1809
+ "learning_rate": 3.169339334358104e-07,
1810
+ "loss": 0.0012,
1811
+ "num_tokens": 105024737.0,
1812
+ "reward": 1.2293454632163048,
1813
+ "reward_std": 0.36772353760898113,
1814
+ "rewards/cosine_scaled_reward": 1.2293454110622406,
1815
+ "rewards/format_reward": 0.0,
1816
+ "step": 129
1817
+ },
1818
+ {
1819
+ "clip_ratio": 0.0,
1820
+ "completion_length": 754.825927734375,
1821
+ "epoch": 1.3244613434727504,
1822
+ "grad_norm": 0.13509666919708252,
1823
+ "learning_rate": 3.086582838174551e-07,
1824
+ "loss": -0.0033,
1825
+ "num_tokens": 105833549.0,
1826
+ "reward": 1.1948751211166382,
1827
+ "reward_std": 0.37703990191221237,
1828
+ "rewards/cosine_scaled_reward": 1.1948750466108322,
1829
+ "rewards/format_reward": 0.0,
1830
+ "step": 130
1831
+ },
1832
+ {
1833
+ "clip_ratio": 0.0,
1834
+ "completion_length": 752.9855194091797,
1835
+ "epoch": 1.3346007604562737,
1836
+ "grad_norm": 0.13409507274627686,
1837
+ "learning_rate": 3.004435980712129e-07,
1838
+ "loss": -0.0299,
1839
+ "num_tokens": 106631688.0,
1840
+ "reward": 1.2489657402038574,
1841
+ "reward_std": 0.3884260356426239,
1842
+ "rewards/cosine_scaled_reward": 1.248965710401535,
1843
+ "rewards/format_reward": 0.0,
1844
+ "step": 131
1845
+ },
1846
+ {
1847
+ "clip_ratio": 0.0,
1848
+ "completion_length": 728.0178909301758,
1849
+ "epoch": 1.3447401774397973,
1850
+ "grad_norm": 0.14096473157405853,
1851
+ "learning_rate": 2.922924934990568e-07,
1852
+ "loss": 0.0421,
1853
+ "num_tokens": 107409144.0,
1854
+ "reward": 1.2679341062903404,
1855
+ "reward_std": 0.4113157168030739,
1856
+ "rewards/cosine_scaled_reward": 1.2679340541362762,
1857
+ "rewards/format_reward": 0.0,
1858
+ "step": 132
1859
+ },
1860
+ {
1861
+ "clip_ratio": 0.0,
1862
+ "completion_length": 708.2924423217773,
1863
+ "epoch": 1.3548795944233207,
1864
+ "grad_norm": 0.13527466356754303,
1865
+ "learning_rate": 2.8420756714519954e-07,
1866
+ "loss": 0.0099,
1867
+ "num_tokens": 108188078.0,
1868
+ "reward": 1.2067532986402512,
1869
+ "reward_std": 0.4394964314997196,
1870
+ "rewards/cosine_scaled_reward": 1.2067532688379288,
1871
+ "rewards/format_reward": 0.0,
1872
+ "step": 133
1873
+ },
1874
+ {
1875
+ "clip_ratio": 0.0,
1876
+ "completion_length": 745.264533996582,
1877
+ "epoch": 1.3650190114068441,
1878
+ "grad_norm": 0.12941937148571014,
1879
+ "learning_rate": 2.7619139496864376e-07,
1880
+ "loss": -0.0117,
1881
+ "num_tokens": 108991819.0,
1882
+ "reward": 1.1342971697449684,
1883
+ "reward_std": 0.35589409433305264,
1884
+ "rewards/cosine_scaled_reward": 1.1342971250414848,
1885
+ "rewards/format_reward": 0.0,
1886
+ "step": 134
1887
+ },
1888
+ {
1889
+ "clip_ratio": 0.0,
1890
+ "completion_length": 690.6328506469727,
1891
+ "epoch": 1.3751584283903675,
1892
+ "grad_norm": 0.1342969536781311,
1893
+ "learning_rate": 2.6824653102244727e-07,
1894
+ "loss": 0.0057,
1895
+ "num_tokens": 109744698.0,
1896
+ "reward": 1.20839624106884,
1897
+ "reward_std": 0.44357743114233017,
1898
+ "rewards/cosine_scaled_reward": 1.208396166563034,
1899
+ "rewards/format_reward": 0.0,
1900
+ "step": 135
1901
+ },
1902
+ {
1903
+ "clip_ratio": 0.0,
1904
+ "completion_length": 739.3605270385742,
1905
+ "epoch": 1.385297845373891,
1906
+ "grad_norm": 0.22325177490711212,
1907
+ "learning_rate": 2.603755066399718e-07,
1908
+ "loss": 0.0066,
1909
+ "num_tokens": 110537245.0,
1910
+ "reward": 1.1991283893585205,
1911
+ "reward_std": 0.4422716274857521,
1912
+ "rewards/cosine_scaled_reward": 1.1991283744573593,
1913
+ "rewards/format_reward": 0.0,
1914
+ "step": 136
1915
+ },
1916
+ {
1917
+ "clip_ratio": 0.0,
1918
+ "completion_length": 771.5045013427734,
1919
+ "epoch": 1.3954372623574145,
1920
+ "grad_norm": 0.131967693567276,
1921
+ "learning_rate": 2.5258082962836614e-07,
1922
+ "loss": -0.0609,
1923
+ "num_tokens": 111363073.0,
1924
+ "reward": 1.0822726637125015,
1925
+ "reward_std": 0.31478939671069384,
1926
+ "rewards/cosine_scaled_reward": 1.082272619009018,
1927
+ "rewards/format_reward": 0.0,
1928
+ "step": 137
1929
+ },
1930
+ {
1931
+ "clip_ratio": 0.0,
1932
+ "completion_length": 745.6908798217773,
1933
+ "epoch": 1.4055766793409379,
1934
+ "grad_norm": 0.13431350886821747,
1935
+ "learning_rate": 2.4486498346955023e-07,
1936
+ "loss": 0.0393,
1937
+ "num_tokens": 112154044.0,
1938
+ "reward": 1.2083490192890167,
1939
+ "reward_std": 0.4234139323234558,
1940
+ "rewards/cosine_scaled_reward": 1.2083489745855331,
1941
+ "rewards/format_reward": 0.0,
1942
+ "step": 138
1943
+ },
1944
+ {
1945
+ "clip_ratio": 0.0,
1946
+ "completion_length": 730.8303909301758,
1947
+ "epoch": 1.4157160963244613,
1948
+ "grad_norm": 0.37255144119262695,
1949
+ "learning_rate": 2.372304265289436e-07,
1950
+ "loss": 0.0129,
1951
+ "num_tokens": 112933164.0,
1952
+ "reward": 1.14224873483181,
1953
+ "reward_std": 0.47245585173368454,
1954
+ "rewards/cosine_scaled_reward": 1.142248660326004,
1955
+ "rewards/format_reward": 0.0,
1956
+ "step": 139
1957
+ },
1958
+ {
1959
+ "clip_ratio": 0.0,
1960
+ "completion_length": 723.6975708007812,
1961
+ "epoch": 1.4258555133079849,
1962
+ "grad_norm": 0.13437899947166443,
1963
+ "learning_rate": 2.2967959127220137e-07,
1964
+ "loss": 0.0149,
1965
+ "num_tokens": 113703557.0,
1966
+ "reward": 1.2409879192709923,
1967
+ "reward_std": 0.39572571590542793,
1968
+ "rewards/cosine_scaled_reward": 1.240987867116928,
1969
+ "rewards/format_reward": 0.0,
1970
+ "step": 140
1971
+ },
1972
+ {
1973
+ "clip_ratio": 0.0,
1974
+ "completion_length": 715.0223617553711,
1975
+ "epoch": 1.4359949302915083,
1976
+ "grad_norm": 0.5516566634178162,
1977
+ "learning_rate": 2.2221488349019902e-07,
1978
+ "loss": -0.0049,
1979
+ "num_tokens": 114492577.0,
1980
+ "reward": 1.1441184803843498,
1981
+ "reward_std": 0.4257240351289511,
1982
+ "rewards/cosine_scaled_reward": 1.1441184356808662,
1983
+ "rewards/format_reward": 0.0,
1984
+ "step": 141
1985
+ },
1986
+ {
1987
+ "clip_ratio": 0.0,
1988
+ "completion_length": 683.0033798217773,
1989
+ "epoch": 1.4461343472750317,
1990
+ "grad_norm": 0.135258749127388,
1991
+ "learning_rate": 2.1483868153251788e-07,
1992
+ "loss": 0.0098,
1993
+ "num_tokens": 115236092.0,
1994
+ "reward": 1.2539073526859283,
1995
+ "reward_std": 0.42142050713300705,
1996
+ "rewards/cosine_scaled_reward": 1.2539073079824448,
1997
+ "rewards/format_reward": 0.0,
1998
+ "step": 142
1999
+ },
2000
+ {
2001
+ "clip_ratio": 0.0,
2002
+ "completion_length": 783.0201263427734,
2003
+ "epoch": 1.456273764258555,
2004
+ "grad_norm": 0.17022480070590973,
2005
+ "learning_rate": 2.0755333554967346e-07,
2006
+ "loss": -0.0005,
2007
+ "num_tokens": 116074878.0,
2008
+ "reward": 1.0983483716845512,
2009
+ "reward_std": 0.42650508880615234,
2010
+ "rewards/cosine_scaled_reward": 1.0983483269810677,
2011
+ "rewards/format_reward": 0.0,
2012
+ "step": 143
2013
+ },
2014
+ {
2015
+ "clip_ratio": 0.0,
2016
+ "completion_length": 774.2790603637695,
2017
+ "epoch": 1.4664131812420786,
2018
+ "grad_norm": 0.1283206343650818,
2019
+ "learning_rate": 2.0036116674432652e-07,
2020
+ "loss": -0.0156,
2021
+ "num_tokens": 116905080.0,
2022
+ "reward": 1.1725050881505013,
2023
+ "reward_std": 0.35209081321954727,
2024
+ "rewards/cosine_scaled_reward": 1.172505035996437,
2025
+ "rewards/format_reward": 0.0,
2026
+ "step": 144
2027
+ },
2028
+ {
2029
+ "clip_ratio": 0.0,
2030
+ "completion_length": 658.7600746154785,
2031
+ "epoch": 1.476552598225602,
2032
+ "grad_norm": 0.14014765620231628,
2033
+ "learning_rate": 1.9326446663172035e-07,
2034
+ "loss": 0.0474,
2035
+ "num_tokens": 117626809.0,
2036
+ "reward": 1.341113954782486,
2037
+ "reward_std": 0.4142299462109804,
2038
+ "rewards/cosine_scaled_reward": 1.3411139100790024,
2039
+ "rewards/format_reward": 0.0,
2040
+ "step": 145
2041
+ },
2042
+ {
2043
+ "clip_ratio": 0.0,
2044
+ "completion_length": 679.3192367553711,
2045
+ "epoch": 1.4866920152091254,
2046
+ "grad_norm": 0.15331801772117615,
2047
+ "learning_rate": 1.8626549630957395e-07,
2048
+ "loss": 0.0136,
2049
+ "num_tokens": 118363623.0,
2050
+ "reward": 1.19945827126503,
2051
+ "reward_std": 0.4244145043194294,
2052
+ "rewards/cosine_scaled_reward": 1.1994582116603851,
2053
+ "rewards/format_reward": 0.0,
2054
+ "step": 146
2055
+ },
2056
+ {
2057
+ "clip_ratio": 0.0,
2058
+ "completion_length": 752.2890930175781,
2059
+ "epoch": 1.496831432192649,
2060
+ "grad_norm": 0.13101238012313843,
2061
+ "learning_rate": 1.7936648573766954e-07,
2062
+ "loss": -0.0056,
2063
+ "num_tokens": 119174386.0,
2064
+ "reward": 1.2080383449792862,
2065
+ "reward_std": 0.4013860188424587,
2066
+ "rewards/cosine_scaled_reward": 1.2080383002758026,
2067
+ "rewards/format_reward": 0.0,
2068
+ "step": 147
2069
+ },
2070
+ {
2071
+ "clip_ratio": 0.0,
2072
+ "completion_length": 762.8594131469727,
2073
+ "epoch": 1.5069708491761724,
2074
+ "grad_norm": 0.12902143597602844,
2075
+ "learning_rate": 1.725696330273575e-07,
2076
+ "loss": -0.0383,
2077
+ "num_tokens": 119988252.0,
2078
+ "reward": 1.169223740696907,
2079
+ "reward_std": 0.39561279490590096,
2080
+ "rewards/cosine_scaled_reward": 1.169223703444004,
2081
+ "rewards/format_reward": 0.0,
2082
+ "step": 148
2083
+ },
2084
+ {
2085
+ "clip_ratio": 0.0,
2086
+ "completion_length": 700.4531478881836,
2087
+ "epoch": 1.5171102661596958,
2088
+ "grad_norm": 0.1390618234872818,
2089
+ "learning_rate": 1.65877103741212e-07,
2090
+ "loss": 0.0519,
2091
+ "num_tokens": 120740586.0,
2092
+ "reward": 1.37008798122406,
2093
+ "reward_std": 0.3735492154955864,
2094
+ "rewards/cosine_scaled_reward": 1.3700879365205765,
2095
+ "rewards/format_reward": 0.0,
2096
+ "step": 149
2097
+ },
2098
+ {
2099
+ "clip_ratio": 0.0,
2100
+ "completion_length": 706.7366485595703,
2101
+ "epoch": 1.5272496831432192,
2102
+ "grad_norm": 0.13693000376224518,
2103
+ "learning_rate": 1.592910302030544e-07,
2104
+ "loss": 0.0186,
2105
+ "num_tokens": 121493126.0,
2106
+ "reward": 1.1771222800016403,
2107
+ "reward_std": 0.4693867526948452,
2108
+ "rewards/cosine_scaled_reward": 1.1771221980452538,
2109
+ "rewards/format_reward": 0.0,
2110
+ "step": 150
2111
+ },
2112
+ {
2113
+ "clip_ratio": 0.0,
2114
+ "completion_length": 725.279052734375,
2115
+ "epoch": 1.5373891001267426,
2116
+ "grad_norm": 0.16313283145427704,
2117
+ "learning_rate": 1.5281351081856976e-07,
2118
+ "loss": 0.0023,
2119
+ "num_tokens": 122274232.0,
2120
+ "reward": 1.203133448958397,
2121
+ "reward_std": 0.41216023825109005,
2122
+ "rewards/cosine_scaled_reward": 1.2031333893537521,
2123
+ "rewards/format_reward": 0.0,
2124
+ "step": 151
2125
+ },
2126
+ {
2127
+ "clip_ratio": 0.0,
2128
+ "completion_length": 703.8917770385742,
2129
+ "epoch": 1.5475285171102662,
2130
+ "grad_norm": 0.16185523569583893,
2131
+ "learning_rate": 1.4644660940672627e-07,
2132
+ "loss": -0.0126,
2133
+ "num_tokens": 123030511.0,
2134
+ "reward": 1.2789395153522491,
2135
+ "reward_std": 0.382132101804018,
2136
+ "rewards/cosine_scaled_reward": 1.2789394706487656,
2137
+ "rewards/format_reward": 0.0,
2138
+ "step": 152
2139
+ },
2140
+ {
2141
+ "clip_ratio": 0.0,
2142
+ "completion_length": 690.9888610839844,
2143
+ "epoch": 1.5576679340937896,
2144
+ "grad_norm": 0.6896323561668396,
2145
+ "learning_rate": 1.4019235454221856e-07,
2146
+ "loss": -0.0251,
2147
+ "num_tokens": 123783709.0,
2148
+ "reward": 1.230302333831787,
2149
+ "reward_std": 0.3682190030813217,
2150
+ "rewards/cosine_scaled_reward": 1.2303023040294647,
2151
+ "rewards/format_reward": 0.0,
2152
+ "step": 153
2153
+ },
2154
+ {
2155
+ "clip_ratio": 0.0,
2156
+ "completion_length": 702.8527069091797,
2157
+ "epoch": 1.5678073510773132,
2158
+ "grad_norm": 0.14129234850406647,
2159
+ "learning_rate": 1.3405273890913737e-07,
2160
+ "loss": 0.0005,
2161
+ "num_tokens": 124535481.0,
2162
+ "reward": 1.2051593363285065,
2163
+ "reward_std": 0.4200758896768093,
2164
+ "rewards/cosine_scaled_reward": 1.205159269273281,
2165
+ "rewards/format_reward": 0.0,
2166
+ "step": 154
2167
+ },
2168
+ {
2169
+ "clip_ratio": 0.0,
2170
+ "completion_length": 710.2265853881836,
2171
+ "epoch": 1.5779467680608366,
2172
+ "grad_norm": 0.1381485015153885,
2173
+ "learning_rate": 1.280297186660752e-07,
2174
+ "loss": 0.0181,
2175
+ "num_tokens": 125290220.0,
2176
+ "reward": 1.2739061638712883,
2177
+ "reward_std": 0.4085957519710064,
2178
+ "rewards/cosine_scaled_reward": 1.2739060744643211,
2179
+ "rewards/format_reward": 0.0,
2180
+ "step": 155
2181
+ },
2182
+ {
2183
+ "clip_ratio": 0.0,
2184
+ "completion_length": 808.0145492553711,
2185
+ "epoch": 1.58808618504436,
2186
+ "grad_norm": 0.13849686086177826,
2187
+ "learning_rate": 1.2212521282287093e-07,
2188
+ "loss": -0.005,
2189
+ "num_tokens": 126134129.0,
2190
+ "reward": 1.175368033349514,
2191
+ "reward_std": 0.3809916824102402,
2192
+ "rewards/cosine_scaled_reward": 1.1753679811954498,
2193
+ "rewards/format_reward": 0.0,
2194
+ "step": 156
2195
+ },
2196
+ {
2197
+ "clip_ratio": 0.0,
2198
+ "completion_length": 754.8638763427734,
2199
+ "epoch": 1.5982256020278833,
2200
+ "grad_norm": 0.14109836518764496,
2201
+ "learning_rate": 1.1634110262918717e-07,
2202
+ "loss": -0.0008,
2203
+ "num_tokens": 126954495.0,
2204
+ "reward": 1.2158635556697845,
2205
+ "reward_std": 0.34099204279482365,
2206
+ "rewards/cosine_scaled_reward": 1.2158635258674622,
2207
+ "rewards/format_reward": 0.0,
2208
+ "step": 157
2209
+ },
2210
+ {
2211
+ "clip_ratio": 0.0,
2212
+ "completion_length": 775.9598541259766,
2213
+ "epoch": 1.6083650190114067,
2214
+ "grad_norm": 0.12850748002529144,
2215
+ "learning_rate": 1.1067923097512255e-07,
2216
+ "loss": -0.0005,
2217
+ "num_tokens": 127780467.0,
2218
+ "reward": 1.1476298496127129,
2219
+ "reward_std": 0.4081256799399853,
2220
+ "rewards/cosine_scaled_reward": 1.1476298123598099,
2221
+ "rewards/format_reward": 0.0,
2222
+ "step": 158
2223
+ },
2224
+ {
2225
+ "clip_ratio": 0.0,
2226
+ "completion_length": 753.6797256469727,
2227
+ "epoch": 1.6185044359949303,
2228
+ "grad_norm": 0.36824139952659607,
2229
+ "learning_rate": 1.0514140180404202e-07,
2230
+ "loss": 0.0054,
2231
+ "num_tokens": 128584820.0,
2232
+ "reward": 1.0943407788872719,
2233
+ "reward_std": 0.41071633249521255,
2234
+ "rewards/cosine_scaled_reward": 1.0943407267332077,
2235
+ "rewards/format_reward": 0.0,
2236
+ "step": 159
2237
+ },
2238
+ {
2239
+ "clip_ratio": 0.0,
2240
+ "completion_length": 821.4553909301758,
2241
+ "epoch": 1.6286438529784537,
2242
+ "grad_norm": 0.15713122487068176,
2243
+ "learning_rate": 9.972937953781984e-08,
2244
+ "loss": 0.0185,
2245
+ "num_tokens": 129467812.0,
2246
+ "reward": 1.086461715400219,
2247
+ "reward_std": 0.4641183950006962,
2248
+ "rewards/cosine_scaled_reward": 1.0864616632461548,
2249
+ "rewards/format_reward": 0.0,
2250
+ "step": 160
2251
+ },
2252
+ {
2253
+ "clip_ratio": 0.0,
2254
+ "completion_length": 705.2288208007812,
2255
+ "epoch": 1.6387832699619773,
2256
+ "grad_norm": 0.13201545178890228,
2257
+ "learning_rate": 9.444488851467041e-08,
2258
+ "loss": 0.0098,
2259
+ "num_tokens": 130217369.0,
2260
+ "reward": 1.2702905237674713,
2261
+ "reward_std": 0.3846561126410961,
2262
+ "rewards/cosine_scaled_reward": 1.2702905088663101,
2263
+ "rewards/format_reward": 0.0,
2264
+ "step": 161
2265
+ },
2266
+ {
2267
+ "clip_ratio": 0.0,
2268
+ "completion_length": 732.4542770385742,
2269
+ "epoch": 1.6489226869455007,
2270
+ "grad_norm": 0.12551482021808624,
2271
+ "learning_rate": 8.928961243975436e-08,
2272
+ "loss": 0.0025,
2273
+ "num_tokens": 131006504.0,
2274
+ "reward": 1.1392531916499138,
2275
+ "reward_std": 0.4202544465661049,
2276
+ "rewards/cosine_scaled_reward": 1.150568701326847,
2277
+ "rewards/format_reward": 0.0,
2278
+ "step": 162
2279
+ },
2280
+ {
2281
+ "clip_ratio": 0.0,
2282
+ "completion_length": 690.4821739196777,
2283
+ "epoch": 1.659062103929024,
2284
+ "grad_norm": 0.14846356213092804,
2285
+ "learning_rate": 8.426519384872732e-08,
2286
+ "loss": 0.0151,
2287
+ "num_tokens": 131747312.0,
2288
+ "reward": 1.2898453325033188,
2289
+ "reward_std": 0.3518759645521641,
2290
+ "rewards/cosine_scaled_reward": 1.2898452877998352,
2291
+ "rewards/format_reward": 0.0,
2292
+ "step": 163
2293
+ },
2294
+ {
2295
+ "clip_ratio": 0.0,
2296
+ "completion_length": 749.0937881469727,
2297
+ "epoch": 1.6692015209125475,
2298
+ "grad_norm": 0.13933837413787842,
2299
+ "learning_rate": 7.937323358440934e-08,
2300
+ "loss": 0.028,
2301
+ "num_tokens": 132549532.0,
2302
+ "reward": 1.1977392584085464,
2303
+ "reward_std": 0.43290739692747593,
2304
+ "rewards/cosine_scaled_reward": 1.1977391764521599,
2305
+ "rewards/format_reward": 0.0,
2306
+ "step": 164
2307
+ },
2308
+ {
2309
+ "clip_ratio": 0.0,
2310
+ "completion_length": 750.6361999511719,
2311
+ "epoch": 1.6793409378960709,
2312
+ "grad_norm": 0.1263691633939743,
2313
+ "learning_rate": 7.461529028673463e-08,
2314
+ "loss": 0.0414,
2315
+ "num_tokens": 133354910.0,
2316
+ "reward": 1.2933627367019653,
2317
+ "reward_std": 0.41812827065587044,
2318
+ "rewards/cosine_scaled_reward": 1.2933626621961594,
2319
+ "rewards/format_reward": 0.0,
2320
+ "step": 165
2321
+ },
2322
+ {
2323
+ "clip_ratio": 0.0,
2324
+ "completion_length": 748.4531555175781,
2325
+ "epoch": 1.6894803548795945,
2326
+ "grad_norm": 0.1337745487689972,
2327
+ "learning_rate": 6.999287989614971e-08,
2328
+ "loss": 0.0003,
2329
+ "num_tokens": 134157964.0,
2330
+ "reward": 1.1261718794703484,
2331
+ "reward_std": 0.43879349157214165,
2332
+ "rewards/cosine_scaled_reward": 1.1261718571186066,
2333
+ "rewards/format_reward": 0.0,
2334
+ "step": 166
2335
+ },
2336
+ {
2337
+ "clip_ratio": 0.0,
2338
+ "completion_length": 684.155158996582,
2339
+ "epoch": 1.6996197718631179,
2340
+ "grad_norm": 0.14675858616828918,
2341
+ "learning_rate": 6.550747517061656e-08,
2342
+ "loss": 0.0135,
2343
+ "num_tokens": 134897511.0,
2344
+ "reward": 1.31067955493927,
2345
+ "reward_std": 0.4250149428844452,
2346
+ "rewards/cosine_scaled_reward": 1.3106794953346252,
2347
+ "rewards/format_reward": 0.0,
2348
+ "step": 167
2349
+ },
2350
+ {
2351
+ "clip_ratio": 0.0,
2352
+ "completion_length": 708.9955825805664,
2353
+ "epoch": 1.7097591888466415,
2354
+ "grad_norm": 0.1361108422279358,
2355
+ "learning_rate": 6.116050521637218e-08,
2356
+ "loss": 0.0403,
2357
+ "num_tokens": 135661099.0,
2358
+ "reward": 1.2582149505615234,
2359
+ "reward_std": 0.4354391284286976,
2360
+ "rewards/cosine_scaled_reward": 1.2582148760557175,
2361
+ "rewards/format_reward": 0.0,
2362
+ "step": 168
2363
+ },
2364
+ {
2365
+ "clip_ratio": 0.0,
2366
+ "completion_length": 708.2154312133789,
2367
+ "epoch": 1.7198986058301649,
2368
+ "grad_norm": 0.13875119388103485,
2369
+ "learning_rate": 5.6953355032598795e-08,
2370
+ "loss": -0.0346,
2371
+ "num_tokens": 136432732.0,
2372
+ "reward": 1.1267788708209991,
2373
+ "reward_std": 0.43002479895949364,
2374
+ "rewards/cosine_scaled_reward": 1.126778818666935,
2375
+ "rewards/format_reward": 0.0,
2376
+ "step": 169
2377
+ },
2378
+ {
2379
+ "clip_ratio": 0.0,
2380
+ "completion_length": 729.7221374511719,
2381
+ "epoch": 1.7300380228136882,
2382
+ "grad_norm": 0.136752188205719,
2383
+ "learning_rate": 5.288736507014435e-08,
2384
+ "loss": -0.0081,
2385
+ "num_tokens": 137210563.0,
2386
+ "reward": 1.251162275671959,
2387
+ "reward_std": 0.36924222111701965,
2388
+ "rewards/cosine_scaled_reward": 1.2511622309684753,
2389
+ "rewards/format_reward": 0.0,
2390
+ "step": 170
2391
+ },
2392
+ {
2393
+ "clip_ratio": 0.0,
2394
+ "completion_length": 707.0078430175781,
2395
+ "epoch": 1.7401774397972116,
2396
+ "grad_norm": 0.132461279630661,
2397
+ "learning_rate": 4.896383080443933e-08,
2398
+ "loss": 0.0231,
2399
+ "num_tokens": 137968794.0,
2400
+ "reward": 1.2026441097259521,
2401
+ "reward_std": 0.42860128730535507,
2402
+ "rewards/cosine_scaled_reward": 1.202644057571888,
2403
+ "rewards/format_reward": 0.0,
2404
+ "step": 171
2405
+ },
2406
+ {
2407
+ "clip_ratio": 0.0,
2408
+ "completion_length": 690.8460159301758,
2409
+ "epoch": 1.750316856780735,
2410
+ "grad_norm": 0.14912405610084534,
2411
+ "learning_rate": 4.518400232274078e-08,
2412
+ "loss": 0.0096,
2413
+ "num_tokens": 138718296.0,
2414
+ "reward": 1.188114494085312,
2415
+ "reward_std": 0.39624364115297794,
2416
+ "rewards/cosine_scaled_reward": 1.2003108784556389,
2417
+ "rewards/format_reward": 0.0,
2418
+ "step": 172
2419
+ },
2420
+ {
2421
+ "clip_ratio": 0.0,
2422
+ "completion_length": 684.9765930175781,
2423
+ "epoch": 1.7604562737642584,
2424
+ "grad_norm": 0.14246663451194763,
2425
+ "learning_rate": 4.1549083925840165e-08,
2426
+ "loss": 0.0125,
2427
+ "num_tokens": 139449483.0,
2428
+ "reward": 1.2953269332647324,
2429
+ "reward_std": 0.4603438973426819,
2430
+ "rewards/cosine_scaled_reward": 1.29532690346241,
2431
+ "rewards/format_reward": 0.0,
2432
+ "step": 173
2433
+ },
2434
+ {
2435
+ "clip_ratio": 0.0,
2436
+ "completion_length": 685.9721298217773,
2437
+ "epoch": 1.770595690747782,
2438
+ "grad_norm": 0.1809549778699875,
2439
+ "learning_rate": 3.806023374435663e-08,
2440
+ "loss": -0.0066,
2441
+ "num_tokens": 140192642.0,
2442
+ "reward": 1.18511962890625,
2443
+ "reward_std": 0.365385384298861,
2444
+ "rewards/cosine_scaled_reward": 1.1851196065545082,
2445
+ "rewards/format_reward": 0.0,
2446
+ "step": 174
2447
+ },
2448
+ {
2449
+ "clip_ratio": 0.0,
2450
+ "completion_length": 718.2578353881836,
2451
+ "epoch": 1.7807351077313056,
2452
+ "grad_norm": 2.9148614406585693,
2453
+ "learning_rate": 3.4718563369743213e-08,
2454
+ "loss": 0.0077,
2455
+ "num_tokens": 140969529.0,
2456
+ "reward": 1.1897177621722221,
2457
+ "reward_std": 0.44410305470228195,
2458
+ "rewards/cosine_scaled_reward": 1.1897177323698997,
2459
+ "rewards/format_reward": 0.0,
2460
+ "step": 175
2461
+ },
2462
+ {
2463
+ "clip_ratio": 0.0,
2464
+ "completion_length": 672.2578392028809,
2465
+ "epoch": 1.790874524714829,
2466
+ "grad_norm": 0.14049698412418365,
2467
+ "learning_rate": 3.15251375001192e-08,
2468
+ "loss": 0.0167,
2469
+ "num_tokens": 141711976.0,
2470
+ "reward": 1.2624182030558586,
2471
+ "reward_std": 0.45537084713578224,
2472
+ "rewards/cosine_scaled_reward": 1.2624181509017944,
2473
+ "rewards/format_reward": 0.0,
2474
+ "step": 176
2475
+ },
2476
+ {
2477
+ "clip_ratio": 0.0,
2478
+ "completion_length": 697.0145378112793,
2479
+ "epoch": 1.8010139416983524,
2480
+ "grad_norm": 0.22702006995677948,
2481
+ "learning_rate": 2.8480973601043955e-08,
2482
+ "loss": -0.01,
2483
+ "num_tokens": 142471309.0,
2484
+ "reward": 1.249974675476551,
2485
+ "reward_std": 0.3576275184750557,
2486
+ "rewards/cosine_scaled_reward": 1.2499746307730675,
2487
+ "rewards/format_reward": 0.0,
2488
+ "step": 177
2489
+ },
2490
+ {
2491
+ "clip_ratio": 0.0,
2492
+ "completion_length": 744.4810409545898,
2493
+ "epoch": 1.8111533586818758,
2494
+ "grad_norm": 0.12813909351825714,
2495
+ "learning_rate": 2.558704158134023e-08,
2496
+ "loss": -0.0078,
2497
+ "num_tokens": 143278388.0,
2498
+ "reward": 1.1319350376725197,
2499
+ "reward_std": 0.41413314267992973,
2500
+ "rewards/cosine_scaled_reward": 1.1319349706172943,
2501
+ "rewards/format_reward": 0.0,
2502
+ "step": 178
2503
+ },
2504
+ {
2505
+ "clip_ratio": 0.0,
2506
+ "completion_length": 750.3560638427734,
2507
+ "epoch": 1.8212927756653992,
2508
+ "grad_norm": 0.13594254851341248,
2509
+ "learning_rate": 2.2844263484068093e-08,
2510
+ "loss": 0.0003,
2511
+ "num_tokens": 144077707.0,
2512
+ "reward": 1.2112557888031006,
2513
+ "reward_std": 0.3645152598619461,
2514
+ "rewards/cosine_scaled_reward": 1.2112557291984558,
2515
+ "rewards/format_reward": 0.0,
2516
+ "step": 179
2517
+ },
2518
+ {
2519
+ "clip_ratio": 0.0,
2520
+ "completion_length": 682.1417694091797,
2521
+ "epoch": 1.8314321926489225,
2522
+ "grad_norm": 0.15148992836475372,
2523
+ "learning_rate": 2.025351319275137e-08,
2524
+ "loss": -0.0027,
2525
+ "num_tokens": 144819874.0,
2526
+ "reward": 1.2360253632068634,
2527
+ "reward_std": 0.43914780393242836,
2528
+ "rewards/cosine_scaled_reward": 1.2360252812504768,
2529
+ "rewards/format_reward": 0.0,
2530
+ "step": 180
2531
+ },
2532
+ {
2533
+ "clip_ratio": 0.0,
2534
+ "completion_length": 724.106071472168,
2535
+ "epoch": 1.8415716096324461,
2536
+ "grad_norm": 0.14325113594532013,
2537
+ "learning_rate": 1.781561615294652e-08,
2538
+ "loss": 0.0052,
2539
+ "num_tokens": 145603137.0,
2540
+ "reward": 1.226213201880455,
2541
+ "reward_std": 0.4393893778324127,
2542
+ "rewards/cosine_scaled_reward": 1.2262131571769714,
2543
+ "rewards/format_reward": 0.0,
2544
+ "step": 181
2545
+ },
2546
+ {
2547
+ "clip_ratio": 0.0,
2548
+ "completion_length": 782.7399749755859,
2549
+ "epoch": 1.8517110266159695,
2550
+ "grad_norm": 0.1272156685590744,
2551
+ "learning_rate": 1.553134910924636e-08,
2552
+ "loss": -0.0251,
2553
+ "num_tokens": 146440576.0,
2554
+ "reward": 1.0694795548915863,
2555
+ "reward_std": 0.3888841047883034,
2556
+ "rewards/cosine_scaled_reward": 1.0694795101881027,
2557
+ "rewards/format_reward": 0.0,
2558
+ "step": 182
2559
+ },
2560
+ {
2561
+ "clip_ratio": 0.0,
2562
+ "completion_length": 677.8895492553711,
2563
+ "epoch": 1.8618504435994931,
2564
+ "grad_norm": 0.14028111100196838,
2565
+ "learning_rate": 1.340143985779829e-08,
2566
+ "loss": 0.0087,
2567
+ "num_tokens": 147174725.0,
2568
+ "reward": 1.2450221478939056,
2569
+ "reward_std": 0.3928499836474657,
2570
+ "rewards/cosine_scaled_reward": 1.2450221329927444,
2571
+ "rewards/format_reward": 0.0,
2572
+ "step": 183
2573
+ },
2574
+ {
2575
+ "clip_ratio": 0.0,
2576
+ "completion_length": 698.4096298217773,
2577
+ "epoch": 1.8719898605830165,
2578
+ "grad_norm": 0.1386420577764511,
2579
+ "learning_rate": 1.1426567014420297e-08,
2580
+ "loss": -0.001,
2581
+ "num_tokens": 147927204.0,
2582
+ "reward": 1.2862959504127502,
2583
+ "reward_std": 0.39938454143702984,
2584
+ "rewards/cosine_scaled_reward": 1.2862958759069443,
2585
+ "rewards/format_reward": 0.0,
2586
+ "step": 184
2587
+ },
2588
+ {
2589
+ "clip_ratio": 0.0,
2590
+ "completion_length": 749.4877471923828,
2591
+ "epoch": 1.88212927756654,
2592
+ "grad_norm": 0.26094314455986023,
2593
+ "learning_rate": 9.607359798384784e-09,
2594
+ "loss": 0.0099,
2595
+ "num_tokens": 148714225.0,
2596
+ "reward": 1.15062565356493,
2597
+ "reward_std": 0.49165763705968857,
2598
+ "rewards/cosine_scaled_reward": 1.150625616312027,
2599
+ "rewards/format_reward": 0.0,
2600
+ "step": 185
2601
+ },
2602
+ {
2603
+ "clip_ratio": 0.0,
2604
+ "completion_length": 701.1071624755859,
2605
+ "epoch": 1.8922686945500633,
2606
+ "grad_norm": 0.14008425176143646,
2607
+ "learning_rate": 7.944397831941951e-09,
2608
+ "loss": 0.0034,
2609
+ "num_tokens": 149488913.0,
2610
+ "reward": 1.181966707110405,
2611
+ "reward_std": 0.4252682514488697,
2612
+ "rewards/cosine_scaled_reward": 1.181966632604599,
2613
+ "rewards/format_reward": 0.0,
2614
+ "step": 186
2615
+ },
2616
+ {
2617
+ "clip_ratio": 0.0,
2618
+ "completion_length": 779.9766082763672,
2619
+ "epoch": 1.9024081115335867,
2620
+ "grad_norm": 0.16207562386989594,
2621
+ "learning_rate": 6.438210955644452e-09,
2622
+ "loss": -0.0089,
2623
+ "num_tokens": 150330196.0,
2624
+ "reward": 1.0697861537337303,
2625
+ "reward_std": 0.3918624483048916,
2626
+ "rewards/cosine_scaled_reward": 1.0697861164808273,
2627
+ "rewards/format_reward": 0.0,
2628
+ "step": 187
2629
+ },
2630
+ {
2631
+ "clip_ratio": 0.0,
2632
+ "completion_length": 725.6529235839844,
2633
+ "epoch": 1.9125475285171103,
2634
+ "grad_norm": 0.1371779590845108,
2635
+ "learning_rate": 5.0892790595336575e-09,
2636
+ "loss": 0.0236,
2637
+ "num_tokens": 151117397.0,
2638
+ "reward": 1.2266732975840569,
2639
+ "reward_std": 0.4197930619120598,
2640
+ "rewards/cosine_scaled_reward": 1.226673237979412,
2641
+ "rewards/format_reward": 0.0,
2642
+ "step": 188
2643
+ },
2644
+ {
2645
+ "clip_ratio": 0.0,
2646
+ "completion_length": 735.1306228637695,
2647
+ "epoch": 1.9226869455006337,
2648
+ "grad_norm": 0.14734813570976257,
2649
+ "learning_rate": 3.898031930240797e-09,
2650
+ "loss": -0.013,
2651
+ "num_tokens": 151901074.0,
2652
+ "reward": 1.1564877182245255,
2653
+ "reward_std": 0.4156847130507231,
2654
+ "rewards/cosine_scaled_reward": 1.1564876809716225,
2655
+ "rewards/format_reward": 0.0,
2656
+ "step": 189
2657
+ },
2658
+ {
2659
+ "clip_ratio": 0.0,
2660
+ "completion_length": 774.4129943847656,
2661
+ "epoch": 1.9328263624841573,
2662
+ "grad_norm": 0.12307488918304443,
2663
+ "learning_rate": 2.8648491140513264e-09,
2664
+ "loss": -0.0197,
2665
+ "num_tokens": 152721396.0,
2666
+ "reward": 1.2314427196979523,
2667
+ "reward_std": 0.39738379418849945,
2668
+ "rewards/cosine_scaled_reward": 1.2314426600933075,
2669
+ "rewards/format_reward": 0.0,
2670
+ "step": 190
2671
+ },
2672
+ {
2673
+ "clip_ratio": 0.0,
2674
+ "completion_length": 639.6127433776855,
2675
+ "epoch": 1.9429657794676807,
2676
+ "grad_norm": 0.15224085748195648,
2677
+ "learning_rate": 1.9900597959770505e-09,
2678
+ "loss": 0.0385,
2679
+ "num_tokens": 153419617.0,
2680
+ "reward": 1.3687696307897568,
2681
+ "reward_std": 0.3240409158170223,
2682
+ "rewards/cosine_scaled_reward": 1.3687695413827896,
2683
+ "rewards/format_reward": 0.0,
2684
+ "step": 191
2685
+ },
2686
+ {
2687
+ "clip_ratio": 0.0,
2688
+ "completion_length": 727.8538208007812,
2689
+ "epoch": 1.953105196451204,
2690
+ "grad_norm": 0.12927082180976868,
2691
+ "learning_rate": 1.2739426948732424e-09,
2692
+ "loss": 0.002,
2693
+ "num_tokens": 154205742.0,
2694
+ "reward": 1.2560684382915497,
2695
+ "reward_std": 0.3877686746418476,
2696
+ "rewards/cosine_scaled_reward": 1.256068378686905,
2697
+ "rewards/format_reward": 0.0,
2698
+ "step": 192
2699
+ },
2700
+ {
2701
+ "clip_ratio": 0.0,
2702
+ "completion_length": 720.2232513427734,
2703
+ "epoch": 1.9632446134347274,
2704
+ "grad_norm": 0.5261313915252686,
2705
+ "learning_rate": 7.16725974635568e-10,
2706
+ "loss": 0.0203,
2707
+ "num_tokens": 154976902.0,
2708
+ "reward": 1.1852427944540977,
2709
+ "reward_std": 0.4709174670279026,
2710
+ "rewards/cosine_scaled_reward": 1.1852427273988724,
2711
+ "rewards/format_reward": 0.0,
2712
+ "step": 193
2713
+ },
2714
+ {
2715
+ "clip_ratio": 0.0,
2716
+ "completion_length": 744.1283721923828,
2717
+ "epoch": 1.9733840304182508,
2718
+ "grad_norm": 0.13184396922588348,
2719
+ "learning_rate": 3.185871715041255e-10,
2720
+ "loss": -0.0006,
2721
+ "num_tokens": 155776489.0,
2722
+ "reward": 1.153962902724743,
2723
+ "reward_std": 0.35575347393751144,
2724
+ "rewards/cosine_scaled_reward": 1.1539628580212593,
2725
+ "rewards/format_reward": 0.0,
2726
+ "step": 194
2727
+ },
2728
+ {
2729
+ "clip_ratio": 0.0,
2730
+ "completion_length": 681.7768096923828,
2731
+ "epoch": 1.9835234474017744,
2732
+ "grad_norm": 0.14284567534923553,
2733
+ "learning_rate": 7.96531374983589e-11,
2734
+ "loss": 0.0074,
2735
+ "num_tokens": 156507841.0,
2736
+ "reward": 1.263557255268097,
2737
+ "reward_std": 0.42133441381156445,
2738
+ "rewards/cosine_scaled_reward": 1.263557218015194,
2739
+ "rewards/format_reward": 0.0,
2740
+ "step": 195
2741
+ },
2742
+ {
2743
+ "clip_ratio": 0.0,
2744
+ "completion_length": 712.19921875,
2745
+ "epoch": 1.9936628643852978,
2746
+ "grad_norm": 0.12404794245958328,
2747
+ "learning_rate": 0.0,
2748
+ "loss": -0.0205,
2749
+ "num_tokens": 157287049.0,
2750
+ "reward": 1.1777000427246094,
2751
+ "reward_std": 0.4066210947930813,
2752
+ "rewards/cosine_scaled_reward": 1.1776999980211258,
2753
+ "rewards/format_reward": 0.0,
2754
+ "step": 196
2755
+ },
2756
+ {
2757
+ "epoch": 1.9936628643852978,
2758
+ "step": 196,
2759
+ "total_flos": 0.0,
2760
+ "train_loss": -0.009652293185873921,
2761
+ "train_runtime": 33755.1654,
2762
+ "train_samples_per_second": 0.654,
2763
+ "train_steps_per_second": 0.006
2764
+ }
2765
+ ],
2766
+ "logging_steps": 1,
2767
+ "max_steps": 196,
2768
+ "num_input_tokens_seen": 0,
2769
+ "num_train_epochs": 2,
2770
+ "save_steps": 500,
2771
+ "stateful_callbacks": {
2772
+ "TrainerControl": {
2773
+ "args": {
2774
+ "should_epoch_stop": false,
2775
+ "should_evaluate": false,
2776
+ "should_log": false,
2777
+ "should_save": true,
2778
+ "should_training_stop": true
2779
+ },
2780
+ "attributes": {}
2781
+ }
2782
+ },
2783
+ "total_flos": 0.0,
2784
+ "train_batch_size": 16,
2785
+ "trial_name": null,
2786
+ "trial_params": null
2787
+ }