nlee-208 committed on
Commit
95ef226
1 Parent(s): 4a61a79

Model save

Browse files
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: mistralai/Mistral-7B-v0.1
4
+ tags:
5
+ - trl
6
+ - kto
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: zephyr-7b-kto
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/nlee28/huggingface/runs/odmtxd9p)
17
+ # zephyr-7b-kto
18
+
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unspecified dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-07
39
+ - train_batch_size: 4
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 4
44
+ - gradient_accumulation_steps: 2
45
+ - total_train_batch_size: 32
46
+ - total_eval_batch_size: 32
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - num_epochs: 1
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - Transformers 4.42.4
58
+ - PyTorch 2.1.2+cu121
59
+ - Datasets 2.16.1
60
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.321955375749023,
5
+ "train_runtime": 31031.977,
6
+ "train_samples": 60917,
7
+ "train_samples_per_second": 1.963,
8
+ "train_steps_per_second": 0.061
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.42.4"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46f8b429e795066757438ee4f5e111f05d1dedd5a204e1768338157d806ac453
3
+ size 4943162336
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f5488369d7fb93a6995ce580fc61441f62b827c72815e00d96007d231690c51
3
+ size 4999819336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6636ec0ebc90538d14353926a784e25a86cbc48be9ada07792068ff299a0c7d6
3
+ size 4540516344
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 14483464192
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.norm.weight": "model-00003-of-00003.safetensors"
297
+ }
298
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.321955375749023,
5
+ "train_runtime": 31031.977,
6
+ "train_samples": 60917,
7
+ "train_samples_per_second": 1.963,
8
+ "train_steps_per_second": 0.061
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1904,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005252100840336135,
13
+ "grad_norm": 70.43905184347379,
14
+ "kl": 0.21706357598304749,
15
+ "learning_rate": 4.999659696812289e-07,
16
+ "logps/chosen": -305.59059320494185,
17
+ "logps/rejected": -267.9389252533784,
18
+ "loss": 0.5004,
19
+ "rewards/chosen": -0.09660225136335505,
20
+ "rewards/margins": 0.019717185921372296,
21
+ "rewards/rejected": -0.11631943728472735,
22
+ "step": 10
23
+ },
24
+ {
25
+ "epoch": 0.01050420168067227,
26
+ "grad_norm": 81.8070303563997,
27
+ "kl": 1.2398526668548584,
28
+ "learning_rate": 4.998638879894165e-07,
29
+ "logps/chosen": -279.9350071957237,
30
+ "logps/rejected": -239.82217261904762,
31
+ "loss": 0.4612,
32
+ "rewards/chosen": 0.36437626888877467,
33
+ "rewards/margins": 0.3501945868470615,
34
+ "rewards/rejected": 0.01418168204171317,
35
+ "step": 20
36
+ },
37
+ {
38
+ "epoch": 0.015756302521008403,
39
+ "grad_norm": 73.53961446495941,
40
+ "kl": 0.0,
41
+ "learning_rate": 4.996937827155428e-07,
42
+ "logps/chosen": -327.1117964181287,
43
+ "logps/rejected": -270.5936713506711,
44
+ "loss": 0.4185,
45
+ "rewards/chosen": -0.6132015652126737,
46
+ "rewards/margins": 1.3433878967603758,
47
+ "rewards/rejected": -1.9565894619730495,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 0.02100840336134454,
52
+ "grad_norm": 51.541845841038615,
53
+ "kl": 3.030362129211426,
54
+ "learning_rate": 4.994557001695013e-07,
55
+ "logps/chosen": -286.4127286585366,
56
+ "logps/rejected": -238.16353665865384,
57
+ "loss": 0.4045,
58
+ "rewards/chosen": 1.3511776807831555,
59
+ "rewards/margins": 1.1392201160624147,
60
+ "rewards/rejected": 0.21195756472074068,
61
+ "step": 40
62
+ },
63
+ {
64
+ "epoch": 0.026260504201680673,
65
+ "grad_norm": 83.73385866105204,
66
+ "kl": 0.0,
67
+ "learning_rate": 4.991497051674917e-07,
68
+ "logps/chosen": -295.27088246855345,
69
+ "logps/rejected": -278.80997670807454,
70
+ "loss": 0.3944,
71
+ "rewards/chosen": 0.2137240523812156,
72
+ "rewards/margins": 1.5743222196584865,
73
+ "rewards/rejected": -1.360598167277271,
74
+ "step": 50
75
+ },
76
+ {
77
+ "epoch": 0.031512605042016806,
78
+ "grad_norm": 57.616560952806196,
79
+ "kl": 0.0,
80
+ "learning_rate": 4.987758810143735e-07,
81
+ "logps/chosen": -325.4705078125,
82
+ "logps/rejected": -255.959326171875,
83
+ "loss": 0.4034,
84
+ "rewards/chosen": 1.1321415901184082,
85
+ "rewards/margins": 1.2807276964187622,
86
+ "rewards/rejected": -0.14858610630035402,
87
+ "step": 60
88
+ },
89
+ {
90
+ "epoch": 0.03676470588235294,
91
+ "grad_norm": 62.734358016783204,
92
+ "kl": 0.0,
93
+ "learning_rate": 4.983343294809874e-07,
94
+ "logps/chosen": -306.6684683866279,
95
+ "logps/rejected": -258.4501953125,
96
+ "loss": 0.3822,
97
+ "rewards/chosen": 1.1383993459302326,
98
+ "rewards/margins": 1.6607683698461164,
99
+ "rewards/rejected": -0.5223690239158837,
100
+ "step": 70
101
+ },
102
+ {
103
+ "epoch": 0.04201680672268908,
104
+ "grad_norm": 52.446040187552235,
105
+ "kl": 0.0,
106
+ "learning_rate": 4.978251707764491e-07,
107
+ "logps/chosen": -304.0558176100629,
108
+ "logps/rejected": -276.5557793090062,
109
+ "loss": 0.3593,
110
+ "rewards/chosen": 0.427140049964377,
111
+ "rewards/margins": 2.316681770716027,
112
+ "rewards/rejected": -1.8895417207516498,
113
+ "step": 80
114
+ },
115
+ {
116
+ "epoch": 0.04726890756302521,
117
+ "grad_norm": 51.1401597083277,
118
+ "kl": 0.0,
119
+ "learning_rate": 4.972485435154228e-07,
120
+ "logps/chosen": -284.92168090062114,
121
+ "logps/rejected": -289.27459217767296,
122
+ "loss": 0.3601,
123
+ "rewards/chosen": 1.139393302964868,
124
+ "rewards/margins": 2.435457571832792,
125
+ "rewards/rejected": -1.2960642688679245,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 0.052521008403361345,
130
+ "grad_norm": 44.236393372962524,
131
+ "kl": 0.0,
132
+ "learning_rate": 4.966046046803842e-07,
133
+ "logps/chosen": -312.7745820063694,
134
+ "logps/rejected": -278.74285851226995,
135
+ "loss": 0.3593,
136
+ "rewards/chosen": 0.3532914568664162,
137
+ "rewards/margins": 2.3767495515510406,
138
+ "rewards/rejected": -2.0234580946846243,
139
+ "step": 100
140
+ },
141
+ {
142
+ "epoch": 0.05777310924369748,
143
+ "grad_norm": 38.95152051376054,
144
+ "kl": 0.0,
145
+ "learning_rate": 4.958935295788841e-07,
146
+ "logps/chosen": -364.43832781456956,
147
+ "logps/rejected": -306.8252588757396,
148
+ "loss": 0.3823,
149
+ "rewards/chosen": -1.0973136851329677,
150
+ "rewards/margins": 2.0935938101286817,
151
+ "rewards/rejected": -3.1909074952616496,
152
+ "step": 110
153
+ },
154
+ {
155
+ "epoch": 0.06302521008403361,
156
+ "grad_norm": 45.75317629434847,
157
+ "kl": 0.0,
158
+ "learning_rate": 4.951155117958216e-07,
159
+ "logps/chosen": -304.1672453703704,
160
+ "logps/rejected": -288.4255340189873,
161
+ "loss": 0.3693,
162
+ "rewards/chosen": -0.012564458964783469,
163
+ "rewards/margins": 2.483852622843959,
164
+ "rewards/rejected": -2.496417081808742,
165
+ "step": 120
166
+ },
167
+ {
168
+ "epoch": 0.06827731092436974,
169
+ "grad_norm": 52.69834029914945,
170
+ "kl": 0.0,
171
+ "learning_rate": 4.942707631407419e-07,
172
+ "logps/chosen": -331.2020513523392,
173
+ "logps/rejected": -275.2519662332215,
174
+ "loss": 0.3931,
175
+ "rewards/chosen": 0.4922319378769189,
176
+ "rewards/margins": 1.7257208016235108,
177
+ "rewards/rejected": -1.233488863746592,
178
+ "step": 130
179
+ },
180
+ {
181
+ "epoch": 0.07352941176470588,
182
+ "grad_norm": 65.97698112794365,
183
+ "kl": 0.0,
184
+ "learning_rate": 4.933595135901732e-07,
185
+ "logps/chosen": -314.8364361702128,
186
+ "logps/rejected": -269.268156424581,
187
+ "loss": 0.3674,
188
+ "rewards/chosen": 0.04331512315898922,
189
+ "rewards/margins": 1.9166126173759934,
190
+ "rewards/rejected": -1.8732974942170042,
191
+ "step": 140
192
+ },
193
+ {
194
+ "epoch": 0.07878151260504201,
195
+ "grad_norm": 75.72251586801518,
196
+ "kl": 0.0,
197
+ "learning_rate": 4.923820112250169e-07,
198
+ "logps/chosen": -310.11692533557044,
199
+ "logps/rejected": -272.4901087353801,
200
+ "loss": 0.3438,
201
+ "rewards/chosen": -0.31625601749292154,
202
+ "rewards/margins": 2.9441549382819323,
203
+ "rewards/rejected": -3.2604109557748537,
204
+ "step": 150
205
+ },
206
+ {
207
+ "epoch": 0.08403361344537816,
208
+ "grad_norm": 51.266519524836305,
209
+ "kl": 0.0,
210
+ "learning_rate": 4.913385221630096e-07,
211
+ "logps/chosen": -264.80689858490564,
212
+ "logps/rejected": -284.40986510093165,
213
+ "loss": 0.3589,
214
+ "rewards/chosen": 0.2699741627435264,
215
+ "rewards/margins": 2.429860872395118,
216
+ "rewards/rejected": -2.1598867096515915,
217
+ "step": 160
218
+ },
219
+ {
220
+ "epoch": 0.08928571428571429,
221
+ "grad_norm": 69.780370721156,
222
+ "kl": 0.0,
223
+ "learning_rate": 4.902293304862749e-07,
224
+ "logps/chosen": -281.5589111328125,
225
+ "logps/rejected": -274.93525390625,
226
+ "loss": 0.361,
227
+ "rewards/chosen": 0.5642091274261475,
228
+ "rewards/margins": 2.2661412715911866,
229
+ "rewards/rejected": -1.701932144165039,
230
+ "step": 170
231
+ },
232
+ {
233
+ "epoch": 0.09453781512605042,
234
+ "grad_norm": 48.456143880910204,
235
+ "kl": 0.0,
236
+ "learning_rate": 4.890547381639833e-07,
237
+ "logps/chosen": -312.7041968368902,
238
+ "logps/rejected": -251.9474158653846,
239
+ "loss": 0.3564,
240
+ "rewards/chosen": -0.15778241506436977,
241
+ "rewards/margins": 2.705699155448451,
242
+ "rewards/rejected": -2.8634815705128207,
243
+ "step": 180
244
+ },
245
+ {
246
+ "epoch": 0.09978991596638656,
247
+ "grad_norm": 49.10681910714073,
248
+ "kl": 0.0,
249
+ "learning_rate": 4.878150649701439e-07,
250
+ "logps/chosen": -356.0400260416667,
251
+ "logps/rejected": -285.11312040441175,
252
+ "loss": 0.3345,
253
+ "rewards/chosen": -0.45484156290690103,
254
+ "rewards/margins": 3.4065278505811505,
255
+ "rewards/rejected": -3.8613694134880516,
256
+ "step": 190
257
+ },
258
+ {
259
+ "epoch": 0.10504201680672269,
260
+ "grad_norm": 42.17787499105965,
261
+ "kl": 0.0,
262
+ "learning_rate": 4.865106483965486e-07,
263
+ "logps/chosen": -297.6003605769231,
264
+ "logps/rejected": -252.6407440929878,
265
+ "loss": 0.3495,
266
+ "rewards/chosen": -0.08823707164862217,
267
+ "rewards/margins": 2.8701916601301507,
268
+ "rewards/rejected": -2.9584287317787727,
269
+ "step": 200
270
+ },
271
+ {
272
+ "epoch": 0.11029411764705882,
273
+ "grad_norm": 32.7456875390811,
274
+ "kl": 0.0,
275
+ "learning_rate": 4.851418435608919e-07,
276
+ "logps/chosen": -292.8467514124294,
277
+ "logps/rejected": -265.97642591783216,
278
+ "loss": 0.3655,
279
+ "rewards/chosen": 0.9715841692046258,
280
+ "rewards/margins": 2.2475568689901917,
281
+ "rewards/rejected": -1.275972699785566,
282
+ "step": 210
283
+ },
284
+ {
285
+ "epoch": 0.11554621848739496,
286
+ "grad_norm": 46.476558893396145,
287
+ "kl": 0.0,
288
+ "learning_rate": 4.837090231100927e-07,
289
+ "logps/chosen": -315.36924463757396,
290
+ "logps/rejected": -251.48538389900662,
291
+ "loss": 0.3217,
292
+ "rewards/chosen": 1.1581101276465422,
293
+ "rewards/margins": 2.9979615575668648,
294
+ "rewards/rejected": -1.8398514299203228,
295
+ "step": 220
296
+ },
297
+ {
298
+ "epoch": 0.1207983193277311,
299
+ "grad_norm": 58.692837949072945,
300
+ "kl": 0.0,
301
+ "learning_rate": 4.822125771188448e-07,
302
+ "logps/chosen": -306.7650669642857,
303
+ "logps/rejected": -301.322265625,
304
+ "loss": 0.3701,
305
+ "rewards/chosen": -0.7039821178882153,
306
+ "rewards/margins": 2.5890142082365966,
307
+ "rewards/rejected": -3.2929963261248116,
308
+ "step": 230
309
+ },
310
+ {
311
+ "epoch": 0.12605042016806722,
312
+ "grad_norm": 38.77396195360521,
313
+ "kl": 0.0,
314
+ "learning_rate": 4.806529129834207e-07,
315
+ "logps/chosen": -299.9251123715753,
316
+ "logps/rejected": -295.51858836206895,
317
+ "loss": 0.3211,
318
+ "rewards/chosen": -0.4328058843743311,
319
+ "rewards/margins": 3.400510611674519,
320
+ "rewards/rejected": -3.8333164960488504,
321
+ "step": 240
322
+ },
323
+ {
324
+ "epoch": 0.13130252100840337,
325
+ "grad_norm": 54.771749584890465,
326
+ "kl": 0.0,
327
+ "learning_rate": 4.790304553107622e-07,
328
+ "logps/chosen": -305.0079280695266,
329
+ "logps/rejected": -256.60885761589407,
330
+ "loss": 0.3722,
331
+ "rewards/chosen": 0.8469177787825906,
332
+ "rewards/margins": 2.0538868820589253,
333
+ "rewards/rejected": -1.2069691032763348,
334
+ "step": 250
335
+ },
336
+ {
337
+ "epoch": 0.13655462184873948,
338
+ "grad_norm": 56.498632681940265,
339
+ "kl": 0.0,
340
+ "learning_rate": 4.773456458028837e-07,
341
+ "logps/chosen": -278.4545183121019,
342
+ "logps/rejected": -264.5601514570552,
343
+ "loss": 0.3744,
344
+ "rewards/chosen": 1.3172516063520103,
345
+ "rewards/margins": 2.152952080254426,
346
+ "rewards/rejected": -0.8357004739024156,
347
+ "step": 260
348
+ },
349
+ {
350
+ "epoch": 0.14180672268907563,
351
+ "grad_norm": 56.9574995100581,
352
+ "kl": 0.0,
353
+ "learning_rate": 4.755989431366221e-07,
354
+ "logps/chosen": -304.06099759615387,
355
+ "logps/rejected": -293.3189310213415,
356
+ "loss": 0.3093,
357
+ "rewards/chosen": 1.1587672111315606,
358
+ "rewards/margins": 4.173915839180341,
359
+ "rewards/rejected": -3.0151486280487805,
360
+ "step": 270
361
+ },
362
+ {
363
+ "epoch": 0.14705882352941177,
364
+ "grad_norm": 63.035012985547,
365
+ "kl": 0.0,
366
+ "learning_rate": 4.737908228387656e-07,
367
+ "logps/chosen": -297.4018322172619,
368
+ "logps/rejected": -267.7578895970395,
369
+ "loss": 0.3581,
370
+ "rewards/chosen": 1.0978340875534784,
371
+ "rewards/margins": 2.6742270142213442,
372
+ "rewards/rejected": -1.576392926667866,
373
+ "step": 280
374
+ },
375
+ {
376
+ "epoch": 0.15231092436974789,
377
+ "grad_norm": 57.87624552889083,
378
+ "kl": 0.0,
379
+ "learning_rate": 4.7192177715659516e-07,
380
+ "logps/chosen": -293.31778630239523,
381
+ "logps/rejected": -253.0896905637255,
382
+ "loss": 0.3583,
383
+ "rewards/chosen": 0.7335890809932869,
384
+ "rewards/margins": 2.5885067194353826,
385
+ "rewards/rejected": -1.8549176384420956,
386
+ "step": 290
387
+ },
388
+ {
389
+ "epoch": 0.15756302521008403,
390
+ "grad_norm": 63.42262644789701,
391
+ "kl": 0.0,
392
+ "learning_rate": 4.699923149238736e-07,
393
+ "logps/chosen": -300.68055867805754,
394
+ "logps/rejected": -294.7720778660221,
395
+ "loss": 0.3416,
396
+ "rewards/chosen": -0.4132621820024449,
397
+ "rewards/margins": 3.4406516889618164,
398
+ "rewards/rejected": -3.853913870964261,
399
+ "step": 300
400
+ },
401
+ {
402
+ "epoch": 0.16281512605042017,
403
+ "grad_norm": 58.34203205576481,
404
+ "kl": 0.0,
405
+ "learning_rate": 4.680029614223198e-07,
406
+ "logps/chosen": -284.8943819665605,
407
+ "logps/rejected": -281.19528853527606,
408
+ "loss": 0.374,
409
+ "rewards/chosen": -0.6231775951992934,
410
+ "rewards/margins": 2.335680965000093,
411
+ "rewards/rejected": -2.9588585601993866,
412
+ "step": 310
413
+ },
414
+ {
415
+ "epoch": 0.16806722689075632,
416
+ "grad_norm": 45.38290295645773,
417
+ "kl": 0.0,
418
+ "learning_rate": 4.65954258238604e-07,
419
+ "logps/chosen": -297.1282980913174,
420
+ "logps/rejected": -263.37903390522877,
421
+ "loss": 0.3695,
422
+ "rewards/chosen": 0.311750309196061,
423
+ "rewards/margins": 2.5421297005359302,
424
+ "rewards/rejected": -2.230379391339869,
425
+ "step": 320
426
+ },
427
+ {
428
+ "epoch": 0.17331932773109243,
429
+ "grad_norm": 67.2098515597697,
430
+ "kl": 0.0,
431
+ "learning_rate": 4.638467631169056e-07,
432
+ "logps/chosen": -328.67982700892856,
433
+ "logps/rejected": -309.46533203125,
434
+ "loss": 0.3203,
435
+ "rewards/chosen": 0.9687714349655878,
436
+ "rewards/margins": 3.498681472357652,
437
+ "rewards/rejected": -2.529910037392064,
438
+ "step": 330
439
+ },
440
+ {
441
+ "epoch": 0.17857142857142858,
442
+ "grad_norm": 55.44522631657296,
443
+ "kl": 0.0,
444
+ "learning_rate": 4.6168104980707103e-07,
445
+ "logps/chosen": -285.8288395579268,
446
+ "logps/rejected": -275.2672526041667,
447
+ "loss": 0.3315,
448
+ "rewards/chosen": 0.7993925141125191,
449
+ "rewards/margins": 3.4423916251902433,
450
+ "rewards/rejected": -2.6429991110777245,
451
+ "step": 340
452
+ },
453
+ {
454
+ "epoch": 0.18382352941176472,
455
+ "grad_norm": 36.50055695974426,
456
+ "kl": 0.0,
457
+ "learning_rate": 4.594577079084145e-07,
458
+ "logps/chosen": -287.8186279296875,
459
+ "logps/rejected": -295.969091796875,
460
+ "loss": 0.3146,
461
+ "rewards/chosen": 1.0129197120666504,
462
+ "rewards/margins": 3.7097062110900882,
463
+ "rewards/rejected": -2.6967864990234376,
464
+ "step": 350
465
+ },
466
+ {
467
+ "epoch": 0.18907563025210083,
468
+ "grad_norm": 48.72297305247662,
469
+ "kl": 0.0,
470
+ "learning_rate": 4.5717734270920466e-07,
471
+ "logps/chosen": -280.1525594325153,
472
+ "logps/rejected": -232.07081011146497,
473
+ "loss": 0.351,
474
+ "rewards/chosen": 0.9978786538715011,
475
+ "rewards/margins": 2.654015719394491,
476
+ "rewards/rejected": -1.6561370655229897,
477
+ "step": 360
478
+ },
479
+ {
480
+ "epoch": 0.19432773109243698,
481
+ "grad_norm": 47.17739577906298,
482
+ "kl": 0.0,
483
+ "learning_rate": 4.548405750218785e-07,
484
+ "logps/chosen": -290.4170778508772,
485
+ "logps/rejected": -278.8102191694631,
486
+ "loss": 0.3535,
487
+ "rewards/chosen": 0.4387444390190972,
488
+ "rewards/margins": 3.595384203792063,
489
+ "rewards/rejected": -3.1566397647729656,
490
+ "step": 370
491
+ },
492
+ {
493
+ "epoch": 0.19957983193277312,
494
+ "grad_norm": 55.83663877353586,
495
+ "kl": 0.0,
496
+ "learning_rate": 4.5244804101403025e-07,
497
+ "logps/chosen": -262.20204133064516,
498
+ "logps/rejected": -258.4055634469697,
499
+ "loss": 0.3611,
500
+ "rewards/chosen": 0.3024998326455393,
501
+ "rewards/margins": 2.7435985024490432,
502
+ "rewards/rejected": -2.441098669803504,
503
+ "step": 380
504
+ },
505
+ {
506
+ "epoch": 0.20483193277310924,
507
+ "grad_norm": 53.266164401529096,
508
+ "kl": 0.0,
509
+ "learning_rate": 4.5000039203521976e-07,
510
+ "logps/chosen": -300.6577662417763,
511
+ "logps/rejected": -298.0972609747024,
512
+ "loss": 0.348,
513
+ "rewards/chosen": 1.187171132940995,
514
+ "rewards/margins": 3.403816691616125,
515
+ "rewards/rejected": -2.2166455586751304,
516
+ "step": 390
517
+ },
518
+ {
519
+ "epoch": 0.21008403361344538,
520
+ "grad_norm": 51.236491583825575,
521
+ "kl": 0.0,
522
+ "learning_rate": 4.47498294439647e-07,
523
+ "logps/chosen": -291.7605892319277,
524
+ "logps/rejected": -267.51894784902595,
525
+ "loss": 0.3199,
526
+ "rewards/chosen": 1.1110430797898625,
527
+ "rewards/margins": 3.199955449158775,
528
+ "rewards/rejected": -2.0889123693689124,
529
+ "step": 400
530
+ },
531
+ {
532
+ "epoch": 0.21533613445378152,
533
+ "grad_norm": 66.69622404112168,
534
+ "kl": 0.0,
535
+ "learning_rate": 4.449424294047419e-07,
536
+ "logps/chosen": -301.6258148006135,
537
+ "logps/rejected": -284.4103801751592,
538
+ "loss": 0.3232,
539
+ "rewards/chosen": 1.0998908668939322,
540
+ "rewards/margins": 3.629001137594569,
541
+ "rewards/rejected": -2.5291102707006368,
542
+ "step": 410
543
+ },
544
+ {
545
+ "epoch": 0.22058823529411764,
546
+ "grad_norm": 52.10847099745242,
547
+ "kl": 0.0,
548
+ "learning_rate": 4.4233349274571974e-07,
549
+ "logps/chosen": -303.1148280201342,
550
+ "logps/rejected": -273.5418494152047,
551
+ "loss": 0.3532,
552
+ "rewards/chosen": 0.6703597203197095,
553
+ "rewards/margins": 3.336041436614026,
554
+ "rewards/rejected": -2.6656817162943165,
555
+ "step": 420
556
+ },
557
+ {
558
+ "epoch": 0.22584033613445378,
559
+ "grad_norm": 48.65147057788595,
560
+ "kl": 0.0,
561
+ "learning_rate": 4.396721947261496e-07,
562
+ "logps/chosen": -293.84951524849396,
563
+ "logps/rejected": -279.7598924512987,
564
+ "loss": 0.345,
565
+ "rewards/chosen": 0.35480995637824736,
566
+ "rewards/margins": 3.4659533905807636,
567
+ "rewards/rejected": -3.1111434342025164,
568
+ "step": 430
569
+ },
570
+ {
571
+ "epoch": 0.23109243697478993,
572
+ "grad_norm": 60.821733992897606,
573
+ "kl": 0.0,
574
+ "learning_rate": 4.3695925986459107e-07,
575
+ "logps/chosen": -284.638457507622,
576
+ "logps/rejected": -260.53165064102564,
577
+ "loss": 0.3469,
578
+ "rewards/chosen": 0.6963288376970989,
579
+ "rewards/margins": 3.8638249564871634,
580
+ "rewards/rejected": -3.1674961187900643,
581
+ "step": 440
582
+ },
583
+ {
584
+ "epoch": 0.23634453781512604,
585
+ "grad_norm": 40.515343648966464,
586
+ "kl": 0.0,
587
+ "learning_rate": 4.341954267373494e-07,
588
+ "logps/chosen": -288.7482045807453,
589
+ "logps/rejected": -257.25950766509436,
590
+ "loss": 0.3392,
591
+ "rewards/chosen": 1.8534261573175466,
592
+ "rewards/margins": 2.916300141600418,
593
+ "rewards/rejected": -1.0628739842828714,
594
+ "step": 450
595
+ },
596
+ {
597
+ "epoch": 0.2415966386554622,
598
+ "grad_norm": 59.00202211258808,
599
+ "kl": 0.0,
600
+ "learning_rate": 4.313814477774035e-07,
601
+ "logps/chosen": -450.8061615566038,
602
+ "logps/rejected": -373.24873835403724,
603
+ "loss": 0.3372,
604
+ "rewards/chosen": -13.37514433470912,
605
+ "rewards/margins": -0.7130592221314807,
606
+ "rewards/rejected": -12.662085112577639,
607
+ "step": 460
608
+ },
609
+ {
610
+ "epoch": 0.24684873949579833,
611
+ "grad_norm": 47.46663221207845,
612
+ "kl": 0.0,
613
+ "learning_rate": 4.2851808906956134e-07,
614
+ "logps/chosen": -293.39712000739644,
615
+ "logps/rejected": -281.5295685016556,
616
+ "loss": 0.3252,
617
+ "rewards/chosen": 1.1083643986628606,
618
+ "rewards/margins": 3.8087977886928686,
619
+ "rewards/rejected": -2.700433390030008,
620
+ "step": 470
621
+ },
622
+ {
623
+ "epoch": 0.25210084033613445,
624
+ "grad_norm": 51.531711526626296,
625
+ "kl": 0.0,
626
+ "learning_rate": 4.256061301418996e-07,
627
+ "logps/chosen": -276.60667242005815,
628
+ "logps/rejected": -270.2625897381757,
629
+ "loss": 0.3145,
630
+ "rewards/chosen": 1.4284691034361374,
631
+ "rewards/margins": 3.5734818300910147,
632
+ "rewards/rejected": -2.1450127266548775,
633
+ "step": 480
634
+ },
635
+ {
636
+ "epoch": 0.25735294117647056,
637
+ "grad_norm": 45.63717278781431,
638
+ "kl": 0.0,
639
+ "learning_rate": 4.2264636375354283e-07,
640
+ "logps/chosen": -291.6655943627451,
641
+ "logps/rejected": -258.8236573727545,
642
+ "loss": 0.3174,
643
+ "rewards/chosen": 1.2060089111328125,
644
+ "rewards/margins": 4.036167989947838,
645
+ "rewards/rejected": -2.830159078815026,
646
+ "step": 490
647
+ },
648
+ {
649
+ "epoch": 0.26260504201680673,
650
+ "grad_norm": 52.57777553035371,
651
+ "kl": 0.0,
652
+ "learning_rate": 4.1963959567884045e-07,
653
+ "logps/chosen": -308.52905933277026,
654
+ "logps/rejected": -288.6666061046512,
655
+ "loss": 0.3152,
656
+ "rewards/chosen": 1.2610940675477724,
657
+ "rewards/margins": 4.6736672387641605,
658
+ "rewards/rejected": -3.4125731712163883,
659
+ "step": 500
660
+ },
661
+ {
662
+ "epoch": 0.26785714285714285,
663
+ "grad_norm": 40.78741469978887,
664
+ "kl": 0.0,
665
+ "learning_rate": 4.1658664448800094e-07,
666
+ "logps/chosen": -287.10628043831167,
667
+ "logps/rejected": -265.04310993975906,
668
+ "loss": 0.334,
669
+ "rewards/chosen": 0.06761282140558417,
670
+ "rewards/margins": 3.2680445532156366,
671
+ "rewards/rejected": -3.2004317318100526,
672
+ "step": 510
673
+ },
674
+ {
675
+ "epoch": 0.27310924369747897,
676
+ "grad_norm": 53.20527576804437,
677
+ "kl": 0.0,
678
+ "learning_rate": 4.1348834132424204e-07,
679
+ "logps/chosen": -319.5769211871069,
680
+ "logps/rejected": -310.2074679736025,
681
+ "loss": 0.3435,
682
+ "rewards/chosen": -0.4044905368636989,
683
+ "rewards/margins": 4.406604517241503,
684
+ "rewards/rejected": -4.811095054105202,
685
+ "step": 520
686
+ },
687
+ {
688
+ "epoch": 0.27836134453781514,
689
+ "grad_norm": 56.13445331912671,
690
+ "kl": 0.0,
691
+ "learning_rate": 4.103455296775181e-07,
692
+ "logps/chosen": -318.1049981174699,
693
+ "logps/rejected": -280.2241020698052,
694
+ "loss": 0.3043,
695
+ "rewards/chosen": 0.8269406973597515,
696
+ "rewards/margins": 4.47310245796243,
697
+ "rewards/rejected": -3.6461617606026784,
698
+ "step": 530
699
+ },
700
+ {
701
+ "epoch": 0.28361344537815125,
702
+ "grad_norm": 35.13309154843364,
703
+ "kl": 0.0,
704
+ "learning_rate": 4.071590651548867e-07,
705
+ "logps/chosen": -300.40650531045753,
706
+ "logps/rejected": -319.7241766467066,
707
+ "loss": 0.3379,
708
+ "rewards/chosen": -1.0223476434844772,
709
+ "rewards/margins": 4.300584251514588,
710
+ "rewards/rejected": -5.322931894999065,
711
+ "step": 540
712
+ },
713
+ {
714
+ "epoch": 0.28886554621848737,
715
+ "grad_norm": 52.825979085690484,
716
+ "kl": 0.0,
717
+ "learning_rate": 4.039298152475754e-07,
718
+ "logps/chosen": -309.6330613057325,
719
+ "logps/rejected": -316.5918328220859,
720
+ "loss": 0.3449,
721
+ "rewards/chosen": -0.2137961903954767,
722
+ "rewards/margins": 3.9556471390236183,
723
+ "rewards/rejected": -4.169443329419095,
724
+ "step": 550
725
+ },
726
+ {
727
+ "epoch": 0.29411764705882354,
728
+ "grad_norm": 51.016496924749916,
729
+ "kl": 0.0,
730
+ "learning_rate": 4.006586590948141e-07,
731
+ "logps/chosen": -282.8731328616352,
732
+ "logps/rejected": -268.71367915372673,
733
+ "loss": 0.3175,
734
+ "rewards/chosen": 1.230227104522897,
735
+ "rewards/margins": 3.262244593180214,
736
+ "rewards/rejected": -2.0320174886573175,
737
+ "step": 560
738
+ },
739
+ {
740
+ "epoch": 0.29936974789915966,
741
+ "grad_norm": 38.1092514993909,
742
+ "kl": 0.0,
743
+ "learning_rate": 3.973464872444958e-07,
744
+ "logps/chosen": -294.08426339285717,
745
+ "logps/rejected": -278.8594487028302,
746
+ "loss": 0.3418,
747
+ "rewards/chosen": 1.2165776602229716,
748
+ "rewards/margins": 3.751641013947421,
749
+ "rewards/rejected": -2.53506335372445,
750
+ "step": 570
751
+ },
752
+ {
753
+ "epoch": 0.30462184873949577,
754
+ "grad_norm": 58.704387106710655,
755
+ "kl": 0.0,
756
+ "learning_rate": 3.939942014107318e-07,
757
+ "logps/chosen": -271.2850392964072,
758
+ "logps/rejected": -297.9468954248366,
759
+ "loss": 0.3425,
760
+ "rewards/chosen": 0.49638238781226607,
761
+ "rewards/margins": 3.3393096099706607,
762
+ "rewards/rejected": -2.8429272221583948,
763
+ "step": 580
764
+ },
765
+ {
766
+ "epoch": 0.30987394957983194,
767
+ "grad_norm": 40.68813636391329,
768
+ "kl": 0.0,
769
+ "learning_rate": 3.9060271422836624e-07,
770
+ "logps/chosen": -287.60264185855266,
771
+ "logps/rejected": -279.2552780877976,
772
+ "loss": 0.3002,
773
+ "rewards/chosen": 1.2695540377968235,
774
+ "rewards/margins": 4.471388216903036,
775
+ "rewards/rejected": -3.201834179106213,
776
+ "step": 590
777
+ },
778
+ {
779
+ "epoch": 0.31512605042016806,
780
+ "grad_norm": 50.475157194053196,
781
+ "kl": 0.0,
782
+ "learning_rate": 3.871729490045185e-07,
783
+ "logps/chosen": -290.81252297794117,
784
+ "logps/rejected": -269.86484375,
785
+ "loss": 0.2895,
786
+ "rewards/chosen": 1.230052095301011,
787
+ "rewards/margins": 4.965939546472886,
788
+ "rewards/rejected": -3.735887451171875,
789
+ "step": 600
790
+ },
791
+ {
792
+ "epoch": 0.32037815126050423,
793
+ "grad_norm": 52.61703361665091,
794
+ "kl": 0.0,
795
+ "learning_rate": 3.837058394672196e-07,
796
+ "logps/chosen": -272.8394775390625,
797
+ "logps/rejected": -298.7532958984375,
798
+ "loss": 0.3103,
799
+ "rewards/chosen": 0.9608588218688965,
800
+ "rewards/margins": 4.247009944915772,
801
+ "rewards/rejected": -3.286151123046875,
802
+ "step": 610
803
+ },
804
+ {
805
+ "epoch": 0.32563025210084034,
806
+ "grad_norm": 46.87592251872074,
807
+ "kl": 0.0,
808
+ "learning_rate": 3.8020232951121166e-07,
809
+ "logps/chosen": -291.6056034482759,
810
+ "logps/rejected": -298.2428348214286,
811
+ "loss": 0.3197,
812
+ "rewards/chosen": 0.4831507189520474,
813
+ "rewards/margins": 3.7980579455145476,
814
+ "rewards/rejected": -3.3149072265625,
815
+ "step": 620
816
+ },
817
+ {
818
+ "epoch": 0.33088235294117646,
819
+ "grad_norm": 50.37994120078213,
820
+ "kl": 0.0,
821
+ "learning_rate": 3.7666337294097985e-07,
822
+ "logps/chosen": -306.0224609375,
823
+ "logps/rejected": -271.47445401278407,
824
+ "loss": 0.3324,
825
+ "rewards/chosen": 0.5180339813232422,
826
+ "rewards/margins": 4.124568072232333,
827
+ "rewards/rejected": -3.606534090909091,
828
+ "step": 630
829
+ },
830
+ {
831
+ "epoch": 0.33613445378151263,
832
+ "grad_norm": 42.5115670992028,
833
+ "kl": 0.0,
834
+ "learning_rate": 3.730899332110855e-07,
835
+ "logps/chosen": -283.47386259191177,
836
+ "logps/rejected": -306.39444633152175,
837
+ "loss": 0.2874,
838
+ "rewards/chosen": 0.3301387113683364,
839
+ "rewards/margins": 5.416864360994695,
840
+ "rewards/rejected": -5.086725649626358,
841
+ "step": 640
842
+ },
843
+ {
844
+ "epoch": 0.34138655462184875,
845
+ "grad_norm": 41.86018982710908,
846
+ "kl": 0.0,
847
+ "learning_rate": 3.694829831638738e-07,
848
+ "logps/chosen": -277.6752025462963,
849
+ "logps/rejected": -316.3676819620253,
850
+ "loss": 0.3214,
851
+ "rewards/chosen": -0.18712226255440417,
852
+ "rewards/margins": 4.334243518819211,
853
+ "rewards/rejected": -4.521365781373616,
854
+ "step": 650
855
+ },
856
+ {
857
+ "epoch": 0.34663865546218486,
858
+ "grad_norm": 48.14500081411287,
859
+ "kl": 0.0,
860
+ "learning_rate": 3.658435047646238e-07,
861
+ "logps/chosen": -287.62355587121215,
862
+ "logps/rejected": -288.9054939516129,
863
+ "loss": 0.2986,
864
+ "rewards/chosen": 0.6268967137192235,
865
+ "rewards/margins": 5.107723799152692,
866
+ "rewards/rejected": -4.480827085433468,
867
+ "step": 660
868
+ },
869
+ {
870
+ "epoch": 0.35189075630252103,
871
+ "grad_norm": 56.47592313913895,
872
+ "kl": 0.0,
873
+ "learning_rate": 3.621724888342161e-07,
874
+ "logps/chosen": -315.55562279929575,
875
+ "logps/rejected": -265.77844101123594,
876
+ "loss": 0.3345,
877
+ "rewards/chosen": 0.5301809713874065,
878
+ "rewards/margins": 3.9119891344623716,
879
+ "rewards/rejected": -3.381808163074965,
880
+ "step": 670
881
+ },
882
+ {
883
+ "epoch": 0.35714285714285715,
884
+ "grad_norm": 61.01903580408772,
885
+ "kl": 0.0,
886
+ "learning_rate": 3.584709347793895e-07,
887
+ "logps/chosen": -333.9532463121118,
888
+ "logps/rejected": -284.3685878537736,
889
+ "loss": 0.3061,
890
+ "rewards/chosen": 0.6283516735764023,
891
+ "rewards/margins": 4.496458923478132,
892
+ "rewards/rejected": -3.8681072499017297,
893
+ "step": 680
894
+ },
895
+ {
896
+ "epoch": 0.36239495798319327,
897
+ "grad_norm": 65.14201524354712,
898
+ "kl": 0.0,
899
+ "learning_rate": 3.5473985032065946e-07,
900
+ "logps/chosen": -337.2389689700704,
901
+ "logps/rejected": -293.1407566713483,
902
+ "loss": 0.3049,
903
+ "rewards/chosen": 1.298159102318992,
904
+ "rewards/margins": 4.470166802349715,
905
+ "rewards/rejected": -3.1720077000307234,
906
+ "step": 690
907
+ },
908
+ {
909
+ "epoch": 0.36764705882352944,
910
+ "grad_norm": 83.73292739825986,
911
+ "kl": 0.0,
912
+ "learning_rate": 3.509802512179737e-07,
913
+ "logps/chosen": -641.1551411290322,
914
+ "logps/rejected": -469.2648674242424,
915
+ "loss": 0.3542,
916
+ "rewards/chosen": -31.990240675403225,
917
+ "rewards/margins": -6.357922375213832,
918
+ "rewards/rejected": -25.632318300189393,
919
+ "step": 700
920
+ },
921
+ {
922
+ "epoch": 0.37289915966386555,
923
+ "grad_norm": 59.59503648599958,
924
+ "kl": 0.0,
925
+ "learning_rate": 3.4719316099417983e-07,
926
+ "logps/chosen": -305.71205357142856,
927
+ "logps/rejected": -302.07416961477986,
928
+ "loss": 0.3297,
929
+ "rewards/chosen": 0.6071341378348214,
930
+ "rewards/margins": 4.526243746227117,
931
+ "rewards/rejected": -3.9191096083922954,
932
+ "step": 710
933
+ },
934
+ {
935
+ "epoch": 0.37815126050420167,
936
+ "grad_norm": 52.12511665719596,
937
+ "kl": 0.0,
938
+ "learning_rate": 3.4337961065637786e-07,
939
+ "logps/chosen": -354.86205286949684,
940
+ "logps/rejected": -341.6544788431677,
941
+ "loss": 0.3121,
942
+ "rewards/chosen": -0.023094465147774173,
943
+ "rewards/margins": 5.537508137603584,
944
+ "rewards/rejected": -5.560602602751358,
945
+ "step": 720
946
+ },
947
+ {
948
+ "epoch": 0.38340336134453784,
949
+ "grad_norm": 57.5402410664677,
950
+ "kl": 0.0,
951
+ "learning_rate": 3.395406384152371e-07,
952
+ "logps/chosen": -315.799042492378,
953
+ "logps/rejected": -294.5041316105769,
954
+ "loss": 0.3199,
955
+ "rewards/chosen": 0.43840366456566787,
956
+ "rewards/margins": 4.3100072146207555,
957
+ "rewards/rejected": -3.871603550055088,
958
+ "step": 730
959
+ },
960
+ {
961
+ "epoch": 0.38865546218487396,
962
+ "grad_norm": 49.22922457925009,
963
+ "kl": 0.0,
964
+ "learning_rate": 3.356772894023505e-07,
965
+ "logps/chosen": -268.91911764705884,
966
+ "logps/rejected": -281.14432565789474,
967
+ "loss": 0.3627,
968
+ "rewards/chosen": 1.405719124697109,
969
+ "rewards/margins": 3.5240630802892143,
970
+ "rewards/rejected": -2.1183439555921053,
971
+ "step": 740
972
+ },
973
+ {
974
+ "epoch": 0.3939075630252101,
975
+ "grad_norm": 46.913670160363154,
976
+ "kl": 0.0,
977
+ "learning_rate": 3.317906153857054e-07,
978
+ "logps/chosen": -283.1244277468153,
979
+ "logps/rejected": -271.5762078220859,
980
+ "loss": 0.3514,
981
+ "rewards/chosen": 1.5512054832118332,
982
+ "rewards/margins": 3.464116740341338,
983
+ "rewards/rejected": -1.9129112571295053,
984
+ "step": 750
985
+ },
986
+ {
987
+ "epoch": 0.39915966386554624,
988
+ "grad_norm": 39.898837555375735,
989
+ "kl": 0.0,
990
+ "learning_rate": 3.2788167448334784e-07,
991
+ "logps/chosen": -253.08657625786162,
992
+ "logps/rejected": -273.2616459627329,
993
+ "loss": 0.3454,
994
+ "rewards/chosen": 0.4627175001228381,
995
+ "rewards/margins": 3.103327146558688,
996
+ "rewards/rejected": -2.64060964643585,
997
+ "step": 760
998
+ },
999
+ {
1000
+ "epoch": 0.40441176470588236,
1001
+ "grad_norm": 78.57819058757303,
1002
+ "kl": 0.0,
1003
+ "learning_rate": 3.2395153087531763e-07,
1004
+ "logps/chosen": -297.46610213926175,
1005
+ "logps/rejected": -265.79100420321635,
1006
+ "loss": 0.3314,
1007
+ "rewards/chosen": 0.0033199643128670304,
1008
+ "rewards/margins": 4.24907213159631,
1009
+ "rewards/rejected": -4.245752167283443,
1010
+ "step": 770
1011
+ },
1012
+ {
1013
+ "epoch": 0.4096638655462185,
1014
+ "grad_norm": 58.66647545765705,
1015
+ "kl": 0.0,
1016
+ "learning_rate": 3.20001254513933e-07,
1017
+ "logps/chosen": -316.0476090604027,
1018
+ "logps/rejected": -338.84203673245617,
1019
+ "loss": 0.3352,
1020
+ "rewards/chosen": -0.9521625698012793,
1021
+ "rewards/margins": 3.994815012092909,
1022
+ "rewards/rejected": -4.946977581894188,
1023
+ "step": 780
1024
+ },
1025
+ {
1026
+ "epoch": 0.41491596638655465,
1027
+ "grad_norm": 61.099435670035334,
1028
+ "kl": 0.0,
1029
+ "learning_rate": 3.160319208325044e-07,
1030
+ "logps/chosen": -328.2163245506536,
1031
+ "logps/rejected": -299.7488070733533,
1032
+ "loss": 0.3143,
1033
+ "rewards/chosen": -0.38961004120072507,
1034
+ "rewards/margins": 4.762490664614208,
1035
+ "rewards/rejected": -5.152100705814933,
1036
+ "step": 790
1037
+ },
1038
+ {
1039
+ "epoch": 0.42016806722689076,
1040
+ "grad_norm": 55.61375515356607,
1041
+ "kl": 0.0,
1042
+ "learning_rate": 3.1204461045255597e-07,
1043
+ "logps/chosen": -310.96216982886904,
1044
+ "logps/rejected": -292.1716951069079,
1045
+ "loss": 0.3393,
1046
+ "rewards/chosen": 0.4771306628272647,
1047
+ "rewards/margins": 4.309103688500579,
1048
+ "rewards/rejected": -3.831973025673314,
1049
+ "step": 800
1050
+ },
1051
+ {
1052
+ "epoch": 0.4254201680672269,
1053
+ "grad_norm": 64.83942546607469,
1054
+ "kl": 0.0,
1055
+ "learning_rate": 3.0804040888963367e-07,
1056
+ "logps/chosen": -306.39021236795776,
1057
+ "logps/rejected": -276.6224543539326,
1058
+ "loss": 0.3007,
1059
+ "rewards/chosen": 0.6496220978213029,
1060
+ "rewards/margins": 5.41400753551881,
1061
+ "rewards/rejected": -4.764385437697507,
1062
+ "step": 810
1063
+ },
1064
+ {
1065
+ "epoch": 0.43067226890756305,
1066
+ "grad_norm": 47.53109374375667,
1067
+ "kl": 0.0,
1068
+ "learning_rate": 3.040204062577824e-07,
1069
+ "logps/chosen": -319.8095262096774,
1070
+ "logps/rejected": -276.47109375,
1071
+ "loss": 0.3453,
1072
+ "rewards/chosen": 0.12265574547552294,
1073
+ "rewards/margins": 3.962314170546546,
1074
+ "rewards/rejected": -3.839658425071023,
1075
+ "step": 820
1076
+ },
1077
+ {
1078
+ "epoch": 0.43592436974789917,
1079
+ "grad_norm": 50.597587143566045,
1080
+ "kl": 0.0,
1081
+ "learning_rate": 2.999856969727704e-07,
1082
+ "logps/chosen": -301.44910453216374,
1083
+ "logps/rejected": -281.371670511745,
1084
+ "loss": 0.3375,
1085
+ "rewards/chosen": 0.8545954297160545,
1086
+ "rewards/margins": 4.381973588797431,
1087
+ "rewards/rejected": -3.5273781590813758,
1088
+ "step": 830
1089
+ },
1090
+ {
1091
+ "epoch": 0.4411764705882353,
1092
+ "grad_norm": 65.2431749840782,
1093
+ "kl": 0.0,
1094
+ "learning_rate": 2.959373794541426e-07,
1095
+ "logps/chosen": -288.58415743670884,
1096
+ "logps/rejected": -275.80381944444446,
1097
+ "loss": 0.3174,
1098
+ "rewards/chosen": 0.5184780072562302,
1099
+ "rewards/margins": 4.098698170860589,
1100
+ "rewards/rejected": -3.5802201636043596,
1101
+ "step": 840
1102
+ },
1103
+ {
1104
+ "epoch": 0.44642857142857145,
1105
+ "grad_norm": 43.10826918884011,
1106
+ "kl": 0.0,
1107
+ "learning_rate": 2.9187655582618407e-07,
1108
+ "logps/chosen": -317.565112154908,
1109
+ "logps/rejected": -297.73979896496814,
1110
+ "loss": 0.3039,
1111
+ "rewards/chosen": 1.0487361978168137,
1112
+ "rewards/margins": 4.458561225185444,
1113
+ "rewards/rejected": -3.4098250273686306,
1114
+ "step": 850
1115
+ },
1116
+ {
1117
+ "epoch": 0.45168067226890757,
1118
+ "grad_norm": 49.36926037963924,
1119
+ "kl": 0.0,
1120
+ "learning_rate": 2.878043316178753e-07,
1121
+ "logps/chosen": -290.1714082154088,
1122
+ "logps/rejected": -275.16697399068323,
1123
+ "loss": 0.339,
1124
+ "rewards/chosen": -0.1980593579370271,
1125
+ "rewards/margins": 4.149451317528811,
1126
+ "rewards/rejected": -4.347510675465839,
1127
+ "step": 860
1128
+ },
1129
+ {
1130
+ "epoch": 0.4569327731092437,
1131
+ "grad_norm": 31.59893236803937,
1132
+ "kl": 0.0,
1133
+ "learning_rate": 2.837218154619193e-07,
1134
+ "logps/chosen": -310.3774646577381,
1135
+ "logps/rejected": -279.89951685855266,
1136
+ "loss": 0.3072,
1137
+ "rewards/chosen": 0.8483944847470238,
1138
+ "rewards/margins": 4.661724482562608,
1139
+ "rewards/rejected": -3.813329997815584,
1140
+ "step": 870
1141
+ },
1142
+ {
1143
+ "epoch": 0.46218487394957986,
1144
+ "grad_norm": 78.02492897783358,
1145
+ "kl": 0.0,
1146
+ "learning_rate": 2.796301187929257e-07,
1147
+ "logps/chosen": -328.6445046768707,
1148
+ "logps/rejected": -285.24498735549133,
1149
+ "loss": 0.3409,
1150
+ "rewards/chosen": -0.49578680959688565,
1151
+ "rewards/margins": 3.4661660613839667,
1152
+ "rewards/rejected": -3.9619528709808525,
1153
+ "step": 880
1154
+ },
1155
+ {
1156
+ "epoch": 0.46743697478991597,
1157
+ "grad_norm": 47.42133886253641,
1158
+ "kl": 0.0,
1159
+ "learning_rate": 2.755303555448301e-07,
1160
+ "logps/chosen": -321.7949578220859,
1161
+ "logps/rejected": -317.8714171974522,
1162
+ "loss": 0.3471,
1163
+ "rewards/chosen": -1.7351532918543904,
1164
+ "rewards/margins": 4.654421281443778,
1165
+ "rewards/rejected": -6.389574573298169,
1166
+ "step": 890
1167
+ },
1168
+ {
1169
+ "epoch": 0.4726890756302521,
1170
+ "grad_norm": 50.39237398761096,
1171
+ "kl": 0.0,
1172
+ "learning_rate": 2.7142364184763424e-07,
1173
+ "logps/chosen": -284.39781663907286,
1174
+ "logps/rejected": -317.32179178994085,
1175
+ "loss": 0.3519,
1176
+ "rewards/chosen": -0.08173387413782789,
1177
+ "rewards/margins": 4.825091084257143,
1178
+ "rewards/rejected": -4.906824958394971,
1179
+ "step": 900
1180
+ },
1181
+ {
1182
+ "epoch": 0.47794117647058826,
1183
+ "grad_norm": 56.7884471471947,
1184
+ "kl": 0.0,
1185
+ "learning_rate": 2.673110957235479e-07,
1186
+ "logps/chosen": -309.5387185534591,
1187
+ "logps/rejected": -299.7813227872671,
1188
+ "loss": 0.3017,
1189
+ "rewards/chosen": 0.5868375766202338,
1190
+ "rewards/margins": 4.1676898245336655,
1191
+ "rewards/rejected": -3.5808522479134317,
1192
+ "step": 910
1193
+ },
1194
+ {
1195
+ "epoch": 0.4831932773109244,
1196
+ "grad_norm": 55.26734943859427,
1197
+ "kl": 0.0,
1198
+ "learning_rate": 2.6319383678261557e-07,
1199
+ "logps/chosen": -322.89912539308176,
1200
+ "logps/rejected": -293.4796680900621,
1201
+ "loss": 0.3131,
1202
+ "rewards/chosen": 0.8568071089450668,
1203
+ "rewards/margins": 4.327623205849182,
1204
+ "rewards/rejected": -3.470816096904115,
1205
+ "step": 920
1206
+ },
1207
+ {
1208
+ "epoch": 0.4884453781512605,
1209
+ "grad_norm": 43.186318715354865,
1210
+ "kl": 0.0,
1211
+ "learning_rate": 2.5907298591791105e-07,
1212
+ "logps/chosen": -297.91787462349396,
1213
+ "logps/rejected": -322.5411931818182,
1214
+ "loss": 0.2897,
1215
+ "rewards/chosen": 0.7378294151949595,
1216
+ "rewards/margins": 5.026299398580715,
1217
+ "rewards/rejected": -4.288469983385755,
1218
+ "step": 930
1219
+ },
1220
+ {
1221
+ "epoch": 0.49369747899159666,
1222
+ "grad_norm": 75.34771457681897,
1223
+ "kl": 0.0,
1224
+ "learning_rate": 2.5494966500038264e-07,
1225
+ "logps/chosen": -304.362060546875,
1226
+ "logps/rejected": -301.68695746527777,
1227
+ "loss": 0.3408,
1228
+ "rewards/chosen": -0.0454999641938643,
1229
+ "rewards/margins": 4.585332284070025,
1230
+ "rewards/rejected": -4.630832248263889,
1231
+ "step": 940
1232
+ },
1233
+ {
1234
+ "epoch": 0.4989495798319328,
1235
+ "grad_norm": 70.74960017005274,
1236
+ "kl": 0.0,
1237
+ "learning_rate": 2.508249965734319e-07,
1238
+ "logps/chosen": -322.87849884969324,
1239
+ "logps/rejected": -298.17384056528664,
1240
+ "loss": 0.2932,
1241
+ "rewards/chosen": 0.08947515780209032,
1242
+ "rewards/margins": 5.383114517002927,
1243
+ "rewards/rejected": -5.293639359200836,
1244
+ "step": 950
1245
+ },
1246
+ {
1247
+ "epoch": 0.5042016806722689,
1248
+ "grad_norm": 71.05303842998583,
1249
+ "kl": 0.0,
1250
+ "learning_rate": 2.467001035473103e-07,
1251
+ "logps/chosen": -305.84786676646706,
1252
+ "logps/rejected": -311.00056168300654,
1253
+ "loss": 0.3135,
1254
+ "rewards/chosen": 0.8616841024981288,
1255
+ "rewards/margins": 4.789382175542451,
1256
+ "rewards/rejected": -3.927698073044322,
1257
+ "step": 960
1258
+ },
1259
+ {
1260
+ "epoch": 0.509453781512605,
1261
+ "grad_norm": 51.90598218741456,
1262
+ "kl": 0.0,
1263
+ "learning_rate": 2.425761088934142e-07,
1264
+ "logps/chosen": -275.5298913043478,
1265
+ "logps/rejected": -261.6807439072327,
1266
+ "loss": 0.3269,
1267
+ "rewards/chosen": 0.6217543797463364,
1268
+ "rewards/margins": 4.42957770484952,
1269
+ "rewards/rejected": -3.807823325103184,
1270
+ "step": 970
1271
+ },
1272
+ {
1273
+ "epoch": 0.5147058823529411,
1274
+ "grad_norm": 49.01333342668485,
1275
+ "kl": 0.0,
1276
+ "learning_rate": 2.3845413533856514e-07,
1277
+ "logps/chosen": -303.1413620283019,
1278
+ "logps/rejected": -294.9575892857143,
1279
+ "loss": 0.2877,
1280
+ "rewards/chosen": 0.7997542567223123,
1281
+ "rewards/margins": 5.2810229084595015,
1282
+ "rewards/rejected": -4.48126865173719,
1283
+ "step": 980
1284
+ },
1285
+ {
1286
+ "epoch": 0.5199579831932774,
1287
+ "grad_norm": 45.4420006215638,
1288
+ "kl": 0.0,
1289
+ "learning_rate": 2.343353050593553e-07,
1290
+ "logps/chosen": -315.00589576863354,
1291
+ "logps/rejected": -299.73405562106916,
1292
+ "loss": 0.3082,
1293
+ "rewards/chosen": 0.48438277155716225,
1294
+ "rewards/margins": 5.411952082927052,
1295
+ "rewards/rejected": -4.92756931136989,
1296
+ "step": 990
1297
+ },
1298
+ {
1299
+ "epoch": 0.5252100840336135,
1300
+ "grad_norm": 44.01774159207231,
1301
+ "kl": 0.0,
1302
+ "learning_rate": 2.3022073937664383e-07,
1303
+ "logps/chosen": -304.87012987012986,
1304
+ "logps/rejected": -277.4141330948795,
1305
+ "loss": 0.3005,
1306
+ "rewards/chosen": 1.3989817631709112,
1307
+ "rewards/margins": 4.661985928268803,
1308
+ "rewards/rejected": -3.2630041650978914,
1309
+ "step": 1000
1310
+ },
1311
+ {
1312
+ "epoch": 0.5304621848739496,
1313
+ "grad_norm": 48.00123102792709,
1314
+ "kl": 0.0,
1315
+ "learning_rate": 2.261115584502849e-07,
1316
+ "logps/chosen": -307.63991253930817,
1317
+ "logps/rejected": -270.22658676242236,
1318
+ "loss": 0.326,
1319
+ "rewards/chosen": 1.3268984428741648,
1320
+ "rewards/margins": 3.5992706708924103,
1321
+ "rewards/rejected": -2.2723722280182455,
1322
+ "step": 1010
1323
+ },
1324
+ {
1325
+ "epoch": 0.5357142857142857,
1326
+ "grad_norm": 101.16998848852474,
1327
+ "kl": 0.0,
1328
+ "learning_rate": 2.2200888097417302e-07,
1329
+ "logps/chosen": -260.3294677734375,
1330
+ "logps/rejected": -272.8283203125,
1331
+ "loss": 0.3344,
1332
+ "rewards/chosen": 1.6289764404296876,
1333
+ "rewards/margins": 3.6363025665283204,
1334
+ "rewards/rejected": -2.007326126098633,
1335
+ "step": 1020
1336
+ },
1337
+ {
1338
+ "epoch": 0.5409663865546218,
1339
+ "grad_norm": 45.84798251584408,
1340
+ "kl": 0.0,
1341
+ "learning_rate": 2.1791382387168684e-07,
1342
+ "logps/chosen": -293.09831912878786,
1343
+ "logps/rejected": -303.9574848790323,
1344
+ "loss": 0.3042,
1345
+ "rewards/chosen": 0.6066324407404119,
1346
+ "rewards/margins": 5.404102435070048,
1347
+ "rewards/rejected": -4.797469994329637,
1348
+ "step": 1030
1349
+ },
1350
+ {
1351
+ "epoch": 0.5462184873949579,
1352
+ "grad_norm": 40.19468881639845,
1353
+ "kl": 0.0,
1354
+ "learning_rate": 2.1382750199161495e-07,
1355
+ "logps/chosen": -319.7741268382353,
1356
+ "logps/rejected": -292.6617708333333,
1357
+ "loss": 0.3023,
1358
+ "rewards/chosen": 1.018133544921875,
1359
+ "rewards/margins": 4.818516031901042,
1360
+ "rewards/rejected": -3.8003824869791667,
1361
+ "step": 1040
1362
+ },
1363
+ {
1364
+ "epoch": 0.5514705882352942,
1365
+ "grad_norm": 33.38152973598087,
1366
+ "kl": 0.0,
1367
+ "learning_rate": 2.0975102780464673e-07,
1368
+ "logps/chosen": -311.2853467987805,
1369
+ "logps/rejected": -304.88221153846155,
1370
+ "loss": 0.3141,
1371
+ "rewards/chosen": 1.0162078113090702,
1372
+ "rewards/margins": 4.974026258324295,
1373
+ "rewards/rejected": -3.9578184470152245,
1374
+ "step": 1050
1375
+ },
1376
+ {
1377
+ "epoch": 0.5567226890756303,
1378
+ "grad_norm": 63.954166267932834,
1379
+ "kl": 0.0,
1380
+ "learning_rate": 2.0568551110051074e-07,
1381
+ "logps/chosen": -264.285549331761,
1382
+ "logps/rejected": -278.77358307453414,
1383
+ "loss": 0.3119,
1384
+ "rewards/chosen": 1.0249821884827044,
1385
+ "rewards/margins": 4.825221324983093,
1386
+ "rewards/rejected": -3.8002391365003882,
1387
+ "step": 1060
1388
+ },
1389
+ {
1390
+ "epoch": 0.5619747899159664,
1391
+ "grad_norm": 41.06781071234312,
1392
+ "kl": 0.0,
1393
+ "learning_rate": 2.016320586858422e-07,
1394
+ "logps/chosen": -276.17345252403845,
1395
+ "logps/rejected": -310.3708555640244,
1396
+ "loss": 0.2879,
1397
+ "rewards/chosen": 1.260654547275641,
1398
+ "rewards/margins": 5.546377977630062,
1399
+ "rewards/rejected": -4.2857234303544205,
1400
+ "step": 1070
1401
+ },
1402
+ {
1403
+ "epoch": 0.5672268907563025,
1404
+ "grad_norm": 48.37440945890147,
1405
+ "kl": 0.0,
1406
+ "learning_rate": 1.9759177408286337e-07,
1407
+ "logps/chosen": -316.57014678030305,
1408
+ "logps/rejected": -284.27633568548384,
1409
+ "loss": 0.3181,
1410
+ "rewards/chosen": 1.2529100822679924,
1411
+ "rewards/margins": 4.161886384876359,
1412
+ "rewards/rejected": -2.908976302608367,
1413
+ "step": 1080
1414
+ },
1415
+ {
1416
+ "epoch": 0.5724789915966386,
1417
+ "grad_norm": 43.39106017396027,
1418
+ "kl": 0.0,
1419
+ "learning_rate": 1.9356575722895808e-07,
1420
+ "logps/chosen": -298.2481151660839,
1421
+ "logps/rejected": -282.445334569209,
1422
+ "loss": 0.3094,
1423
+ "rewards/chosen": 1.7649928806545017,
1424
+ "rewards/margins": 4.975185889680809,
1425
+ "rewards/rejected": -3.2101930090263067,
1426
+ "step": 1090
1427
+ },
1428
+ {
1429
+ "epoch": 0.5777310924369747,
1430
+ "grad_norm": 60.62146730688155,
1431
+ "kl": 0.0,
1432
+ "learning_rate": 1.895551041772216e-07,
1433
+ "logps/chosen": -307.72318892045456,
1434
+ "logps/rejected": -269.39409722222223,
1435
+ "loss": 0.3155,
1436
+ "rewards/chosen": 1.027849023992365,
1437
+ "rewards/margins": 3.9311439051772608,
1438
+ "rewards/rejected": -2.903294881184896,
1439
+ "step": 1100
1440
+ },
1441
+ {
1442
+ "epoch": 0.582983193277311,
1443
+ "grad_norm": 87.64642171640351,
1444
+ "kl": 0.0,
1445
+ "learning_rate": 1.8556090679806847e-07,
1446
+ "logps/chosen": -302.53110881024094,
1447
+ "logps/rejected": -283.19252232142856,
1448
+ "loss": 0.299,
1449
+ "rewards/chosen": 1.1432133869952465,
1450
+ "rewards/margins": 5.261560554963997,
1451
+ "rewards/rejected": -4.11834716796875,
1452
+ "step": 1110
1453
+ },
1454
+ {
1455
+ "epoch": 0.5882352941176471,
1456
+ "grad_norm": 61.71028625828252,
1457
+ "kl": 0.0,
1458
+ "learning_rate": 1.8158425248197928e-07,
1459
+ "logps/chosen": -325.427734375,
1460
+ "logps/rejected": -312.36092748397436,
1461
+ "loss": 0.2687,
1462
+ "rewards/chosen": 1.890699060951791,
1463
+ "rewards/margins": 5.755905657130677,
1464
+ "rewards/rejected": -3.8652065961788864,
1465
+ "step": 1120
1466
+ },
1467
+ {
1468
+ "epoch": 0.5934873949579832,
1469
+ "grad_norm": 65.2119562803559,
1470
+ "kl": 0.0,
1471
+ "learning_rate": 1.7762622384346609e-07,
1472
+ "logps/chosen": -291.2824315200617,
1473
+ "logps/rejected": -258.82664161392404,
1474
+ "loss": 0.3013,
1475
+ "rewards/chosen": 1.8866918116440008,
1476
+ "rewards/margins": 4.205873229761089,
1477
+ "rewards/rejected": -2.3191814181170884,
1478
+ "step": 1130
1479
+ },
1480
+ {
1481
+ "epoch": 0.5987394957983193,
1482
+ "grad_norm": 38.01395730830506,
1483
+ "kl": 0.0,
1484
+ "learning_rate": 1.7368789842633907e-07,
1485
+ "logps/chosen": -322.3194526336478,
1486
+ "logps/rejected": -307.19121215062114,
1487
+ "loss": 0.3003,
1488
+ "rewards/chosen": 1.5120626005736537,
1489
+ "rewards/margins": 4.748744047099275,
1490
+ "rewards/rejected": -3.2366814465256213,
1491
+ "step": 1140
1492
+ },
1493
+ {
1494
+ "epoch": 0.6039915966386554,
1495
+ "grad_norm": 57.24723300086865,
1496
+ "kl": 0.0,
1497
+ "learning_rate": 1.697703484103532e-07,
1498
+ "logps/chosen": -297.3913395579268,
1499
+ "logps/rejected": -276.43687399839746,
1500
+ "loss": 0.3062,
1501
+ "rewards/chosen": 1.2956105674185403,
1502
+ "rewards/margins": 5.099581686834606,
1503
+ "rewards/rejected": -3.8039711194160657,
1504
+ "step": 1150
1505
+ },
1506
+ {
1507
+ "epoch": 0.6092436974789915,
1508
+ "grad_norm": 46.71083438381099,
1509
+ "kl": 0.0,
1510
+ "learning_rate": 1.6587464031931526e-07,
1511
+ "logps/chosen": -307.08378031716416,
1512
+ "logps/rejected": -292.77835181451616,
1513
+ "loss": 0.3292,
1514
+ "rewards/chosen": 1.054947810386544,
1515
+ "rewards/margins": 4.291957494631335,
1516
+ "rewards/rejected": -3.2370096842447915,
1517
+ "step": 1160
1518
+ },
1519
+ {
1520
+ "epoch": 0.6144957983193278,
1521
+ "grad_norm": 28.548641210768817,
1522
+ "kl": 0.0,
1523
+ "learning_rate": 1.6200183473073048e-07,
1524
+ "logps/chosen": -301.33444602272726,
1525
+ "logps/rejected": -287.6282510080645,
1526
+ "loss": 0.2939,
1527
+ "rewards/chosen": 0.8960710005326704,
1528
+ "rewards/margins": 4.915845997382469,
1529
+ "rewards/rejected": -4.019774996849798,
1530
+ "step": 1170
1531
+ },
1532
+ {
1533
+ "epoch": 0.6197478991596639,
1534
+ "grad_norm": 75.28423549841818,
1535
+ "kl": 0.0,
1536
+ "learning_rate": 1.5815298598706888e-07,
1537
+ "logps/chosen": -294.9990295031056,
1538
+ "logps/rejected": -297.1986536949685,
1539
+ "loss": 0.3023,
1540
+ "rewards/chosen": 0.4812743382424301,
1541
+ "rewards/margins": 5.439699416244593,
1542
+ "rewards/rejected": -4.958425078002162,
1543
+ "step": 1180
1544
+ },
1545
+ {
1546
+ "epoch": 0.625,
1547
+ "grad_norm": 65.4384114129384,
1548
+ "kl": 0.0,
1549
+ "learning_rate": 1.5432914190872756e-07,
1550
+ "logps/chosen": -287.07041139240505,
1551
+ "logps/rejected": -303.2984905478395,
1552
+ "loss": 0.3376,
1553
+ "rewards/chosen": 0.6506334135803995,
1554
+ "rewards/margins": 4.541042906732406,
1555
+ "rewards/rejected": -3.8904094931520063,
1556
+ "step": 1190
1557
+ },
1558
+ {
1559
+ "epoch": 0.6302521008403361,
1560
+ "grad_norm": 39.03209593199308,
1561
+ "kl": 0.0,
1562
+ "learning_rate": 1.505313435087698e-07,
1563
+ "logps/chosen": -302.3798961900685,
1564
+ "logps/rejected": -300.5006510416667,
1565
+ "loss": 0.3026,
1566
+ "rewards/chosen": 0.7183382217198202,
1567
+ "rewards/margins": 5.01667280381842,
1568
+ "rewards/rejected": -4.298334582098599,
1569
+ "step": 1200
1570
+ },
1571
+ {
1572
+ "epoch": 0.6355042016806722,
1573
+ "grad_norm": 37.74806766053673,
1574
+ "kl": 0.0,
1575
+ "learning_rate": 1.4676062470951705e-07,
1576
+ "logps/chosen": -320.5517877684049,
1577
+ "logps/rejected": -318.0521248009554,
1578
+ "loss": 0.3071,
1579
+ "rewards/chosen": 0.636018296692269,
1580
+ "rewards/margins": 5.225012997130167,
1581
+ "rewards/rejected": -4.588994700437898,
1582
+ "step": 1210
1583
+ },
1584
+ {
1585
+ "epoch": 0.6407563025210085,
1586
+ "grad_norm": 48.11396951304232,
1587
+ "kl": 0.0,
1588
+ "learning_rate": 1.430180120610711e-07,
1589
+ "logps/chosen": -294.4245869824841,
1590
+ "logps/rejected": -295.2305885736196,
1591
+ "loss": 0.2893,
1592
+ "rewards/chosen": 0.30213877198043143,
1593
+ "rewards/margins": 5.5401721368673185,
1594
+ "rewards/rejected": -5.238033364886887,
1595
+ "step": 1220
1596
+ },
1597
+ {
1598
+ "epoch": 0.6460084033613446,
1599
+ "grad_norm": 54.817845691829724,
1600
+ "kl": 0.0,
1601
+ "learning_rate": 1.3930452446184385e-07,
1602
+ "logps/chosen": -329.90924310064935,
1603
+ "logps/rejected": -325.61871705572287,
1604
+ "loss": 0.2779,
1605
+ "rewards/chosen": 0.11350006252140193,
1606
+ "rewards/margins": 6.256528965241658,
1607
+ "rewards/rejected": -6.143028902720256,
1608
+ "step": 1230
1609
+ },
1610
+ {
1611
+ "epoch": 0.6512605042016807,
1612
+ "grad_norm": 58.30938790650043,
1613
+ "kl": 0.0,
1614
+ "learning_rate": 1.3562117288116923e-07,
1615
+ "logps/chosen": -327.2169507575758,
1616
+ "logps/rejected": -302.0602318548387,
1617
+ "loss": 0.31,
1618
+ "rewards/chosen": 0.08168421658602627,
1619
+ "rewards/margins": 5.523900462175744,
1620
+ "rewards/rejected": -5.442216245589718,
1621
+ "step": 1240
1622
+ },
1623
+ {
1624
+ "epoch": 0.6565126050420168,
1625
+ "grad_norm": 55.044099371688475,
1626
+ "kl": 0.0,
1627
+ "learning_rate": 1.319689600840747e-07,
1628
+ "logps/chosen": -308.8638286226115,
1629
+ "logps/rejected": -301.6549079754601,
1630
+ "loss": 0.323,
1631
+ "rewards/chosen": -0.07818331384355096,
1632
+ "rewards/margins": 4.454612785298512,
1633
+ "rewards/rejected": -4.532796099142063,
1634
+ "step": 1250
1635
+ },
1636
+ {
1637
+ "epoch": 0.6617647058823529,
1638
+ "grad_norm": 102.96993890295445,
1639
+ "kl": 0.0,
1640
+ "learning_rate": 1.2834888035828596e-07,
1641
+ "logps/chosen": -300.1451028963415,
1642
+ "logps/rejected": -284.04051482371796,
1643
+ "loss": 0.2618,
1644
+ "rewards/chosen": 1.2945330550030965,
1645
+ "rewards/margins": 6.069371233588834,
1646
+ "rewards/rejected": -4.774838178585737,
1647
+ "step": 1260
1648
+ },
1649
+ {
1650
+ "epoch": 0.667016806722689,
1651
+ "grad_norm": 62.28101991590411,
1652
+ "kl": 0.0,
1653
+ "learning_rate": 1.2476191924353932e-07,
1654
+ "logps/chosen": -340.08214285714286,
1655
+ "logps/rejected": -286.0248114224138,
1656
+ "loss": 0.2961,
1657
+ "rewards/chosen": 1.140911167689732,
1658
+ "rewards/margins": 5.149687181294258,
1659
+ "rewards/rejected": -4.008776013604526,
1660
+ "step": 1270
1661
+ },
1662
+ {
1663
+ "epoch": 0.6722689075630253,
1664
+ "grad_norm": 56.339091641639506,
1665
+ "kl": 0.0,
1666
+ "learning_rate": 1.2120905326327596e-07,
1667
+ "logps/chosen": -294.5368897928994,
1668
+ "logps/rejected": -291.57833195364236,
1669
+ "loss": 0.3268,
1670
+ "rewards/chosen": 0.9549788977267474,
1671
+ "rewards/margins": 4.6167907768658205,
1672
+ "rewards/rejected": -3.661811879139073,
1673
+ "step": 1280
1674
+ },
1675
+ {
1676
+ "epoch": 0.6775210084033614,
1677
+ "grad_norm": 66.6477563654386,
1678
+ "kl": 0.0,
1679
+ "learning_rate": 1.1769124965879091e-07,
1680
+ "logps/chosen": -307.14776490066225,
1681
+ "logps/rejected": -255.9688424556213,
1682
+ "loss": 0.3042,
1683
+ "rewards/chosen": 1.2051307728748448,
1684
+ "rewards/margins": 4.886082675727101,
1685
+ "rewards/rejected": -3.680951902852256,
1686
+ "step": 1290
1687
+ },
1688
+ {
1689
+ "epoch": 0.6827731092436975,
1690
+ "grad_norm": 51.16719779623548,
1691
+ "kl": 0.0,
1692
+ "learning_rate": 1.1420946612590837e-07,
1693
+ "logps/chosen": -274.20879836309524,
1694
+ "logps/rejected": -287.1597193667763,
1695
+ "loss": 0.3073,
1696
+ "rewards/chosen": 0.7768129621233258,
1697
+ "rewards/margins": 4.585587408309592,
1698
+ "rewards/rejected": -3.8087744461862663,
1699
+ "step": 1300
1700
+ },
1701
+ {
1702
+ "epoch": 0.6880252100840336,
1703
+ "grad_norm": 46.424902003345146,
1704
+ "kl": 0.0,
1705
+ "learning_rate": 1.1076465055425646e-07,
1706
+ "logps/chosen": -295.8868777252907,
1707
+ "logps/rejected": -271.63410578547297,
1708
+ "loss": 0.2858,
1709
+ "rewards/chosen": 1.1238191294115643,
1710
+ "rewards/margins": 5.138592936421699,
1711
+ "rewards/rejected": -4.014773807010135,
1712
+ "step": 1310
1713
+ },
1714
+ {
1715
+ "epoch": 0.6932773109243697,
1716
+ "grad_norm": 68.63654254788709,
1717
+ "kl": 0.0,
1718
+ "learning_rate": 1.0735774076921128e-07,
1719
+ "logps/chosen": -290.4839599609375,
1720
+ "logps/rejected": -246.5270751953125,
1721
+ "loss": 0.3271,
1722
+ "rewards/chosen": 1.0010972976684571,
1723
+ "rewards/margins": 4.268711280822754,
1724
+ "rewards/rejected": -3.267613983154297,
1725
+ "step": 1320
1726
+ },
1727
+ {
1728
+ "epoch": 0.6985294117647058,
1729
+ "grad_norm": 41.396174001243224,
1730
+ "kl": 0.0,
1731
+ "learning_rate": 1.039896642765809e-07,
1732
+ "logps/chosen": -301.9284396701389,
1733
+ "logps/rejected": -323.622314453125,
1734
+ "loss": 0.2906,
1735
+ "rewards/chosen": 0.9389337963528104,
1736
+ "rewards/margins": 5.783896870083279,
1737
+ "rewards/rejected": -4.844963073730469,
1738
+ "step": 1330
1739
+ },
1740
+ {
1741
+ "epoch": 0.7037815126050421,
1742
+ "grad_norm": 42.28257001832088,
1743
+ "kl": 0.0,
1744
+ "learning_rate": 1.0066133801009871e-07,
1745
+ "logps/chosen": -330.92494419642856,
1746
+ "logps/rejected": -265.96412417763156,
1747
+ "loss": 0.2927,
1748
+ "rewards/chosen": 1.0747142973400297,
1749
+ "rewards/margins": 5.422101510796033,
1750
+ "rewards/rejected": -4.347387213456003,
1751
+ "step": 1340
1752
+ },
1753
+ {
1754
+ "epoch": 0.7090336134453782,
1755
+ "grad_norm": 84.07485027336253,
1756
+ "kl": 0.0,
1757
+ "learning_rate": 9.737366808179553e-08,
1758
+ "logps/chosen": -309.5953733766234,
1759
+ "logps/rejected": -269.1078219126506,
1760
+ "loss": 0.2967,
1761
+ "rewards/chosen": 0.7130060567484273,
1762
+ "rewards/margins": 5.262509245283819,
1763
+ "rewards/rejected": -4.549503188535391,
1764
+ "step": 1350
1765
+ },
1766
+ {
1767
+ "epoch": 0.7142857142857143,
1768
+ "grad_norm": 46.16115962232735,
1769
+ "kl": 0.0,
1770
+ "learning_rate": 9.412754953531663e-08,
1771
+ "logps/chosen": -268.95225954341316,
1772
+ "logps/rejected": -302.0711039624183,
1773
+ "loss": 0.3274,
1774
+ "rewards/chosen": 0.8310946161875468,
1775
+ "rewards/margins": 4.501142151803355,
1776
+ "rewards/rejected": -3.670047535615809,
1777
+ "step": 1360
1778
+ },
1779
+ {
1780
+ "epoch": 0.7195378151260504,
1781
+ "grad_norm": 125.1880813626159,
1782
+ "kl": 0.0,
1783
+ "learning_rate": 9.092386610225325e-08,
1784
+ "logps/chosen": -269.0572060032895,
1785
+ "logps/rejected": -291.4349655877976,
1786
+ "loss": 0.308,
1787
+ "rewards/chosen": 1.3923439226652448,
1788
+ "rewards/margins": 4.397153347655944,
1789
+ "rewards/rejected": -3.0048094249906994,
1790
+ "step": 1370
1791
+ },
1792
+ {
1793
+ "epoch": 0.7247899159663865,
1794
+ "grad_norm": 72.34232290742432,
1795
+ "kl": 0.0,
1796
+ "learning_rate": 8.776348996155317e-08,
1797
+ "logps/chosen": -295.1348721590909,
1798
+ "logps/rejected": -292.1137348790323,
1799
+ "loss": 0.2973,
1800
+ "rewards/chosen": 0.5286340886896307,
1801
+ "rewards/margins": 5.645270303407373,
1802
+ "rewards/rejected": -5.116636214717742,
1803
+ "step": 1380
1804
+ },
1805
+ {
1806
+ "epoch": 0.7300420168067226,
1807
+ "grad_norm": 101.10706766564086,
1808
+ "kl": 0.0,
1809
+ "learning_rate": 8.464728150207636e-08,
1810
+ "logps/chosen": -298.3083235062893,
1811
+ "logps/rejected": -309.3073563664596,
1812
+ "loss": 0.2942,
1813
+ "rewards/chosen": 1.189275585630405,
1814
+ "rewards/margins": 5.284051052075474,
1815
+ "rewards/rejected": -4.0947754664450695,
1816
+ "step": 1390
1817
+ },
1818
+ {
1819
+ "epoch": 0.7352941176470589,
1820
+ "grad_norm": 32.92022315972127,
1821
+ "kl": 0.0,
1822
+ "learning_rate": 8.15760890883607e-08,
1823
+ "logps/chosen": -282.652443272293,
1824
+ "logps/rejected": -268.00280387269936,
1825
+ "loss": 0.3095,
1826
+ "rewards/chosen": 0.9466076504652667,
1827
+ "rewards/margins": 4.818131267700696,
1828
+ "rewards/rejected": -3.8715236172354293,
1829
+ "step": 1400
1830
+ },
1831
+ {
1832
+ "epoch": 0.740546218487395,
1833
+ "grad_norm": 66.0924893411411,
1834
+ "kl": 0.0,
1835
+ "learning_rate": 7.855074882966103e-08,
1836
+ "logps/chosen": -317.7045433407738,
1837
+ "logps/rejected": -290.1392115542763,
1838
+ "loss": 0.2904,
1839
+ "rewards/chosen": 0.8157288687569755,
1840
+ "rewards/margins": 5.654886776343324,
1841
+ "rewards/rejected": -4.8391579075863485,
1842
+ "step": 1410
1843
+ },
1844
+ {
1845
+ "epoch": 0.7457983193277311,
1846
+ "grad_norm": 29.988518434005318,
1847
+ "kl": 0.0,
1848
+ "learning_rate": 7.557208435232449e-08,
1849
+ "logps/chosen": -281.4327616494083,
1850
+ "logps/rejected": -294.1578797599338,
1851
+ "loss": 0.2786,
1852
+ "rewards/chosen": 1.3708069028233636,
1853
+ "rewards/margins": 5.1220817695969565,
1854
+ "rewards/rejected": -3.751274866773593,
1855
+ "step": 1420
1856
+ },
1857
+ {
1858
+ "epoch": 0.7510504201680672,
1859
+ "grad_norm": 137.787205313578,
1860
+ "kl": 0.0,
1861
+ "learning_rate": 7.264090657556443e-08,
1862
+ "logps/chosen": -301.6008921967456,
1863
+ "logps/rejected": -260.6448158112583,
1864
+ "loss": 0.3048,
1865
+ "rewards/chosen": 0.6026445196930473,
1866
+ "rewards/margins": 5.368352381991268,
1867
+ "rewards/rejected": -4.76570786229822,
1868
+ "step": 1430
1869
+ },
1870
+ {
1871
+ "epoch": 0.7563025210084033,
1872
+ "grad_norm": 72.85612810540347,
1873
+ "kl": 0.0,
1874
+ "learning_rate": 6.975801349069385e-08,
1875
+ "logps/chosen": -284.1950284090909,
1876
+ "logps/rejected": -288.5327872983871,
1877
+ "loss": 0.2889,
1878
+ "rewards/chosen": 0.9271378950639204,
1879
+ "rewards/margins": 5.736759792115331,
1880
+ "rewards/rejected": -4.809621897051411,
1881
+ "step": 1440
1882
+ },
1883
+ {
1884
+ "epoch": 0.7615546218487395,
1885
+ "grad_norm": 102.78241617760347,
1886
+ "kl": 0.0,
1887
+ "learning_rate": 6.692418994387799e-08,
1888
+ "logps/chosen": -288.68474264705884,
1889
+ "logps/rejected": -295.6461452095808,
1890
+ "loss": 0.2864,
1891
+ "rewards/chosen": 1.031971950157016,
1892
+ "rewards/margins": 5.655328021218018,
1893
+ "rewards/rejected": -4.623356071061003,
1894
+ "step": 1450
1895
+ },
1896
+ {
1897
+ "epoch": 0.7668067226890757,
1898
+ "grad_norm": 65.20920984482183,
1899
+ "kl": 0.0,
1900
+ "learning_rate": 6.414020742246593e-08,
1901
+ "logps/chosen": -290.9591749237805,
1902
+ "logps/rejected": -305.4570562900641,
1903
+ "loss": 0.2474,
1904
+ "rewards/chosen": 1.242940390982279,
1905
+ "rewards/margins": 6.340856431647342,
1906
+ "rewards/rejected": -5.097916040665064,
1907
+ "step": 1460
1908
+ },
1909
+ {
1910
+ "epoch": 0.7720588235294118,
1911
+ "grad_norm": 66.73235542937682,
1912
+ "kl": 0.0,
1913
+ "learning_rate": 6.140682384495902e-08,
1914
+ "logps/chosen": -311.04422530594405,
1915
+ "logps/rejected": -286.8678937146893,
1916
+ "loss": 0.2764,
1917
+ "rewards/chosen": 1.3596580878837958,
1918
+ "rewards/margins": 5.264105654201769,
1919
+ "rewards/rejected": -3.904447566317973,
1920
+ "step": 1470
1921
+ },
1922
+ {
1923
+ "epoch": 0.7773109243697479,
1924
+ "grad_norm": 86.01646021421267,
1925
+ "kl": 0.0,
1926
+ "learning_rate": 5.872478335467298e-08,
1927
+ "logps/chosen": -266.1013243140244,
1928
+ "logps/rejected": -272.6937850560897,
1929
+ "loss": 0.3282,
1930
+ "rewards/chosen": 1.3123749523628048,
1931
+ "rewards/margins": 4.397385187489007,
1932
+ "rewards/rejected": -3.085010235126202,
1933
+ "step": 1480
1934
+ },
1935
+ {
1936
+ "epoch": 0.782563025210084,
1937
+ "grad_norm": 40.2582163158674,
1938
+ "kl": 0.0,
1939
+ "learning_rate": 5.60948161171505e-08,
1940
+ "logps/chosen": -293.6717694256757,
1941
+ "logps/rejected": -290.36904978197674,
1942
+ "loss": 0.3256,
1943
+ "rewards/chosen": 1.6847073323017843,
1944
+ "rewards/margins": 4.269535376394717,
1945
+ "rewards/rejected": -2.5848280440929323,
1946
+ "step": 1490
1947
+ },
1948
+ {
1949
+ "epoch": 0.7878151260504201,
1950
+ "grad_norm": 63.304715322652115,
1951
+ "kl": 0.0,
1952
+ "learning_rate": 5.351763812137916e-08,
1953
+ "logps/chosen": -276.54271343954247,
1954
+ "logps/rejected": -297.85116485778445,
1955
+ "loss": 0.307,
1956
+ "rewards/chosen": 1.3718405330882353,
1957
+ "rewards/margins": 4.822991575729516,
1958
+ "rewards/rejected": -3.45115104264128,
1959
+ "step": 1500
1960
+ },
1961
+ {
1962
+ "epoch": 0.7930672268907563,
1963
+ "grad_norm": 52.372015337826504,
1964
+ "kl": 0.0,
1965
+ "learning_rate": 5.0993950984868836e-08,
1966
+ "logps/chosen": -268.07744565217394,
1967
+ "logps/rejected": -287.0620332154088,
1968
+ "loss": 0.331,
1969
+ "rewards/chosen": 1.229935924458948,
1970
+ "rewards/margins": 4.45942081844971,
1971
+ "rewards/rejected": -3.2294848939907626,
1972
+ "step": 1510
1973
+ },
1974
+ {
1975
+ "epoch": 0.7983193277310925,
1976
+ "grad_norm": 32.103803649440515,
1977
+ "kl": 0.0,
1978
+ "learning_rate": 4.8524441762641284e-08,
1979
+ "logps/chosen": -314.27061222484275,
1980
+ "logps/rejected": -264.5042944487578,
1981
+ "loss": 0.273,
1982
+ "rewards/chosen": 1.570641667587952,
1983
+ "rewards/margins": 5.312982324129101,
1984
+ "rewards/rejected": -3.742340656541149,
1985
+ "step": 1520
1986
+ },
1987
+ {
1988
+ "epoch": 0.8035714285714286,
1989
+ "grad_norm": 43.137151503875806,
1990
+ "kl": 0.0,
1991
+ "learning_rate": 4.6109782760184956e-08,
1992
+ "logps/chosen": -277.4168693862275,
1993
+ "logps/rejected": -293.28860294117646,
1994
+ "loss": 0.2971,
1995
+ "rewards/chosen": 1.7373914889946669,
1996
+ "rewards/margins": 4.780555711063703,
1997
+ "rewards/rejected": -3.043164222069036,
1998
+ "step": 1530
1999
+ },
2000
+ {
2001
+ "epoch": 0.8088235294117647,
2002
+ "grad_norm": 50.02695394326994,
2003
+ "kl": 0.0,
2004
+ "learning_rate": 4.375063135042445e-08,
2005
+ "logps/chosen": -295.5123428254438,
2006
+ "logps/rejected": -317.11524730960264,
2007
+ "loss": 0.2655,
2008
+ "rewards/chosen": 1.741723596697023,
2009
+ "rewards/margins": 5.875752757758695,
2010
+ "rewards/rejected": -4.134029161061672,
2011
+ "step": 1540
2012
+ },
2013
+ {
2014
+ "epoch": 0.8140756302521008,
2015
+ "grad_norm": 39.113276722110314,
2016
+ "kl": 0.0,
2017
+ "learning_rate": 4.144762979475575e-08,
2018
+ "logps/chosen": -296.8016826923077,
2019
+ "logps/rejected": -275.8460451977401,
2020
+ "loss": 0.3124,
2021
+ "rewards/chosen": 1.1531088235494973,
2022
+ "rewards/margins": 4.638992460886185,
2023
+ "rewards/rejected": -3.4858836373366877,
2024
+ "step": 1550
2025
+ },
2026
+ {
2027
+ "epoch": 0.819327731092437,
2028
+ "grad_norm": 65.74166961412382,
2029
+ "kl": 0.0,
2030
+ "learning_rate": 3.9201405068195385e-08,
2031
+ "logps/chosen": -300.7455797697368,
2032
+ "logps/rejected": -287.8294735863095,
2033
+ "loss": 0.3239,
2034
+ "rewards/chosen": 1.3694068507144326,
2035
+ "rewards/margins": 4.3731968432739565,
2036
+ "rewards/rejected": -3.0037899925595237,
2037
+ "step": 1560
2038
+ },
2039
+ {
2040
+ "epoch": 0.8245798319327731,
2041
+ "grad_norm": 40.97531017956201,
2042
+ "kl": 0.0,
2043
+ "learning_rate": 3.701256868869124e-08,
2044
+ "logps/chosen": -308.8232851808176,
2045
+ "logps/rejected": -251.1597923136646,
2046
+ "loss": 0.3387,
2047
+ "rewards/chosen": 1.3816734649850138,
2048
+ "rewards/margins": 4.159259081007917,
2049
+ "rewards/rejected": -2.7775856160229035,
2050
+ "step": 1570
2051
+ },
2052
+ {
2053
+ "epoch": 0.8298319327731093,
2054
+ "grad_norm": 67.8764255365854,
2055
+ "kl": 0.0,
2056
+ "learning_rate": 3.488171655064107e-08,
2057
+ "logps/chosen": -283.0639042075163,
2058
+ "logps/rejected": -281.1170705464072,
2059
+ "loss": 0.3203,
2060
+ "rewards/chosen": 1.294641232958027,
2061
+ "rewards/margins": 4.51022142511278,
2062
+ "rewards/rejected": -3.215580192154753,
2063
+ "step": 1580
2064
+ },
2065
+ {
2066
+ "epoch": 0.8350840336134454,
2067
+ "grad_norm": 50.875889688456084,
2068
+ "kl": 0.0,
2069
+ "learning_rate": 3.28094287626651e-08,
2070
+ "logps/chosen": -326.3169806985294,
2071
+ "logps/rejected": -297.6423697916667,
2072
+ "loss": 0.295,
2073
+ "rewards/chosen": 1.5313853544347427,
2074
+ "rewards/margins": 5.293017434512867,
2075
+ "rewards/rejected": -3.761632080078125,
2076
+ "step": 1590
2077
+ },
2078
+ {
2079
+ "epoch": 0.8403361344537815,
2080
+ "grad_norm": 46.374855210212814,
2081
+ "kl": 0.0,
2082
+ "learning_rate": 3.079626948967534e-08,
2083
+ "logps/chosen": -304.4027423469388,
2084
+ "logps/rejected": -287.12953847543355,
2085
+ "loss": 0.3133,
2086
+ "rewards/chosen": 1.1032969156901042,
2087
+ "rewards/margins": 4.994385853438478,
2088
+ "rewards/rejected": -3.8910889377483744,
2089
+ "step": 1600
2090
+ },
2091
+ {
2092
+ "epoch": 0.8455882352941176,
2093
+ "grad_norm": 55.05693685423068,
2094
+ "kl": 0.0,
2095
+ "learning_rate": 2.88427867992862e-08,
2096
+ "logps/chosen": -289.62710160818716,
2097
+ "logps/rejected": -261.55342911073825,
2098
+ "loss": 0.2771,
2099
+ "rewards/chosen": 1.5187704521313048,
2100
+ "rewards/margins": 5.328108912963939,
2101
+ "rewards/rejected": -3.809338460832634,
2102
+ "step": 1610
2103
+ },
2104
+ {
2105
+ "epoch": 0.8508403361344538,
2106
+ "grad_norm": 40.13605931494857,
2107
+ "kl": 0.0,
2108
+ "learning_rate": 2.6949512512606965e-08,
2109
+ "logps/chosen": -292.2077305169753,
2110
+ "logps/rejected": -295.6835195806962,
2111
+ "loss": 0.2883,
2112
+ "rewards/chosen": 1.2319991500289351,
2113
+ "rewards/margins": 5.180050506091039,
2114
+ "rewards/rejected": -3.9480513560621042,
2115
+ "step": 1620
2116
+ },
2117
+ {
2118
+ "epoch": 0.8560924369747899,
2119
+ "grad_norm": 26.234795280971806,
2120
+ "kl": 0.0,
2121
+ "learning_rate": 2.5116962059457653e-08,
2122
+ "logps/chosen": -302.0387451171875,
2123
+ "logps/rejected": -288.72607421875,
2124
+ "loss": 0.2664,
2125
+ "rewards/chosen": 1.2181474685668945,
2126
+ "rewards/margins": 5.965513038635254,
2127
+ "rewards/rejected": -4.747365570068359,
2128
+ "step": 1630
2129
+ },
2130
+ {
2131
+ "epoch": 0.8613445378151261,
2132
+ "grad_norm": 88.66056550298477,
2133
+ "kl": 0.0,
2134
+ "learning_rate": 2.334563433804687e-08,
2135
+ "logps/chosen": -321.8770623059006,
2136
+ "logps/rejected": -290.4252161949685,
2137
+ "loss": 0.2794,
2138
+ "rewards/chosen": 1.0824178375812792,
2139
+ "rewards/margins": 5.4567103671852495,
2140
+ "rewards/rejected": -4.37429252960397,
2141
+ "step": 1640
2142
+ },
2143
+ {
2144
+ "epoch": 0.8665966386554622,
2145
+ "grad_norm": 46.4344164784028,
2146
+ "kl": 0.0,
2147
+ "learning_rate": 2.1636011579150793e-08,
2148
+ "logps/chosen": -295.313525390625,
2149
+ "logps/rejected": -301.4338134765625,
2150
+ "loss": 0.2569,
2151
+ "rewards/chosen": 1.2288617134094237,
2152
+ "rewards/margins": 5.767054080963135,
2153
+ "rewards/rejected": -4.538192367553711,
2154
+ "step": 1650
2155
+ },
2156
+ {
2157
+ "epoch": 0.8718487394957983,
2158
+ "grad_norm": 46.720887932263636,
2159
+ "kl": 0.0,
2160
+ "learning_rate": 1.998855921482906e-08,
2161
+ "logps/chosen": -277.0703369140625,
2162
+ "logps/rejected": -252.481689453125,
2163
+ "loss": 0.321,
2164
+ "rewards/chosen": 0.9479250907897949,
2165
+ "rewards/margins": 4.605030727386475,
2166
+ "rewards/rejected": -3.65710563659668,
2167
+ "step": 1660
2168
+ },
2169
+ {
2170
+ "epoch": 0.8771008403361344,
2171
+ "grad_norm": 77.29491195750472,
2172
+ "kl": 0.0,
2173
+ "learning_rate": 1.8403725751714615e-08,
2174
+ "logps/chosen": -291.567009066358,
2175
+ "logps/rejected": -321.13864715189874,
2176
+ "loss": 0.3081,
2177
+ "rewards/chosen": 0.9058116394796489,
2178
+ "rewards/margins": 5.581022712956706,
2179
+ "rewards/rejected": -4.675211073477057,
2180
+ "step": 1670
2181
+ },
2182
+ {
2183
+ "epoch": 0.8823529411764706,
2184
+ "grad_norm": 35.132417644186994,
2185
+ "kl": 0.0,
2186
+ "learning_rate": 1.6881942648911074e-08,
2187
+ "logps/chosen": -289.9874625748503,
2188
+ "logps/rejected": -282.17639399509807,
2189
+ "loss": 0.3084,
2190
+ "rewards/chosen": 1.0200939064254304,
2191
+ "rewards/margins": 5.139812570258151,
2192
+ "rewards/rejected": -4.119718663832721,
2193
+ "step": 1680
2194
+ },
2195
+ {
2196
+ "epoch": 0.8876050420168067,
2197
+ "grad_norm": 41.03832392131175,
2198
+ "kl": 0.0,
2199
+ "learning_rate": 1.5423624200531115e-08,
2200
+ "logps/chosen": -295.63090376420456,
2201
+ "logps/rejected": -312.71739366319446,
2202
+ "loss": 0.267,
2203
+ "rewards/chosen": 1.9761036959561435,
2204
+ "rewards/margins": 5.501615119702889,
2205
+ "rewards/rejected": -3.5255114237467446,
2206
+ "step": 1690
2207
+ },
2208
+ {
2209
+ "epoch": 0.8928571428571429,
2210
+ "grad_norm": 75.37154437951729,
2211
+ "kl": 0.0,
2212
+ "learning_rate": 1.4029167422908105e-08,
2213
+ "logps/chosen": -276.8064123376623,
2214
+ "logps/rejected": -277.1538968373494,
2215
+ "loss": 0.2886,
2216
+ "rewards/chosen": 1.2620481020444398,
2217
+ "rewards/margins": 5.54502657354764,
2218
+ "rewards/rejected": -4.2829784715032,
2219
+ "step": 1700
2220
+ },
2221
+ {
2222
+ "epoch": 0.898109243697479,
2223
+ "grad_norm": 40.567317410299374,
2224
+ "kl": 0.0,
2225
+ "learning_rate": 1.2698951946511327e-08,
2226
+ "logps/chosen": -308.1201601808176,
2227
+ "logps/rejected": -281.5419497282609,
2228
+ "loss": 0.3105,
2229
+ "rewards/chosen": 0.3164803127072892,
2230
+ "rewards/margins": 4.992804434407599,
2231
+ "rewards/rejected": -4.67632412170031,
2232
+ "step": 1710
2233
+ },
2234
+ {
2235
+ "epoch": 0.9033613445378151,
2236
+ "grad_norm": 54.46098979124272,
2237
+ "kl": 0.0,
2238
+ "learning_rate": 1.1433339912594265e-08,
2239
+ "logps/chosen": -306.5632858727811,
2240
+ "logps/rejected": -280.34178394039736,
2241
+ "loss": 0.2749,
2242
+ "rewards/chosen": 1.2275824010724852,
2243
+ "rewards/margins": 6.236800730697899,
2244
+ "rewards/rejected": -5.009218329625414,
2245
+ "step": 1720
2246
+ },
2247
+ {
2248
+ "epoch": 0.9086134453781513,
2249
+ "grad_norm": 40.407188173526485,
2250
+ "kl": 0.0,
2251
+ "learning_rate": 1.0232675874604608e-08,
2252
+ "logps/chosen": -309.16411713286715,
2253
+ "logps/rejected": -310.2364936440678,
2254
+ "loss": 0.2894,
2255
+ "rewards/chosen": 0.768409675651497,
2256
+ "rewards/margins": 5.153310129725472,
2257
+ "rewards/rejected": -4.384900454073976,
2258
+ "step": 1730
2259
+ },
2260
+ {
2261
+ "epoch": 0.9138655462184874,
2262
+ "grad_norm": 30.438351627967617,
2263
+ "kl": 0.0,
2264
+ "learning_rate": 9.097286704381896e-09,
2265
+ "logps/chosen": -317.2416330645161,
2266
+ "logps/rejected": -278.5894886363636,
2267
+ "loss": 0.3153,
2268
+ "rewards/chosen": 1.0305794992754536,
2269
+ "rewards/margins": 4.64966315812962,
2270
+ "rewards/rejected": -3.6190836588541666,
2271
+ "step": 1740
2272
+ },
2273
+ {
2274
+ "epoch": 0.9191176470588235,
2275
+ "grad_norm": 56.93756466862378,
2276
+ "kl": 0.0,
2277
+ "learning_rate": 8.02748150316937e-09,
2278
+ "logps/chosen": -286.57449070411394,
2279
+ "logps/rejected": -278.80803915895063,
2280
+ "loss": 0.3173,
2281
+ "rewards/chosen": 1.041691840449466,
2282
+ "rewards/margins": 4.70056070076486,
2283
+ "rewards/rejected": -3.6588688603153936,
2284
+ "step": 1750
2285
+ },
2286
+ {
2287
+ "epoch": 0.9243697478991597,
2288
+ "grad_norm": 59.48003945550571,
2289
+ "kl": 0.0,
2290
+ "learning_rate": 7.023551517463089e-09,
2291
+ "logps/chosen": -276.2664721385542,
2292
+ "logps/rejected": -300.2993861607143,
2293
+ "loss": 0.3055,
2294
+ "rewards/chosen": 1.1007473497505647,
2295
+ "rewards/margins": 4.7333291844198015,
2296
+ "rewards/rejected": -3.632581834669237,
2297
+ "step": 1760
2298
+ },
2299
+ {
2300
+ "epoch": 0.9296218487394958,
2301
+ "grad_norm": 75.70187200694772,
2302
+ "kl": 0.0,
2303
+ "learning_rate": 6.085770059722634e-09,
2304
+ "logps/chosen": -276.1528105345912,
2305
+ "logps/rejected": -285.99791343167703,
2306
+ "loss": 0.2734,
2307
+ "rewards/chosen": 1.5460262658461086,
2308
+ "rewards/margins": 5.593088920519051,
2309
+ "rewards/rejected": -4.047062654672943,
2310
+ "step": 1770
2311
+ },
2312
+ {
2313
+ "epoch": 0.9348739495798319,
2314
+ "grad_norm": 62.362510517390575,
2315
+ "kl": 0.0,
2316
+ "learning_rate": 5.214392433963488e-09,
2317
+ "logps/chosen": -291.6478470203488,
2318
+ "logps/rejected": -285.2013038429054,
2319
+ "loss": 0.324,
2320
+ "rewards/chosen": 0.7867310989734738,
2321
+ "rewards/margins": 4.6428056641392255,
2322
+ "rewards/rejected": -3.8560745651657515,
2323
+ "step": 1780
2324
+ },
2325
+ {
2326
+ "epoch": 0.9401260504201681,
2327
+ "grad_norm": 57.685751276785766,
2328
+ "kl": 0.0,
2329
+ "learning_rate": 4.409655866252693e-09,
2330
+ "logps/chosen": -284.08316022398844,
2331
+ "logps/rejected": -311.69903273809524,
2332
+ "loss": 0.2987,
2333
+ "rewards/chosen": 1.1329683513310604,
2334
+ "rewards/margins": 5.054558935807038,
2335
+ "rewards/rejected": -3.921590584475978,
2336
+ "step": 1790
2337
+ },
2338
+ {
2339
+ "epoch": 0.9453781512605042,
2340
+ "grad_norm": 64.21944849726562,
2341
+ "kl": 0.0,
2342
+ "learning_rate": 3.671779440125644e-09,
2343
+ "logps/chosen": -296.63870919585986,
2344
+ "logps/rejected": -292.5702645705521,
2345
+ "loss": 0.282,
2346
+ "rewards/chosen": 1.5888889580015924,
2347
+ "rewards/margins": 5.482801244116431,
2348
+ "rewards/rejected": -3.8939122861148387,
2349
+ "step": 1800
2350
+ },
2351
+ {
2352
+ "epoch": 0.9506302521008403,
2353
+ "grad_norm": 120.91274575574961,
2354
+ "kl": 0.0,
2355
+ "learning_rate": 3.000964036942305e-09,
2356
+ "logps/chosen": -286.46690883757964,
2357
+ "logps/rejected": -307.89421970858893,
2358
+ "loss": 0.3187,
2359
+ "rewards/chosen": 0.656261176820014,
2360
+ "rewards/margins": 5.1038937616551365,
2361
+ "rewards/rejected": -4.447632584835123,
2362
+ "step": 1810
2363
+ },
2364
+ {
2365
+ "epoch": 0.9558823529411765,
2366
+ "grad_norm": 36.13196162142592,
2367
+ "kl": 0.0,
2368
+ "learning_rate": 2.397392281198729e-09,
2369
+ "logps/chosen": -302.49788306451615,
2370
+ "logps/rejected": -277.52104640151515,
2371
+ "loss": 0.2937,
2372
+ "rewards/chosen": 0.9888374574722782,
2373
+ "rewards/margins": 5.233212605436293,
2374
+ "rewards/rejected": -4.244375147964015,
2375
+ "step": 1820
2376
+ },
2377
+ {
2378
+ "epoch": 0.9611344537815126,
2379
+ "grad_norm": 28.996205179253028,
2380
+ "kl": 0.0,
2381
+ "learning_rate": 1.861228490808886e-09,
2382
+ "logps/chosen": -293.10886452414775,
2383
+ "logps/rejected": -297.74403211805554,
2384
+ "loss": 0.2997,
2385
+ "rewards/chosen": 0.9712606776844371,
2386
+ "rewards/margins": 5.202846710128014,
2387
+ "rewards/rejected": -4.231586032443577,
2388
+ "step": 1830
2389
+ },
2390
+ {
2391
+ "epoch": 0.9663865546218487,
2392
+ "grad_norm": 45.07816045720644,
2393
+ "kl": 0.0,
2394
+ "learning_rate": 1.3926186323703903e-09,
2395
+ "logps/chosen": -278.5950362042683,
2396
+ "logps/rejected": -270.61345653044873,
2397
+ "loss": 0.3545,
2398
+ "rewards/chosen": 0.5840071236214986,
2399
+ "rewards/margins": 3.794906672274343,
2400
+ "rewards/rejected": -3.2108995486528444,
2401
+ "step": 1840
2402
+ },
2403
+ {
2404
+ "epoch": 0.9716386554621849,
2405
+ "grad_norm": 37.183516112349096,
2406
+ "kl": 0.0,
2407
+ "learning_rate": 9.916902814261774e-10,
2408
+ "logps/chosen": -289.34506048387095,
2409
+ "logps/rejected": -296.7537168560606,
2410
+ "loss": 0.2669,
2411
+ "rewards/chosen": 1.187039283014113,
2412
+ "rewards/margins": 5.528245041773583,
2413
+ "rewards/rejected": -4.34120575875947,
2414
+ "step": 1850
2415
+ },
2416
+ {
2417
+ "epoch": 0.976890756302521,
2418
+ "grad_norm": 99.37300393766564,
2419
+ "kl": 0.0,
2420
+ "learning_rate": 6.585525877328968e-10,
2421
+ "logps/chosen": -291.4291068412162,
2422
+ "logps/rejected": -333.1322901526163,
2423
+ "loss": 0.3206,
2424
+ "rewards/chosen": 0.7791987238703547,
2425
+ "rewards/margins": 4.7422238391274405,
2426
+ "rewards/rejected": -3.9630251152570857,
2427
+ "step": 1860
2428
+ },
2429
+ {
2430
+ "epoch": 0.9821428571428571,
2431
+ "grad_norm": 91.23926744066455,
2432
+ "kl": 0.0,
2433
+ "learning_rate": 3.9329624554584883e-10,
2434
+ "logps/chosen": -284.56466490963857,
2435
+ "logps/rejected": -287.7052049512987,
2436
+ "loss": 0.3167,
2437
+ "rewards/chosen": 0.6357683848185711,
2438
+ "rewards/margins": 4.381204223394058,
2439
+ "rewards/rejected": -3.745435838575487,
2440
+ "step": 1870
2441
+ },
2442
+ {
2443
+ "epoch": 0.9873949579831933,
2444
+ "grad_norm": 56.11511499932997,
2445
+ "kl": 0.0,
2446
+ "learning_rate": 1.959934689280962e-10,
2447
+ "logps/chosen": -295.48760695684524,
2448
+ "logps/rejected": -301.8758994654605,
2449
+ "loss": 0.2857,
2450
+ "rewards/chosen": 1.2652909415108817,
2451
+ "rewards/margins": 6.0682341209927895,
2452
+ "rewards/rejected": -4.8029431794819075,
2453
+ "step": 1880
2454
+ },
2455
+ {
2456
+ "epoch": 0.9926470588235294,
2457
+ "grad_norm": 71.18992657480682,
2458
+ "kl": 0.0,
2459
+ "learning_rate": 6.669797209069017e-11,
2460
+ "logps/chosen": -269.54678721910113,
2461
+ "logps/rejected": -283.6238996478873,
2462
+ "loss": 0.3009,
2463
+ "rewards/chosen": 0.8121567629696278,
2464
+ "rewards/margins": 5.090732821370816,
2465
+ "rewards/rejected": -4.278576058401188,
2466
+ "step": 1890
2467
+ },
2468
+ {
2469
+ "epoch": 0.9978991596638656,
2470
+ "grad_norm": 25.085484740800712,
2471
+ "kl": 0.0,
2472
+ "learning_rate": 5.444954769395771e-12,
2473
+ "logps/chosen": -256.12128784937886,
2474
+ "logps/rejected": -292.2530709512579,
2475
+ "loss": 0.2815,
2476
+ "rewards/chosen": 1.2973005401421778,
2477
+ "rewards/margins": 5.830910950052752,
2478
+ "rewards/rejected": -4.533610409910574,
2479
+ "step": 1900
2480
+ },
2481
+ {
2482
+ "epoch": 1.0,
2483
+ "step": 1904,
2484
+ "total_flos": 0.0,
2485
+ "train_loss": 0.321955375749023,
2486
+ "train_runtime": 31031.977,
2487
+ "train_samples_per_second": 1.963,
2488
+ "train_steps_per_second": 0.061
2489
+ }
2490
+ ],
2491
+ "logging_steps": 10,
2492
+ "max_steps": 1904,
2493
+ "num_input_tokens_seen": 0,
2494
+ "num_train_epochs": 1,
2495
+ "save_steps": 500,
2496
+ "stateful_callbacks": {
2497
+ "TrainerControl": {
2498
+ "args": {
2499
+ "should_epoch_stop": false,
2500
+ "should_evaluate": false,
2501
+ "should_log": false,
2502
+ "should_save": true,
2503
+ "should_training_stop": true
2504
+ },
2505
+ "attributes": {}
2506
+ }
2507
+ },
2508
+ "total_flos": 0.0,
2509
+ "train_batch_size": 4,
2510
+ "trial_name": null,
2511
+ "trial_params": null
2512
+ }