yiran-wang3 committed on
Commit
7e74294
1 Parent(s): 2509e9d

End of training

Browse files
Files changed (6) hide show
  1. README.md +64 -0
  2. all_results.json +9 -0
  3. config.json +1 -1
  4. generation_config.json +14 -0
  5. train_results.json +9 -0
  6. trainer_state.json +1050 -0
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: yiran-wang3/qwen2_chat_adamw_iter1
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/qw2_sppo_hard_new_cn_mining_oj_iter1-binarized
12
+ model-index:
13
+ - name: qwen2_chat_adamw_iter2
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # qwen2_chat_adamw_iter2
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/qwen2_chat_adamw_iter1](https://huggingface.co/yiran-wang3/qwen2_chat_adamw_iter1) on the self-generate/qw2_sppo_hard_new_cn_mining_oj_iter1-binarized dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4953512450059255,
5
+ "train_runtime": 163.6148,
6
+ "train_samples": 3053,
7
+ "train_samples_per_second": 18.66,
8
+ "train_steps_per_second": 0.293
9
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.45.0"
14
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4953512450059255,
5
+ "train_runtime": 163.6148,
6
+ "train_samples": 3053,
7
+ "train_samples_per_second": 18.66,
8
+ "train_steps_per_second": 0.293
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1050 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 48,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -1.8909083604812622,
13
+ "debug/policy_chosen_logps": -212.1804656982422,
14
+ "debug/policy_rejected_logits": -1.8466837406158447,
15
+ "debug/policy_rejected_logps": -221.20199584960938,
16
+ "debug/reference_chosen_logps": -212.1804656982422,
17
+ "debug/reference_rejected_logps": -221.20199584960938,
18
+ "epoch": 0.020833333333333332,
19
+ "grad_norm": 4.55949998556895,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -1.8909083604812622,
22
+ "logits/rejected": -1.8466837406158447,
23
+ "logps/chosen": -212.1804656982422,
24
+ "logps/rejected": -221.20199584960938,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -1.9734070301055908,
34
+ "debug/policy_chosen_logps": -230.3494873046875,
35
+ "debug/policy_rejected_logits": -1.894794225692749,
36
+ "debug/policy_rejected_logps": -232.613037109375,
37
+ "debug/reference_chosen_logps": -230.78518676757812,
38
+ "debug/reference_rejected_logps": -232.9464111328125,
39
+ "epoch": 0.041666666666666664,
40
+ "grad_norm": 5.028939569239188,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -1.9734070301055908,
43
+ "logits/rejected": -1.894794225692749,
44
+ "logps/chosen": -230.3494873046875,
45
+ "logps/rejected": -232.613037109375,
46
+ "loss": 0.5001,
47
+ "rewards/accuracies": 0.5,
48
+ "rewards/chosen": 0.004357052035629749,
49
+ "rewards/margins": 0.0010233304928988218,
50
+ "rewards/rejected": 0.00333372107706964,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -1.796450138092041,
55
+ "debug/policy_chosen_logps": -239.1052703857422,
56
+ "debug/policy_rejected_logits": -1.9075357913970947,
57
+ "debug/policy_rejected_logps": -226.32025146484375,
58
+ "debug/reference_chosen_logps": -238.9079132080078,
59
+ "debug/reference_rejected_logps": -225.81983947753906,
60
+ "epoch": 0.0625,
61
+ "grad_norm": 5.159311600496707,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -1.796450138092041,
64
+ "logits/rejected": -1.9075357913970947,
65
+ "logps/chosen": -239.1052703857422,
66
+ "logps/rejected": -226.32025146484375,
67
+ "loss": 0.4992,
68
+ "rewards/accuracies": 0.625,
69
+ "rewards/chosen": -0.0019736099056899548,
70
+ "rewards/margins": 0.0030304910615086555,
71
+ "rewards/rejected": -0.005004100501537323,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -1.8530775308609009,
76
+ "debug/policy_chosen_logps": -224.97164916992188,
77
+ "debug/policy_rejected_logits": -1.868254542350769,
78
+ "debug/policy_rejected_logps": -193.08578491210938,
79
+ "debug/reference_chosen_logps": -225.149658203125,
80
+ "debug/reference_rejected_logps": -193.382568359375,
81
+ "epoch": 0.08333333333333333,
82
+ "grad_norm": 5.291981074658711,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -1.8530775308609009,
85
+ "logits/rejected": -1.868254542350769,
86
+ "logps/chosen": -224.97164916992188,
87
+ "logps/rejected": -193.08578491210938,
88
+ "loss": 0.5002,
89
+ "rewards/accuracies": 0.375,
90
+ "rewards/chosen": 0.0017800141358748078,
91
+ "rewards/margins": -0.0011878202203661203,
92
+ "rewards/rejected": 0.00296783447265625,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -2.037583827972412,
97
+ "debug/policy_chosen_logps": -218.92698669433594,
98
+ "debug/policy_rejected_logits": -1.9175313711166382,
99
+ "debug/policy_rejected_logps": -226.9515838623047,
100
+ "debug/reference_chosen_logps": -218.6461181640625,
101
+ "debug/reference_rejected_logps": -226.9630126953125,
102
+ "epoch": 0.10416666666666667,
103
+ "grad_norm": 4.634912199648336,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -2.037583827972412,
106
+ "logits/rejected": -1.9175313711166382,
107
+ "logps/chosen": -218.92698669433594,
108
+ "logps/rejected": -226.9515838623047,
109
+ "loss": 0.5003,
110
+ "rewards/accuracies": 0.375,
111
+ "rewards/chosen": -0.0028084944933652878,
112
+ "rewards/margins": -0.002922859275713563,
113
+ "rewards/rejected": 0.00011436472414061427,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -1.9980738162994385,
118
+ "debug/policy_chosen_logps": -229.96798706054688,
119
+ "debug/policy_rejected_logits": -2.106717586517334,
120
+ "debug/policy_rejected_logps": -228.21139526367188,
121
+ "debug/reference_chosen_logps": -229.47543334960938,
122
+ "debug/reference_rejected_logps": -227.33944702148438,
123
+ "epoch": 0.125,
124
+ "grad_norm": 4.883441388138732,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -1.9980738162994385,
127
+ "logits/rejected": -2.106717586517334,
128
+ "logps/chosen": -229.96798706054688,
129
+ "logps/rejected": -228.21139526367188,
130
+ "loss": 0.4985,
131
+ "rewards/accuracies": 0.625,
132
+ "rewards/chosen": -0.004925765562802553,
133
+ "rewards/margins": 0.0037936782464385033,
134
+ "rewards/rejected": -0.008719444274902344,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -1.831208348274231,
139
+ "debug/policy_chosen_logps": -209.08120727539062,
140
+ "debug/policy_rejected_logits": -1.866153597831726,
141
+ "debug/policy_rejected_logps": -212.32781982421875,
142
+ "debug/reference_chosen_logps": -209.07412719726562,
143
+ "debug/reference_rejected_logps": -212.41383361816406,
144
+ "epoch": 0.14583333333333334,
145
+ "grad_norm": 4.5645610444456395,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -1.831208348274231,
148
+ "logits/rejected": -1.866153597831726,
149
+ "logps/chosen": -209.08120727539062,
150
+ "logps/rejected": -212.32781982421875,
151
+ "loss": 0.4994,
152
+ "rewards/accuracies": 0.25,
153
+ "rewards/chosen": -7.074361201375723e-05,
154
+ "rewards/margins": -0.0009309577289968729,
155
+ "rewards/rejected": 0.0008602142333984375,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -2.0019192695617676,
160
+ "debug/policy_chosen_logps": -194.42999267578125,
161
+ "debug/policy_rejected_logits": -1.971666932106018,
162
+ "debug/policy_rejected_logps": -202.69728088378906,
163
+ "debug/reference_chosen_logps": -194.51901245117188,
164
+ "debug/reference_rejected_logps": -202.4325714111328,
165
+ "epoch": 0.16666666666666666,
166
+ "grad_norm": 4.5561230493648965,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -2.0019192695617676,
169
+ "logits/rejected": -1.971666932106018,
170
+ "logps/chosen": -194.42999267578125,
171
+ "logps/rejected": -202.69728088378906,
172
+ "loss": 0.4992,
173
+ "rewards/accuracies": 0.75,
174
+ "rewards/chosen": 0.0008901978144422174,
175
+ "rewards/margins": 0.0035373116843402386,
176
+ "rewards/rejected": -0.0026471137534826994,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -1.9558801651000977,
181
+ "debug/policy_chosen_logps": -215.09481811523438,
182
+ "debug/policy_rejected_logits": -1.905698537826538,
183
+ "debug/policy_rejected_logps": -221.3304443359375,
184
+ "debug/reference_chosen_logps": -215.267578125,
185
+ "debug/reference_rejected_logps": -221.17218017578125,
186
+ "epoch": 0.1875,
187
+ "grad_norm": 4.561516825339906,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -1.9558801651000977,
190
+ "logits/rejected": -1.905698537826538,
191
+ "logps/chosen": -215.09481811523438,
192
+ "logps/rejected": -221.3304443359375,
193
+ "loss": 0.4993,
194
+ "rewards/accuracies": 0.375,
195
+ "rewards/chosen": 0.0017275046557188034,
196
+ "rewards/margins": 0.0033102035522460938,
197
+ "rewards/rejected": -0.0015826987801119685,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -2.034991979598999,
202
+ "debug/policy_chosen_logps": -203.59085083007812,
203
+ "debug/policy_rejected_logits": -2.0764079093933105,
204
+ "debug/policy_rejected_logps": -204.25738525390625,
205
+ "debug/reference_chosen_logps": -204.2655029296875,
206
+ "debug/reference_rejected_logps": -204.46153259277344,
207
+ "epoch": 0.20833333333333334,
208
+ "grad_norm": 4.745773144075817,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -2.034991979598999,
211
+ "logits/rejected": -2.0764079093933105,
212
+ "logps/chosen": -203.59085083007812,
213
+ "logps/rejected": -204.25738525390625,
214
+ "loss": 0.4984,
215
+ "rewards/accuracies": 0.75,
216
+ "rewards/chosen": 0.006746310740709305,
217
+ "rewards/margins": 0.004704780410975218,
218
+ "rewards/rejected": 0.0020415307953953743,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -2.0038836002349854,
223
+ "debug/policy_chosen_logps": -226.81753540039062,
224
+ "debug/policy_rejected_logits": -1.8038012981414795,
225
+ "debug/policy_rejected_logps": -226.14935302734375,
226
+ "debug/reference_chosen_logps": -226.28543090820312,
227
+ "debug/reference_rejected_logps": -225.2156524658203,
228
+ "epoch": 0.22916666666666666,
229
+ "grad_norm": 4.763690551977785,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -2.0038836002349854,
232
+ "logits/rejected": -1.8038012981414795,
233
+ "logps/chosen": -226.81753540039062,
234
+ "logps/rejected": -226.14935302734375,
235
+ "loss": 0.4984,
236
+ "rewards/accuracies": 0.5,
237
+ "rewards/chosen": -0.005321083124727011,
238
+ "rewards/margins": 0.004015827551484108,
239
+ "rewards/rejected": -0.009336910210549831,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -2.122854471206665,
244
+ "debug/policy_chosen_logps": -204.9606170654297,
245
+ "debug/policy_rejected_logits": -1.9377713203430176,
246
+ "debug/policy_rejected_logps": -229.38241577148438,
247
+ "debug/reference_chosen_logps": -204.8526153564453,
248
+ "debug/reference_rejected_logps": -229.0899658203125,
249
+ "epoch": 0.25,
250
+ "grad_norm": 4.606159450019782,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -2.122854471206665,
253
+ "logits/rejected": -1.9377713203430176,
254
+ "logps/chosen": -204.9606170654297,
255
+ "logps/rejected": -229.38241577148438,
256
+ "loss": 0.5,
257
+ "rewards/accuracies": 0.625,
258
+ "rewards/chosen": -0.0010799788869917393,
259
+ "rewards/margins": 0.0018445015884935856,
260
+ "rewards/rejected": -0.002924480475485325,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -1.9475512504577637,
265
+ "debug/policy_chosen_logps": -224.90866088867188,
266
+ "debug/policy_rejected_logits": -1.981605887413025,
267
+ "debug/policy_rejected_logps": -228.95736694335938,
268
+ "debug/reference_chosen_logps": -225.68130493164062,
269
+ "debug/reference_rejected_logps": -229.091796875,
270
+ "epoch": 0.2708333333333333,
271
+ "grad_norm": 5.058425556791009,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -1.9475512504577637,
274
+ "logits/rejected": -1.981605887413025,
275
+ "logps/chosen": -224.90866088867188,
276
+ "logps/rejected": -228.95736694335938,
277
+ "loss": 0.4975,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": 0.007726612035185099,
280
+ "rewards/margins": 0.00638235080987215,
281
+ "rewards/rejected": 0.0013442612253129482,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -1.999975323677063,
286
+ "debug/policy_chosen_logps": -209.15884399414062,
287
+ "debug/policy_rejected_logits": -1.9890738725662231,
288
+ "debug/policy_rejected_logps": -207.29666137695312,
289
+ "debug/reference_chosen_logps": -209.07809448242188,
290
+ "debug/reference_rejected_logps": -207.4446563720703,
291
+ "epoch": 0.2916666666666667,
292
+ "grad_norm": 4.7138136584768215,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -1.999975323677063,
295
+ "logits/rejected": -1.9890738725662231,
296
+ "logps/chosen": -209.15884399414062,
297
+ "logps/rejected": -207.29666137695312,
298
+ "loss": 0.4986,
299
+ "rewards/accuracies": 0.25,
300
+ "rewards/chosen": -0.0008074190700426698,
301
+ "rewards/margins": -0.002287311712279916,
302
+ "rewards/rejected": 0.001479892642237246,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -1.8115614652633667,
307
+ "debug/policy_chosen_logps": -241.42343139648438,
308
+ "debug/policy_rejected_logits": -1.799264669418335,
309
+ "debug/policy_rejected_logps": -215.36915588378906,
310
+ "debug/reference_chosen_logps": -242.1216583251953,
311
+ "debug/reference_rejected_logps": -215.05712890625,
312
+ "epoch": 0.3125,
313
+ "grad_norm": 4.724225993954909,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -1.8115614652633667,
316
+ "logits/rejected": -1.799264669418335,
317
+ "logps/chosen": -241.42343139648438,
318
+ "logps/rejected": -215.36915588378906,
319
+ "loss": 0.4959,
320
+ "rewards/accuracies": 0.625,
321
+ "rewards/chosen": 0.006982230581343174,
322
+ "rewards/margins": 0.010102405212819576,
323
+ "rewards/rejected": -0.0031201746314764023,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -1.9611539840698242,
328
+ "debug/policy_chosen_logps": -240.25247192382812,
329
+ "debug/policy_rejected_logits": -1.8923048973083496,
330
+ "debug/policy_rejected_logps": -247.1781768798828,
331
+ "debug/reference_chosen_logps": -241.02410888671875,
332
+ "debug/reference_rejected_logps": -248.17608642578125,
333
+ "epoch": 0.3333333333333333,
334
+ "grad_norm": 4.859194726511771,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -1.9611539840698242,
337
+ "logits/rejected": -1.8923048973083496,
338
+ "logps/chosen": -240.25247192382812,
339
+ "logps/rejected": -247.1781768798828,
340
+ "loss": 0.5015,
341
+ "rewards/accuracies": 0.375,
342
+ "rewards/chosen": 0.0077165220864117146,
343
+ "rewards/margins": -0.002262687310576439,
344
+ "rewards/rejected": 0.00997920986264944,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -1.8419935703277588,
349
+ "debug/policy_chosen_logps": -225.85903930664062,
350
+ "debug/policy_rejected_logits": -1.8397587537765503,
351
+ "debug/policy_rejected_logps": -222.993896484375,
352
+ "debug/reference_chosen_logps": -225.80227661132812,
353
+ "debug/reference_rejected_logps": -223.05838012695312,
354
+ "epoch": 0.3541666666666667,
355
+ "grad_norm": 5.459841357543238,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -1.8419935703277588,
358
+ "logits/rejected": -1.8397587537765503,
359
+ "logps/chosen": -225.85903930664062,
360
+ "logps/rejected": -222.993896484375,
361
+ "loss": 0.4973,
362
+ "rewards/accuracies": 0.25,
363
+ "rewards/chosen": -0.0005677794106304646,
364
+ "rewards/margins": -0.001212749513797462,
365
+ "rewards/rejected": 0.000644969753921032,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -2.0776429176330566,
370
+ "debug/policy_chosen_logps": -198.11862182617188,
371
+ "debug/policy_rejected_logits": -1.911252498626709,
372
+ "debug/policy_rejected_logps": -210.159912109375,
373
+ "debug/reference_chosen_logps": -198.45901489257812,
374
+ "debug/reference_rejected_logps": -209.61679077148438,
375
+ "epoch": 0.375,
376
+ "grad_norm": 5.052192403611287,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -2.0776429176330566,
379
+ "logits/rejected": -1.911252498626709,
380
+ "logps/chosen": -198.11862182617188,
381
+ "logps/rejected": -210.159912109375,
382
+ "loss": 0.4972,
383
+ "rewards/accuracies": 0.875,
384
+ "rewards/chosen": 0.003403835231438279,
385
+ "rewards/margins": 0.008835029788315296,
386
+ "rewards/rejected": -0.005431194789707661,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -1.9367341995239258,
391
+ "debug/policy_chosen_logps": -199.481689453125,
392
+ "debug/policy_rejected_logits": -2.1376593112945557,
393
+ "debug/policy_rejected_logps": -220.9750213623047,
394
+ "debug/reference_chosen_logps": -198.0703582763672,
395
+ "debug/reference_rejected_logps": -220.8565673828125,
396
+ "epoch": 0.3958333333333333,
397
+ "grad_norm": 4.498097056137636,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -1.9367341995239258,
400
+ "logits/rejected": -2.1376593112945557,
401
+ "logps/chosen": -199.481689453125,
402
+ "logps/rejected": -220.9750213623047,
403
+ "loss": 0.5032,
404
+ "rewards/accuracies": 0.25,
405
+ "rewards/chosen": -0.014113139361143112,
406
+ "rewards/margins": -0.012928619049489498,
407
+ "rewards/rejected": -0.0011845207773149014,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -1.8367891311645508,
412
+ "debug/policy_chosen_logps": -212.05157470703125,
413
+ "debug/policy_rejected_logits": -1.9387035369873047,
414
+ "debug/policy_rejected_logps": -215.82733154296875,
415
+ "debug/reference_chosen_logps": -210.82493591308594,
416
+ "debug/reference_rejected_logps": -215.45745849609375,
417
+ "epoch": 0.4166666666666667,
418
+ "grad_norm": 5.189713253568091,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -1.8367891311645508,
421
+ "logits/rejected": -1.9387035369873047,
422
+ "logps/chosen": -212.05157470703125,
423
+ "logps/rejected": -215.82733154296875,
424
+ "loss": 0.4931,
425
+ "rewards/accuracies": 0.25,
426
+ "rewards/chosen": -0.012266368605196476,
427
+ "rewards/margins": -0.008567637763917446,
428
+ "rewards/rejected": -0.0036987303756177425,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -1.8324953317642212,
433
+ "debug/policy_chosen_logps": -218.19522094726562,
434
+ "debug/policy_rejected_logits": -1.8325927257537842,
435
+ "debug/policy_rejected_logps": -196.0030975341797,
436
+ "debug/reference_chosen_logps": -218.56109619140625,
437
+ "debug/reference_rejected_logps": -196.50845336914062,
438
+ "epoch": 0.4375,
439
+ "grad_norm": 4.589785119981397,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -1.8324953317642212,
442
+ "logits/rejected": -1.8325927257537842,
443
+ "logps/chosen": -218.19522094726562,
444
+ "logps/rejected": -196.0030975341797,
445
+ "loss": 0.4983,
446
+ "rewards/accuracies": 0.375,
447
+ "rewards/chosen": 0.0036588667426258326,
448
+ "rewards/margins": -0.0013946916442364454,
449
+ "rewards/rejected": 0.005053558386862278,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -1.8342565298080444,
454
+ "debug/policy_chosen_logps": -204.41470336914062,
455
+ "debug/policy_rejected_logits": -1.7877732515335083,
456
+ "debug/policy_rejected_logps": -209.70840454101562,
457
+ "debug/reference_chosen_logps": -204.43324279785156,
458
+ "debug/reference_rejected_logps": -209.82937622070312,
459
+ "epoch": 0.4583333333333333,
460
+ "grad_norm": 5.153849908641546,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -1.8342565298080444,
463
+ "logits/rejected": -1.7877732515335083,
464
+ "logps/chosen": -204.41470336914062,
465
+ "logps/rejected": -209.70840454101562,
466
+ "loss": 0.4949,
467
+ "rewards/accuracies": 0.625,
468
+ "rewards/chosen": 0.00018556579016149044,
469
+ "rewards/margins": -0.001024017110466957,
470
+ "rewards/rejected": 0.0012095831334590912,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -1.8171206712722778,
475
+ "debug/policy_chosen_logps": -236.4231719970703,
476
+ "debug/policy_rejected_logits": -1.7470709085464478,
477
+ "debug/policy_rejected_logps": -222.30374145507812,
478
+ "debug/reference_chosen_logps": -236.17730712890625,
479
+ "debug/reference_rejected_logps": -221.56448364257812,
480
+ "epoch": 0.4791666666666667,
481
+ "grad_norm": 4.893815400071578,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -1.8171206712722778,
484
+ "logits/rejected": -1.7470709085464478,
485
+ "logps/chosen": -236.4231719970703,
486
+ "logps/rejected": -222.30374145507812,
487
+ "loss": 0.4916,
488
+ "rewards/accuracies": 0.5,
489
+ "rewards/chosen": -0.00245870603248477,
490
+ "rewards/margins": 0.004933852702379227,
491
+ "rewards/rejected": -0.007392558269202709,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -1.6100187301635742,
496
+ "debug/policy_chosen_logps": -212.08888244628906,
497
+ "debug/policy_rejected_logits": -1.6458077430725098,
498
+ "debug/policy_rejected_logps": -224.05459594726562,
499
+ "debug/reference_chosen_logps": -211.93698120117188,
500
+ "debug/reference_rejected_logps": -224.69906616210938,
501
+ "epoch": 0.5,
502
+ "grad_norm": 4.528676055698226,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -1.6100187301635742,
505
+ "logits/rejected": -1.6458077430725098,
506
+ "logps/chosen": -212.08888244628906,
507
+ "logps/rejected": -224.05459594726562,
508
+ "loss": 0.4958,
509
+ "rewards/accuracies": 0.375,
510
+ "rewards/chosen": -0.0015190127305686474,
511
+ "rewards/margins": -0.007963847368955612,
512
+ "rewards/rejected": 0.006444835104048252,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -1.8963773250579834,
517
+ "debug/policy_chosen_logps": -210.84144592285156,
518
+ "debug/policy_rejected_logits": -1.88288414478302,
519
+ "debug/policy_rejected_logps": -208.75941467285156,
520
+ "debug/reference_chosen_logps": -210.99363708496094,
521
+ "debug/reference_rejected_logps": -208.8118896484375,
522
+ "epoch": 0.5208333333333334,
523
+ "grad_norm": 4.253747413467878,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -1.8963773250579834,
526
+ "logits/rejected": -1.88288414478302,
527
+ "logps/chosen": -210.84144592285156,
528
+ "logps/rejected": -208.75941467285156,
529
+ "loss": 0.4986,
530
+ "rewards/accuracies": 0.625,
531
+ "rewards/chosen": 0.0015218928456306458,
532
+ "rewards/margins": 0.0009972000261768699,
533
+ "rewards/rejected": 0.0005246927030384541,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -1.7630257606506348,
538
+ "debug/policy_chosen_logps": -226.61837768554688,
539
+ "debug/policy_rejected_logits": -1.7200900316238403,
540
+ "debug/policy_rejected_logps": -244.78564453125,
541
+ "debug/reference_chosen_logps": -226.70632934570312,
542
+ "debug/reference_rejected_logps": -244.5872802734375,
543
+ "epoch": 0.5416666666666666,
544
+ "grad_norm": 4.3073478263433875,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -1.7630257606506348,
547
+ "logits/rejected": -1.7200900316238403,
548
+ "logps/chosen": -226.61837768554688,
549
+ "logps/rejected": -244.78564453125,
550
+ "loss": 0.4977,
551
+ "rewards/accuracies": 0.5,
552
+ "rewards/chosen": 0.0008793829474598169,
553
+ "rewards/margins": 0.002862987108528614,
554
+ "rewards/rejected": -0.001983604161068797,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -1.8077607154846191,
559
+ "debug/policy_chosen_logps": -211.0703887939453,
560
+ "debug/policy_rejected_logits": -1.873244047164917,
561
+ "debug/policy_rejected_logps": -235.58494567871094,
562
+ "debug/reference_chosen_logps": -212.21527099609375,
563
+ "debug/reference_rejected_logps": -235.125732421875,
564
+ "epoch": 0.5625,
565
+ "grad_norm": 4.735343462350552,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -1.8077607154846191,
568
+ "logits/rejected": -1.873244047164917,
569
+ "logps/chosen": -211.0703887939453,
570
+ "logps/rejected": -235.58494567871094,
571
+ "loss": 0.4979,
572
+ "rewards/accuracies": 0.75,
573
+ "rewards/chosen": 0.011448878794908524,
574
+ "rewards/margins": 0.01604101061820984,
575
+ "rewards/rejected": -0.00459213275462389,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -1.9979571104049683,
580
+ "debug/policy_chosen_logps": -209.10702514648438,
581
+ "debug/policy_rejected_logits": -1.9711703062057495,
582
+ "debug/policy_rejected_logps": -210.96670532226562,
583
+ "debug/reference_chosen_logps": -209.4556427001953,
584
+ "debug/reference_rejected_logps": -210.02169799804688,
585
+ "epoch": 0.5833333333333334,
586
+ "grad_norm": 4.6588487462454395,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -1.9979571104049683,
589
+ "logits/rejected": -1.9711703062057495,
590
+ "logps/chosen": -209.10702514648438,
591
+ "logps/rejected": -210.96670532226562,
592
+ "loss": 0.4941,
593
+ "rewards/accuracies": 0.625,
594
+ "rewards/chosen": 0.003486290108412504,
595
+ "rewards/margins": 0.012936287559568882,
596
+ "rewards/rejected": -0.00944999698549509,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -1.8544397354125977,
601
+ "debug/policy_chosen_logps": -221.3443145751953,
602
+ "debug/policy_rejected_logits": -1.9452507495880127,
603
+ "debug/policy_rejected_logps": -210.4943084716797,
604
+ "debug/reference_chosen_logps": -221.66323852539062,
605
+ "debug/reference_rejected_logps": -209.6286163330078,
606
+ "epoch": 0.6041666666666666,
607
+ "grad_norm": 4.357025587882443,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -1.8544397354125977,
610
+ "logits/rejected": -1.9452507495880127,
611
+ "logps/chosen": -221.3443145751953,
612
+ "logps/rejected": -210.4943084716797,
613
+ "loss": 0.4991,
614
+ "rewards/accuracies": 0.625,
615
+ "rewards/chosen": 0.0031892964616417885,
616
+ "rewards/margins": 0.011846140958368778,
617
+ "rewards/rejected": -0.008656845428049564,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -1.8338147401809692,
622
+ "debug/policy_chosen_logps": -198.18841552734375,
623
+ "debug/policy_rejected_logits": -1.7472094297409058,
624
+ "debug/policy_rejected_logps": -218.80093383789062,
625
+ "debug/reference_chosen_logps": -198.03680419921875,
626
+ "debug/reference_rejected_logps": -219.40707397460938,
627
+ "epoch": 0.625,
628
+ "grad_norm": 4.842664247824887,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -1.8338147401809692,
631
+ "logits/rejected": -1.7472094297409058,
632
+ "logps/chosen": -198.18841552734375,
633
+ "logps/rejected": -218.80093383789062,
634
+ "loss": 0.4952,
635
+ "rewards/accuracies": 0.25,
636
+ "rewards/chosen": -0.0015161894261837006,
637
+ "rewards/margins": -0.007577533833682537,
638
+ "rewards/rejected": 0.0060613444074988365,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -1.8182415962219238,
643
+ "debug/policy_chosen_logps": -206.81643676757812,
644
+ "debug/policy_rejected_logits": -1.8333243131637573,
645
+ "debug/policy_rejected_logps": -210.55618286132812,
646
+ "debug/reference_chosen_logps": -205.9772186279297,
647
+ "debug/reference_rejected_logps": -209.88775634765625,
648
+ "epoch": 0.6458333333333334,
649
+ "grad_norm": 4.918727552016112,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -1.8182415962219238,
652
+ "logits/rejected": -1.8333243131637573,
653
+ "logps/chosen": -206.81643676757812,
654
+ "logps/rejected": -210.55618286132812,
655
+ "loss": 0.4881,
656
+ "rewards/accuracies": 0.625,
657
+ "rewards/chosen": -0.008392143063247204,
658
+ "rewards/margins": -0.0017078209202736616,
659
+ "rewards/rejected": -0.0066843219101428986,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -1.8787778615951538,
664
+ "debug/policy_chosen_logps": -206.3116455078125,
665
+ "debug/policy_rejected_logits": -1.8249777555465698,
666
+ "debug/policy_rejected_logps": -215.7191162109375,
667
+ "debug/reference_chosen_logps": -206.0584259033203,
668
+ "debug/reference_rejected_logps": -215.76406860351562,
669
+ "epoch": 0.6666666666666666,
670
+ "grad_norm": 4.531359648788432,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -1.8787778615951538,
673
+ "logits/rejected": -1.8249777555465698,
674
+ "logps/chosen": -206.3116455078125,
675
+ "logps/rejected": -215.7191162109375,
676
+ "loss": 0.4943,
677
+ "rewards/accuracies": 0.5,
678
+ "rewards/chosen": -0.002532253274694085,
679
+ "rewards/margins": -0.0029817770700901747,
680
+ "rewards/rejected": 0.0004495240282267332,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -1.9307353496551514,
685
+ "debug/policy_chosen_logps": -210.0487823486328,
686
+ "debug/policy_rejected_logits": -1.7820299863815308,
687
+ "debug/policy_rejected_logps": -219.03224182128906,
688
+ "debug/reference_chosen_logps": -209.83810424804688,
689
+ "debug/reference_rejected_logps": -218.5866241455078,
690
+ "epoch": 0.6875,
691
+ "grad_norm": 4.504314802671299,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -1.9307353496551514,
694
+ "logits/rejected": -1.7820299863815308,
695
+ "logps/chosen": -210.0487823486328,
696
+ "logps/rejected": -219.03224182128906,
697
+ "loss": 0.4976,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": -0.002106723375618458,
700
+ "rewards/margins": 0.0023493007756769657,
701
+ "rewards/rejected": -0.004456023685634136,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -1.823900580406189,
706
+ "debug/policy_chosen_logps": -199.96127319335938,
707
+ "debug/policy_rejected_logits": -1.7855464220046997,
708
+ "debug/policy_rejected_logps": -215.19229125976562,
709
+ "debug/reference_chosen_logps": -199.58285522460938,
710
+ "debug/reference_rejected_logps": -214.38479614257812,
711
+ "epoch": 0.7083333333333334,
712
+ "grad_norm": 4.457022242154747,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -1.823900580406189,
715
+ "logits/rejected": -1.7855464220046997,
716
+ "logps/chosen": -199.96127319335938,
717
+ "logps/rejected": -215.19229125976562,
718
+ "loss": 0.4971,
719
+ "rewards/accuracies": 0.625,
720
+ "rewards/chosen": -0.003784370142966509,
721
+ "rewards/margins": 0.004290657117962837,
722
+ "rewards/rejected": -0.008075027726590633,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -1.882658839225769,
727
+ "debug/policy_chosen_logps": -209.23643493652344,
728
+ "debug/policy_rejected_logits": -1.8747351169586182,
729
+ "debug/policy_rejected_logps": -215.92868041992188,
730
+ "debug/reference_chosen_logps": -210.23593139648438,
731
+ "debug/reference_rejected_logps": -217.186279296875,
732
+ "epoch": 0.7291666666666666,
733
+ "grad_norm": 4.626965309640125,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -1.882658839225769,
736
+ "logits/rejected": -1.8747351169586182,
737
+ "logps/chosen": -209.23643493652344,
738
+ "logps/rejected": -215.92868041992188,
739
+ "loss": 0.4994,
740
+ "rewards/accuracies": 0.5,
741
+ "rewards/chosen": 0.009994887746870518,
742
+ "rewards/margins": -0.0025810815859586,
743
+ "rewards/rejected": 0.012575969099998474,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -1.8005609512329102,
748
+ "debug/policy_chosen_logps": -225.9169158935547,
749
+ "debug/policy_rejected_logits": -1.883096694946289,
750
+ "debug/policy_rejected_logps": -234.09506225585938,
751
+ "debug/reference_chosen_logps": -227.3267822265625,
752
+ "debug/reference_rejected_logps": -234.53903198242188,
753
+ "epoch": 0.75,
754
+ "grad_norm": 4.3931691354690825,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -1.8005609512329102,
757
+ "logits/rejected": -1.883096694946289,
758
+ "logps/chosen": -225.9169158935547,
759
+ "logps/rejected": -234.09506225585938,
760
+ "loss": 0.491,
761
+ "rewards/accuracies": 0.25,
762
+ "rewards/chosen": 0.014098738320171833,
763
+ "rewards/margins": 0.009659002535045147,
764
+ "rewards/rejected": 0.004439735319465399,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -1.7069189548492432,
769
+ "debug/policy_chosen_logps": -228.86813354492188,
770
+ "debug/policy_rejected_logits": -1.6827794313430786,
771
+ "debug/policy_rejected_logps": -214.09796142578125,
772
+ "debug/reference_chosen_logps": -227.4351806640625,
773
+ "debug/reference_rejected_logps": -213.31690979003906,
774
+ "epoch": 0.7708333333333334,
775
+ "grad_norm": 4.654373119163593,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -1.7069189548492432,
778
+ "logits/rejected": -1.6827794313430786,
779
+ "logps/chosen": -228.86813354492188,
780
+ "logps/rejected": -214.09796142578125,
781
+ "loss": 0.4902,
782
+ "rewards/accuracies": 0.25,
783
+ "rewards/chosen": -0.014329585246741772,
784
+ "rewards/margins": -0.00651891715824604,
785
+ "rewards/rejected": -0.007810668554157019,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -1.840372085571289,
790
+ "debug/policy_chosen_logps": -230.10861206054688,
791
+ "debug/policy_rejected_logits": -1.7545675039291382,
792
+ "debug/policy_rejected_logps": -260.3162841796875,
793
+ "debug/reference_chosen_logps": -230.72247314453125,
794
+ "debug/reference_rejected_logps": -257.7795715332031,
795
+ "epoch": 0.7916666666666666,
796
+ "grad_norm": 4.825975520263496,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -1.840372085571289,
799
+ "logits/rejected": -1.7545675039291382,
800
+ "logps/chosen": -230.10861206054688,
801
+ "logps/rejected": -260.3162841796875,
802
+ "loss": 0.4896,
803
+ "rewards/accuracies": 0.875,
804
+ "rewards/chosen": 0.0061386870220303535,
805
+ "rewards/margins": 0.03150550648570061,
806
+ "rewards/rejected": -0.02536682039499283,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -1.920078992843628,
811
+ "debug/policy_chosen_logps": -223.3726348876953,
812
+ "debug/policy_rejected_logits": -1.828634262084961,
813
+ "debug/policy_rejected_logps": -215.68679809570312,
814
+ "debug/reference_chosen_logps": -225.49134826660156,
815
+ "debug/reference_rejected_logps": -216.0248260498047,
816
+ "epoch": 0.8125,
817
+ "grad_norm": 4.550119735116361,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": -1.920078992843628,
820
+ "logits/rejected": -1.828634262084961,
821
+ "logps/chosen": -223.3726348876953,
822
+ "logps/rejected": -215.68679809570312,
823
+ "loss": 0.4972,
824
+ "rewards/accuracies": 0.625,
825
+ "rewards/chosen": 0.0211871899664402,
826
+ "rewards/margins": 0.01780683360993862,
827
+ "rewards/rejected": 0.0033803561236709356,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": -1.9407376050949097,
832
+ "debug/policy_chosen_logps": -215.61834716796875,
833
+ "debug/policy_rejected_logits": -1.7760881185531616,
834
+ "debug/policy_rejected_logps": -239.66421508789062,
835
+ "debug/reference_chosen_logps": -216.16978454589844,
836
+ "debug/reference_rejected_logps": -234.6765594482422,
837
+ "epoch": 0.8333333333333334,
838
+ "grad_norm": 4.585659415126642,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": -1.9407376050949097,
841
+ "logits/rejected": -1.7760881185531616,
842
+ "logps/chosen": -215.61834716796875,
843
+ "logps/rejected": -239.66421508789062,
844
+ "loss": 0.4894,
845
+ "rewards/accuracies": 0.75,
846
+ "rewards/chosen": 0.0055142780765891075,
847
+ "rewards/margins": 0.055390775203704834,
848
+ "rewards/rejected": -0.04987649619579315,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": -1.8515745401382446,
853
+ "debug/policy_chosen_logps": -230.47845458984375,
854
+ "debug/policy_rejected_logits": -1.8017698526382446,
855
+ "debug/policy_rejected_logps": -220.0799560546875,
856
+ "debug/reference_chosen_logps": -231.57496643066406,
857
+ "debug/reference_rejected_logps": -220.48355102539062,
858
+ "epoch": 0.8541666666666666,
859
+ "grad_norm": 4.376154994824471,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": -1.8515745401382446,
862
+ "logits/rejected": -1.8017698526382446,
863
+ "logps/chosen": -230.47845458984375,
864
+ "logps/rejected": -220.0799560546875,
865
+ "loss": 0.4876,
866
+ "rewards/accuracies": 0.625,
867
+ "rewards/chosen": 0.010965080000460148,
868
+ "rewards/margins": 0.006929206661880016,
869
+ "rewards/rejected": 0.0040358733385801315,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": -2.073969602584839,
874
+ "debug/policy_chosen_logps": -197.85983276367188,
875
+ "debug/policy_rejected_logits": -1.9918216466903687,
876
+ "debug/policy_rejected_logps": -218.21873474121094,
877
+ "debug/reference_chosen_logps": -199.05039978027344,
878
+ "debug/reference_rejected_logps": -215.7158660888672,
879
+ "epoch": 0.875,
880
+ "grad_norm": 4.549372793147523,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": -2.073969602584839,
883
+ "logits/rejected": -1.9918216466903687,
884
+ "logps/chosen": -197.85983276367188,
885
+ "logps/rejected": -218.21873474121094,
886
+ "loss": 0.4785,
887
+ "rewards/accuracies": 1.0,
888
+ "rewards/chosen": 0.011905612424015999,
889
+ "rewards/margins": 0.036934297531843185,
890
+ "rewards/rejected": -0.025028685107827187,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": -1.8891502618789673,
895
+ "debug/policy_chosen_logps": -212.4984588623047,
896
+ "debug/policy_rejected_logits": -1.7939965724945068,
897
+ "debug/policy_rejected_logps": -205.75616455078125,
898
+ "debug/reference_chosen_logps": -214.7135772705078,
899
+ "debug/reference_rejected_logps": -203.42559814453125,
900
+ "epoch": 0.8958333333333334,
901
+ "grad_norm": 4.709082311105444,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": -1.8891502618789673,
904
+ "logits/rejected": -1.7939965724945068,
905
+ "logps/chosen": -212.4984588623047,
906
+ "logps/rejected": -205.75616455078125,
907
+ "loss": 0.4908,
908
+ "rewards/accuracies": 0.875,
909
+ "rewards/chosen": 0.022151164710521698,
910
+ "rewards/margins": 0.04545694589614868,
911
+ "rewards/rejected": -0.023305777460336685,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": -1.713548183441162,
916
+ "debug/policy_chosen_logps": -215.9876708984375,
917
+ "debug/policy_rejected_logits": -1.6541578769683838,
918
+ "debug/policy_rejected_logps": -211.833984375,
919
+ "debug/reference_chosen_logps": -216.26913452148438,
920
+ "debug/reference_rejected_logps": -211.96865844726562,
921
+ "epoch": 0.9166666666666666,
922
+ "grad_norm": 4.592980550179187,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": -1.713548183441162,
925
+ "logits/rejected": -1.6541578769683838,
926
+ "logps/chosen": -215.9876708984375,
927
+ "logps/rejected": -211.833984375,
928
+ "loss": 0.4974,
929
+ "rewards/accuracies": 0.5,
930
+ "rewards/chosen": 0.0028144647367298603,
931
+ "rewards/margins": 0.00146764749661088,
932
+ "rewards/rejected": 0.0013468170072883368,
933
+ "step": 44
934
+ },
935
+ {
936
+ "debug/policy_chosen_logits": -1.9016904830932617,
937
+ "debug/policy_chosen_logps": -224.2040557861328,
938
+ "debug/policy_rejected_logits": -2.0025641918182373,
939
+ "debug/policy_rejected_logps": -205.37611389160156,
940
+ "debug/reference_chosen_logps": -224.86119079589844,
941
+ "debug/reference_rejected_logps": -206.6584014892578,
942
+ "epoch": 0.9375,
943
+ "grad_norm": 4.466979083240116,
944
+ "learning_rate": 1e-06,
945
+ "logits/chosen": -1.9016904830932617,
946
+ "logits/rejected": -2.0025641918182373,
947
+ "logps/chosen": -224.2040557861328,
948
+ "logps/rejected": -205.37611389160156,
949
+ "loss": 0.4873,
950
+ "rewards/accuracies": 0.5,
951
+ "rewards/chosen": 0.006571331061422825,
952
+ "rewards/margins": -0.006251506507396698,
953
+ "rewards/rejected": 0.012822837568819523,
954
+ "step": 45
955
+ },
956
+ {
957
+ "debug/policy_chosen_logits": -1.9147862195968628,
958
+ "debug/policy_chosen_logps": -214.40841674804688,
959
+ "debug/policy_rejected_logits": -1.7642629146575928,
960
+ "debug/policy_rejected_logps": -226.85809326171875,
961
+ "debug/reference_chosen_logps": -216.05572509765625,
962
+ "debug/reference_rejected_logps": -224.88555908203125,
963
+ "epoch": 0.9583333333333334,
964
+ "grad_norm": 4.657365424136277,
965
+ "learning_rate": 1e-06,
966
+ "logits/chosen": -1.9147862195968628,
967
+ "logits/rejected": -1.7642629146575928,
968
+ "logps/chosen": -214.40841674804688,
969
+ "logps/rejected": -226.85809326171875,
970
+ "loss": 0.4846,
971
+ "rewards/accuracies": 0.75,
972
+ "rewards/chosen": 0.016473084688186646,
973
+ "rewards/margins": 0.03619840741157532,
974
+ "rewards/rejected": -0.019725322723388672,
975
+ "step": 46
976
+ },
977
+ {
978
+ "debug/policy_chosen_logits": -1.9227019548416138,
979
+ "debug/policy_chosen_logps": -232.88064575195312,
980
+ "debug/policy_rejected_logits": -1.82795250415802,
981
+ "debug/policy_rejected_logps": -216.19277954101562,
982
+ "debug/reference_chosen_logps": -235.4039764404297,
983
+ "debug/reference_rejected_logps": -218.12701416015625,
984
+ "epoch": 0.9791666666666666,
985
+ "grad_norm": 4.981963754300823,
986
+ "learning_rate": 1e-06,
987
+ "logits/chosen": -1.9227019548416138,
988
+ "logits/rejected": -1.82795250415802,
989
+ "logps/chosen": -232.88064575195312,
990
+ "logps/rejected": -216.19277954101562,
991
+ "loss": 0.4908,
992
+ "rewards/accuracies": 0.625,
993
+ "rewards/chosen": 0.025233382359147072,
994
+ "rewards/margins": 0.0058908844366669655,
995
+ "rewards/rejected": 0.019342496991157532,
996
+ "step": 47
997
+ },
998
+ {
999
+ "debug/policy_chosen_logits": -1.723406434059143,
1000
+ "debug/policy_chosen_logps": -218.03060913085938,
1001
+ "debug/policy_rejected_logits": -1.8165022134780884,
1002
+ "debug/policy_rejected_logps": -218.10299682617188,
1003
+ "debug/reference_chosen_logps": -217.13656616210938,
1004
+ "debug/reference_rejected_logps": -216.82008361816406,
1005
+ "epoch": 1.0,
1006
+ "grad_norm": 4.726122116654148,
1007
+ "learning_rate": 1e-06,
1008
+ "logits/chosen": -1.723406434059143,
1009
+ "logits/rejected": -1.8165022134780884,
1010
+ "logps/chosen": -218.03060913085938,
1011
+ "logps/rejected": -218.10299682617188,
1012
+ "loss": 0.4855,
1013
+ "rewards/accuracies": 0.5,
1014
+ "rewards/chosen": -0.008940430358052254,
1015
+ "rewards/margins": 0.003888798877596855,
1016
+ "rewards/rejected": -0.01282922737300396,
1017
+ "step": 48
1018
+ },
1019
+ {
1020
+ "epoch": 1.0,
1021
+ "step": 48,
1022
+ "total_flos": 0.0,
1023
+ "train_loss": 0.4953512450059255,
1024
+ "train_runtime": 163.6148,
1025
+ "train_samples_per_second": 18.66,
1026
+ "train_steps_per_second": 0.293
1027
+ }
1028
+ ],
1029
+ "logging_steps": 1,
1030
+ "max_steps": 48,
1031
+ "num_input_tokens_seen": 0,
1032
+ "num_train_epochs": 1,
1033
+ "save_steps": 500,
1034
+ "stateful_callbacks": {
1035
+ "TrainerControl": {
1036
+ "args": {
1037
+ "should_epoch_stop": false,
1038
+ "should_evaluate": false,
1039
+ "should_log": false,
1040
+ "should_save": true,
1041
+ "should_training_stop": true
1042
+ },
1043
+ "attributes": {}
1044
+ }
1045
+ },
1046
+ "total_flos": 0.0,
1047
+ "train_batch_size": 8,
1048
+ "trial_name": null,
1049
+ "trial_params": null
1050
+ }