yiran-wang3 committed on
Commit
a57f6a4
1 parent: 0eaa932

End of training

README.md ADDED
@@ -0,0 +1,64 @@
+ ---
+ library_name: transformers
+ license: other
+ base_model: yiran-wang3/ds_coder6.7b_reflct_adamw_iter3
+ tags:
+ - alignment-handbook
+ - generated_from_trainer
+ - trl
+ - dpo
+ datasets:
+ - self-generate/ds_coder6.7b_reflct_sppo_hard_new_cn_mining_oj_iter3-binarized-reflection-scored
+ model-index:
+ - name: ds_coder6.7b_reflct_adamw_iter4
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # ds_coder6.7b_reflct_adamw_iter4
+
+ This model is a fine-tuned version of [yiran-wang3/ds_coder6.7b_reflct_adamw_iter3](https://huggingface.co/yiran-wang3/ds_coder6.7b_reflct_adamw_iter3) on the self-generate/ds_coder6.7b_reflct_sppo_hard_new_cn_mining_oj_iter3-binarized-reflection-scored dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-06
+ - train_batch_size: 8
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: constant
+ - lr_scheduler_warmup_ratio: 0.1
+ - lr_scheduler_warmup_steps: 100
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - Transformers 4.45.0
+ - Pytorch 2.4.0+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.20.3
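The hyperparameters in the model card above describe a single-epoch DPO run with TRL on 8 GPUs. As a hedged illustration only — the actual training script, `beta`, and sequence-length settings are not recorded in this commit — they would map onto a `DPOConfig`/`DPOTrainer` setup roughly as follows; argument names follow recent `trl` releases and may differ in older ones, and the dataset split name is an assumption.

```python
# Hypothetical reconstruction of the training setup implied by the README
# hyperparameters. The real script is not part of this commit; beta, max
# lengths, and the dataset split below are assumptions, not recorded values.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

base = "yiran-wang3/ds_coder6.7b_reflct_adamw_iter3"
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16)

dataset = load_dataset(
    "self-generate/ds_coder6.7b_reflct_sppo_hard_new_cn_mining_oj_iter3-binarized-reflection-scored",
    split="train",  # assumed split name
)

# Values mirror the README: per-device batch size 8 on 8 GPUs gives the
# reported total train batch size of 64; the scheduler is constant, so the
# warmup settings are copied over only for completeness.
args = DPOConfig(
    output_dir="ds_coder6.7b_reflct_adamw_iter4",
    learning_rate=1e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=1.0,
    lr_scheduler_type="constant",
    warmup_ratio=0.1,
    warmup_steps=100,
    seed=42,
    bf16=True,
    logging_steps=1,
)

trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    processing_class=tokenizer,  # this keyword is `tokenizer` in older trl releases
)
trainer.train()
```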
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "epoch": 1.0,
+     "total_flos": 0.0,
+     "train_loss": 0.47520279742422555,
+     "train_runtime": 468.7452,
+     "train_samples": 2682,
+     "train_samples_per_second": 5.722,
+     "train_steps_per_second": 0.09
+ }
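The throughput figures in `all_results.json` are mutually consistent: 2682 samples over a 468.7 s run is roughly 5.72 samples per second, and at the total train batch size of 64 the epoch takes the 42 optimizer steps recorded in `trainer_state.json` (the last batch is partial). A quick check using only the numbers from this commit:

```python
# Cross-check the reported training throughput; every input comes from
# all_results.json, the README, and trainer_state.json in this commit.
train_runtime = 468.7452        # seconds
train_samples = 2682
total_train_batch_size = 64     # 8 per device x 8 GPUs
global_steps = 42               # global_step / max_steps in trainer_state.json

print(train_samples / train_runtime)   # ~5.722 samples/s, matches the report
print(global_steps / train_runtime)    # ~0.0896 steps/s, reported as 0.09
print(-(-train_samples // total_train_batch_size))  # ceil(2682 / 64) = 42 steps
```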
config.json CHANGED
@@ -29,6 +29,6 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.45.0",
-  "use_cache": false,
+  "use_cache": true,
   "vocab_size": 32256
 }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "_from_model_config": true,
+     "bos_token_id": 32013,
+     "eos_token_id": 32021,
+     "transformers_version": "4.45.0"
+ }
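`generation_config.json` pins the default `bos_token_id` (32013) and `eos_token_id` (32021) that `model.generate` will pick up, and the `config.json` change above re-enables `use_cache`. A minimal inference sketch for the checkpoint this commit finalizes; the repository id is inferred from the author and model name, and the prompt, device placement, and decoding settings are illustrative assumptions, not anything recorded in the diff:

```python
# Minimal inference sketch for the checkpoint added in this commit.
# Generation defaults (bos/eos ids) come from generation_config.json.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "yiran-wang3/ds_coder6.7b_reflct_adamw_iter4"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",
)

prompt = "Write a Python function that checks whether a string is a palindrome."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# use_cache is set back to true in config.json by this commit, so generate()
# reuses the KV cache during decoding.
output = model.generate(**inputs, max_new_tokens=256, do_sample=False)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```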
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "epoch": 1.0,
+     "total_flos": 0.0,
+     "train_loss": 0.47520279742422555,
+     "train_runtime": 468.7452,
+     "train_samples": 2682,
+     "train_samples_per_second": 5.722,
+     "train_steps_per_second": 0.09
+ }
trainer_state.json ADDED
@@ -0,0 +1,924 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 42,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": 1.534427523612976,
13
+ "debug/policy_chosen_logps": -246.16604614257812,
14
+ "debug/policy_rejected_logits": 1.8819605112075806,
15
+ "debug/policy_rejected_logps": -303.5055847167969,
16
+ "debug/reference_chosen_logps": -246.16604614257812,
17
+ "debug/reference_rejected_logps": -303.5055847167969,
18
+ "epoch": 0.023809523809523808,
19
+ "grad_norm": 4.998254371607607,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": 1.534427523612976,
22
+ "logits/rejected": 1.8819605112075806,
23
+ "logps/chosen": -246.16604614257812,
24
+ "logps/rejected": -303.5055847167969,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": 1.7019506692886353,
34
+ "debug/policy_chosen_logps": -239.2976837158203,
35
+ "debug/policy_rejected_logits": 1.9889742136001587,
36
+ "debug/policy_rejected_logps": -257.10430908203125,
37
+ "debug/reference_chosen_logps": -239.32846069335938,
38
+ "debug/reference_rejected_logps": -257.1052551269531,
39
+ "epoch": 0.047619047619047616,
40
+ "grad_norm": 3.709501541706415,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": 1.7019506692886353,
43
+ "logits/rejected": 1.9889742136001587,
44
+ "logps/chosen": -239.2976837158203,
45
+ "logps/rejected": -257.10430908203125,
46
+ "loss": 0.5007,
47
+ "rewards/accuracies": 0.5,
48
+ "rewards/chosen": 0.00030775077175348997,
49
+ "rewards/margins": 0.0002984236925840378,
50
+ "rewards/rejected": 9.32672992348671e-06,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": 2.0194900035858154,
55
+ "debug/policy_chosen_logps": -279.34771728515625,
56
+ "debug/policy_rejected_logits": 1.7646379470825195,
57
+ "debug/policy_rejected_logps": -324.56732177734375,
58
+ "debug/reference_chosen_logps": -278.9559631347656,
59
+ "debug/reference_rejected_logps": -324.5027160644531,
60
+ "epoch": 0.07142857142857142,
61
+ "grad_norm": 4.913166021048808,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": 2.0194900035858154,
64
+ "logits/rejected": 1.7646379470825195,
65
+ "logps/chosen": -279.34771728515625,
66
+ "logps/rejected": -324.56732177734375,
67
+ "loss": 0.4996,
68
+ "rewards/accuracies": 0.25,
69
+ "rewards/chosen": -0.00391746498644352,
70
+ "rewards/margins": -0.003271312452852726,
71
+ "rewards/rejected": -0.0006461525335907936,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": 1.8845134973526,
76
+ "debug/policy_chosen_logps": -247.70022583007812,
77
+ "debug/policy_rejected_logits": 1.9669499397277832,
78
+ "debug/policy_rejected_logps": -284.1238098144531,
79
+ "debug/reference_chosen_logps": -247.67445373535156,
80
+ "debug/reference_rejected_logps": -284.0384216308594,
81
+ "epoch": 0.09523809523809523,
82
+ "grad_norm": 3.5714670065259737,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": 1.8845134973526,
85
+ "logits/rejected": 1.9669499397277832,
86
+ "logps/chosen": -247.70022583007812,
87
+ "logps/rejected": -284.1238098144531,
88
+ "loss": 0.4988,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": -0.0002577209670562297,
91
+ "rewards/margins": 0.0005962752620689571,
92
+ "rewards/rejected": -0.0008539962582290173,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": 1.6867436170578003,
97
+ "debug/policy_chosen_logps": -256.5872802734375,
98
+ "debug/policy_rejected_logits": 1.5141671895980835,
99
+ "debug/policy_rejected_logps": -298.8122253417969,
100
+ "debug/reference_chosen_logps": -255.80136108398438,
101
+ "debug/reference_rejected_logps": -298.3391418457031,
102
+ "epoch": 0.11904761904761904,
103
+ "grad_norm": 4.263705722231279,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": 1.6867436170578003,
106
+ "logits/rejected": 1.5141671895980835,
107
+ "logps/chosen": -256.5872802734375,
108
+ "logps/rejected": -298.8122253417969,
109
+ "loss": 0.4988,
110
+ "rewards/accuracies": 0.25,
111
+ "rewards/chosen": -0.007859057746827602,
112
+ "rewards/margins": -0.003128451993688941,
113
+ "rewards/rejected": -0.004730605985969305,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": 1.7323403358459473,
118
+ "debug/policy_chosen_logps": -258.45379638671875,
119
+ "debug/policy_rejected_logits": 1.657837152481079,
120
+ "debug/policy_rejected_logps": -318.19281005859375,
121
+ "debug/reference_chosen_logps": -257.7867736816406,
122
+ "debug/reference_rejected_logps": -316.5732116699219,
123
+ "epoch": 0.14285714285714285,
124
+ "grad_norm": 3.7713550317949176,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": 1.7323403358459473,
127
+ "logits/rejected": 1.657837152481079,
128
+ "logps/chosen": -258.45379638671875,
129
+ "logps/rejected": -318.19281005859375,
130
+ "loss": 0.4946,
131
+ "rewards/accuracies": 0.625,
132
+ "rewards/chosen": -0.006670188624411821,
133
+ "rewards/margins": 0.00952566135674715,
134
+ "rewards/rejected": -0.01619585044682026,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": 1.8151267766952515,
139
+ "debug/policy_chosen_logps": -263.91693115234375,
140
+ "debug/policy_rejected_logits": 1.8386905193328857,
141
+ "debug/policy_rejected_logps": -302.0845031738281,
142
+ "debug/reference_chosen_logps": -262.6131286621094,
143
+ "debug/reference_rejected_logps": -300.3056640625,
144
+ "epoch": 0.16666666666666666,
145
+ "grad_norm": 3.598363099027292,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": 1.8151267766952515,
148
+ "logits/rejected": 1.8386905193328857,
149
+ "logps/chosen": -263.91693115234375,
150
+ "logps/rejected": -302.0845031738281,
151
+ "loss": 0.4949,
152
+ "rewards/accuracies": 0.625,
153
+ "rewards/chosen": -0.01303796749562025,
154
+ "rewards/margins": 0.004750480409711599,
155
+ "rewards/rejected": -0.017788447439670563,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": 1.9484639167785645,
160
+ "debug/policy_chosen_logps": -267.802978515625,
161
+ "debug/policy_rejected_logits": 1.9488154649734497,
162
+ "debug/policy_rejected_logps": -383.6639404296875,
163
+ "debug/reference_chosen_logps": -266.5791320800781,
164
+ "debug/reference_rejected_logps": -381.67608642578125,
165
+ "epoch": 0.19047619047619047,
166
+ "grad_norm": 4.285563297670631,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": 1.9484639167785645,
169
+ "logits/rejected": 1.9488154649734497,
170
+ "logps/chosen": -267.802978515625,
171
+ "logps/rejected": -383.6639404296875,
172
+ "loss": 0.4938,
173
+ "rewards/accuracies": 0.625,
174
+ "rewards/chosen": -0.012238597497344017,
175
+ "rewards/margins": 0.007640190422534943,
176
+ "rewards/rejected": -0.01987878605723381,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": 2.099731683731079,
181
+ "debug/policy_chosen_logps": -273.102294921875,
182
+ "debug/policy_rejected_logits": 1.98048734664917,
183
+ "debug/policy_rejected_logps": -280.704345703125,
184
+ "debug/reference_chosen_logps": -272.2438049316406,
185
+ "debug/reference_rejected_logps": -279.1191101074219,
186
+ "epoch": 0.21428571428571427,
187
+ "grad_norm": 3.880226369464224,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": 2.099731683731079,
190
+ "logits/rejected": 1.98048734664917,
191
+ "logps/chosen": -273.102294921875,
192
+ "logps/rejected": -280.704345703125,
193
+ "loss": 0.4914,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": -0.008584880270063877,
196
+ "rewards/margins": 0.007267666049301624,
197
+ "rewards/rejected": -0.0158525463193655,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": 1.8367491960525513,
202
+ "debug/policy_chosen_logps": -256.5045471191406,
203
+ "debug/policy_rejected_logits": 1.9624578952789307,
204
+ "debug/policy_rejected_logps": -292.118408203125,
205
+ "debug/reference_chosen_logps": -255.32217407226562,
206
+ "debug/reference_rejected_logps": -290.0760498046875,
207
+ "epoch": 0.23809523809523808,
208
+ "grad_norm": 3.7444578408713256,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": 1.8367491960525513,
211
+ "logits/rejected": 1.9624578952789307,
212
+ "logps/chosen": -256.5045471191406,
213
+ "logps/rejected": -292.118408203125,
214
+ "loss": 0.4913,
215
+ "rewards/accuracies": 0.75,
216
+ "rewards/chosen": -0.011823710985481739,
217
+ "rewards/margins": 0.008599948137998581,
218
+ "rewards/rejected": -0.020423660054802895,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": 1.8572165966033936,
223
+ "debug/policy_chosen_logps": -271.85711669921875,
224
+ "debug/policy_rejected_logits": 1.9283175468444824,
225
+ "debug/policy_rejected_logps": -285.6045227050781,
226
+ "debug/reference_chosen_logps": -270.435791015625,
227
+ "debug/reference_rejected_logps": -285.2454528808594,
228
+ "epoch": 0.2619047619047619,
229
+ "grad_norm": 4.787673855426764,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": 1.8572165966033936,
232
+ "logits/rejected": 1.9283175468444824,
233
+ "logps/chosen": -271.85711669921875,
234
+ "logps/rejected": -285.6045227050781,
235
+ "loss": 0.489,
236
+ "rewards/accuracies": 0.375,
237
+ "rewards/chosen": -0.014213085174560547,
238
+ "rewards/margins": -0.010622329078614712,
239
+ "rewards/rejected": -0.0035907551646232605,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": 1.7663341760635376,
244
+ "debug/policy_chosen_logps": -238.7307586669922,
245
+ "debug/policy_rejected_logits": 1.7368437051773071,
246
+ "debug/policy_rejected_logps": -312.8492431640625,
247
+ "debug/reference_chosen_logps": -238.7390594482422,
248
+ "debug/reference_rejected_logps": -310.5390625,
249
+ "epoch": 0.2857142857142857,
250
+ "grad_norm": 3.289994710793894,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": 1.7663341760635376,
253
+ "logits/rejected": 1.7368437051773071,
254
+ "logps/chosen": -238.7307586669922,
255
+ "logps/rejected": -312.8492431640625,
256
+ "loss": 0.4929,
257
+ "rewards/accuracies": 0.875,
258
+ "rewards/chosen": 8.296966552734375e-05,
259
+ "rewards/margins": 0.02318466082215309,
260
+ "rewards/rejected": -0.023101691156625748,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": 1.7572966814041138,
265
+ "debug/policy_chosen_logps": -243.2532501220703,
266
+ "debug/policy_rejected_logits": 1.85104501247406,
267
+ "debug/policy_rejected_logps": -292.90924072265625,
268
+ "debug/reference_chosen_logps": -244.15411376953125,
269
+ "debug/reference_rejected_logps": -290.55877685546875,
270
+ "epoch": 0.30952380952380953,
271
+ "grad_norm": 3.5856538429745983,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": 1.7572966814041138,
274
+ "logits/rejected": 1.85104501247406,
275
+ "logps/chosen": -243.2532501220703,
276
+ "logps/rejected": -292.90924072265625,
277
+ "loss": 0.4863,
278
+ "rewards/accuracies": 0.75,
279
+ "rewards/chosen": 0.00900875125080347,
280
+ "rewards/margins": 0.03251304477453232,
281
+ "rewards/rejected": -0.023504294455051422,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": 1.8065857887268066,
286
+ "debug/policy_chosen_logps": -271.2432861328125,
287
+ "debug/policy_rejected_logits": 1.5463082790374756,
288
+ "debug/policy_rejected_logps": -267.1593017578125,
289
+ "debug/reference_chosen_logps": -271.75762939453125,
290
+ "debug/reference_rejected_logps": -266.90338134765625,
291
+ "epoch": 0.3333333333333333,
292
+ "grad_norm": 3.693572128354582,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": 1.8065857887268066,
295
+ "logits/rejected": 1.5463082790374756,
296
+ "logps/chosen": -271.2432861328125,
297
+ "logps/rejected": -267.1593017578125,
298
+ "loss": 0.4826,
299
+ "rewards/accuracies": 0.625,
300
+ "rewards/chosen": 0.0051437378861010075,
301
+ "rewards/margins": 0.007702922448515892,
302
+ "rewards/rejected": -0.0025591840967535973,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": 2.1449954509735107,
307
+ "debug/policy_chosen_logps": -281.3528137207031,
308
+ "debug/policy_rejected_logits": 2.0790512561798096,
309
+ "debug/policy_rejected_logps": -349.5384826660156,
310
+ "debug/reference_chosen_logps": -280.9048156738281,
311
+ "debug/reference_rejected_logps": -344.8339538574219,
312
+ "epoch": 0.35714285714285715,
313
+ "grad_norm": 3.73957295205859,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": 2.1449954509735107,
316
+ "logits/rejected": 2.0790512561798096,
317
+ "logps/chosen": -281.3528137207031,
318
+ "logps/rejected": -349.5384826660156,
319
+ "loss": 0.4835,
320
+ "rewards/accuracies": 0.75,
321
+ "rewards/chosen": -0.0044799428433179855,
322
+ "rewards/margins": 0.04256511479616165,
323
+ "rewards/rejected": -0.047045059502124786,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": 1.8549057245254517,
328
+ "debug/policy_chosen_logps": -253.8255615234375,
329
+ "debug/policy_rejected_logits": 1.7824431657791138,
330
+ "debug/policy_rejected_logps": -317.2090759277344,
331
+ "debug/reference_chosen_logps": -256.2930603027344,
332
+ "debug/reference_rejected_logps": -317.3314514160156,
333
+ "epoch": 0.38095238095238093,
334
+ "grad_norm": 3.6930417824241064,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": 1.8549057245254517,
337
+ "logits/rejected": 1.7824431657791138,
338
+ "logps/chosen": -253.8255615234375,
339
+ "logps/rejected": -317.2090759277344,
340
+ "loss": 0.4764,
341
+ "rewards/accuracies": 0.875,
342
+ "rewards/chosen": 0.024674739688634872,
343
+ "rewards/margins": 0.02345096506178379,
344
+ "rewards/rejected": 0.0012237741611897945,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": 1.888502836227417,
349
+ "debug/policy_chosen_logps": -235.820068359375,
350
+ "debug/policy_rejected_logits": 1.8494880199432373,
351
+ "debug/policy_rejected_logps": -259.95220947265625,
352
+ "debug/reference_chosen_logps": -239.1522216796875,
353
+ "debug/reference_rejected_logps": -261.6212463378906,
354
+ "epoch": 0.40476190476190477,
355
+ "grad_norm": 4.3518243358558815,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": 1.888502836227417,
358
+ "logits/rejected": 1.8494880199432373,
359
+ "logps/chosen": -235.820068359375,
360
+ "logps/rejected": -259.95220947265625,
361
+ "loss": 0.4664,
362
+ "rewards/accuracies": 0.75,
363
+ "rewards/chosen": 0.03332166746258736,
364
+ "rewards/margins": 0.01663154549896717,
365
+ "rewards/rejected": 0.016690120100975037,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": 1.8841761350631714,
370
+ "debug/policy_chosen_logps": -263.9659423828125,
371
+ "debug/policy_rejected_logits": 1.8606880903244019,
372
+ "debug/policy_rejected_logps": -283.3532409667969,
373
+ "debug/reference_chosen_logps": -267.3668212890625,
374
+ "debug/reference_rejected_logps": -283.70703125,
375
+ "epoch": 0.42857142857142855,
376
+ "grad_norm": 3.85077649264808,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": 1.8841761350631714,
379
+ "logits/rejected": 1.8606880903244019,
380
+ "logps/chosen": -263.9659423828125,
381
+ "logps/rejected": -283.3532409667969,
382
+ "loss": 0.472,
383
+ "rewards/accuracies": 0.625,
384
+ "rewards/chosen": 0.03400861471891403,
385
+ "rewards/margins": 0.03047073259949684,
386
+ "rewards/rejected": 0.0035378839820623398,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": 1.853868842124939,
391
+ "debug/policy_chosen_logps": -238.86099243164062,
392
+ "debug/policy_rejected_logits": 1.9001590013504028,
393
+ "debug/policy_rejected_logps": -275.0302734375,
394
+ "debug/reference_chosen_logps": -243.44891357421875,
395
+ "debug/reference_rejected_logps": -275.3963623046875,
396
+ "epoch": 0.4523809523809524,
397
+ "grad_norm": 4.61182292080891,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": 1.853868842124939,
400
+ "logits/rejected": 1.9001590013504028,
401
+ "logps/chosen": -238.86099243164062,
402
+ "logps/rejected": -275.0302734375,
403
+ "loss": 0.468,
404
+ "rewards/accuracies": 0.875,
405
+ "rewards/chosen": 0.04587903991341591,
406
+ "rewards/margins": 0.042218245565891266,
407
+ "rewards/rejected": 0.0036607934162020683,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": 1.7765668630599976,
412
+ "debug/policy_chosen_logps": -273.059326171875,
413
+ "debug/policy_rejected_logits": 1.9133604764938354,
414
+ "debug/policy_rejected_logps": -296.22113037109375,
415
+ "debug/reference_chosen_logps": -270.8135986328125,
416
+ "debug/reference_rejected_logps": -293.4261474609375,
417
+ "epoch": 0.47619047619047616,
418
+ "grad_norm": 4.414812759605763,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": 1.7765668630599976,
421
+ "logits/rejected": 1.9133604764938354,
422
+ "logps/chosen": -273.059326171875,
423
+ "logps/rejected": -296.22113037109375,
424
+ "loss": 0.4692,
425
+ "rewards/accuracies": 0.5,
426
+ "rewards/chosen": -0.022456951439380646,
427
+ "rewards/margins": 0.005493145436048508,
428
+ "rewards/rejected": -0.027950095012784004,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": 2.1494266986846924,
433
+ "debug/policy_chosen_logps": -308.71044921875,
434
+ "debug/policy_rejected_logits": 1.8783735036849976,
435
+ "debug/policy_rejected_logps": -316.08123779296875,
436
+ "debug/reference_chosen_logps": -306.241455078125,
437
+ "debug/reference_rejected_logps": -313.06842041015625,
438
+ "epoch": 0.5,
439
+ "grad_norm": 6.82008931679484,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": 2.1494266986846924,
442
+ "logits/rejected": 1.8783735036849976,
443
+ "logps/chosen": -308.71044921875,
444
+ "logps/rejected": -316.08123779296875,
445
+ "loss": 0.4732,
446
+ "rewards/accuracies": 0.5,
447
+ "rewards/chosen": -0.024689501151442528,
448
+ "rewards/margins": 0.005438690539449453,
449
+ "rewards/rejected": -0.03012819215655327,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": 1.6750694513320923,
454
+ "debug/policy_chosen_logps": -233.83047485351562,
455
+ "debug/policy_rejected_logits": 1.8117766380310059,
456
+ "debug/policy_rejected_logps": -300.7946472167969,
457
+ "debug/reference_chosen_logps": -237.87527465820312,
458
+ "debug/reference_rejected_logps": -298.6039733886719,
459
+ "epoch": 0.5238095238095238,
460
+ "grad_norm": 3.8318025594425853,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": 1.6750694513320923,
463
+ "logits/rejected": 1.8117766380310059,
464
+ "logps/chosen": -233.83047485351562,
465
+ "logps/rejected": -300.7946472167969,
466
+ "loss": 0.4695,
467
+ "rewards/accuracies": 0.75,
468
+ "rewards/chosen": 0.040447898209095,
469
+ "rewards/margins": 0.062354717403650284,
470
+ "rewards/rejected": -0.021906813606619835,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": 1.744894027709961,
475
+ "debug/policy_chosen_logps": -254.40310668945312,
476
+ "debug/policy_rejected_logits": 1.7900285720825195,
477
+ "debug/policy_rejected_logps": -284.26971435546875,
478
+ "debug/reference_chosen_logps": -257.9718933105469,
479
+ "debug/reference_rejected_logps": -283.98822021484375,
480
+ "epoch": 0.5476190476190477,
481
+ "grad_norm": 3.3732548227061403,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": 1.744894027709961,
484
+ "logits/rejected": 1.7900285720825195,
485
+ "logps/chosen": -254.40310668945312,
486
+ "logps/rejected": -284.26971435546875,
487
+ "loss": 0.4648,
488
+ "rewards/accuracies": 0.875,
489
+ "rewards/chosen": 0.0356878861784935,
490
+ "rewards/margins": 0.03850293904542923,
491
+ "rewards/rejected": -0.0028150551952421665,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": 1.7286827564239502,
496
+ "debug/policy_chosen_logps": -233.19664001464844,
497
+ "debug/policy_rejected_logits": 1.9754953384399414,
498
+ "debug/policy_rejected_logps": -311.8176574707031,
499
+ "debug/reference_chosen_logps": -234.96006774902344,
500
+ "debug/reference_rejected_logps": -306.427734375,
501
+ "epoch": 0.5714285714285714,
502
+ "grad_norm": 3.836706307908044,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": 1.7286827564239502,
505
+ "logits/rejected": 1.9754953384399414,
506
+ "logps/chosen": -233.19664001464844,
507
+ "logps/rejected": -311.8176574707031,
508
+ "loss": 0.4643,
509
+ "rewards/accuracies": 0.875,
510
+ "rewards/chosen": 0.017634237185120583,
511
+ "rewards/margins": 0.07153362035751343,
512
+ "rewards/rejected": -0.0538993775844574,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": 2.00459623336792,
517
+ "debug/policy_chosen_logps": -258.64495849609375,
518
+ "debug/policy_rejected_logits": 2.0122740268707275,
519
+ "debug/policy_rejected_logps": -321.5993957519531,
520
+ "debug/reference_chosen_logps": -261.73284912109375,
521
+ "debug/reference_rejected_logps": -313.8829650878906,
522
+ "epoch": 0.5952380952380952,
523
+ "grad_norm": 3.5765138124804974,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": 2.00459623336792,
526
+ "logits/rejected": 2.0122740268707275,
527
+ "logps/chosen": -258.64495849609375,
528
+ "logps/rejected": -321.5993957519531,
529
+ "loss": 0.4708,
530
+ "rewards/accuracies": 0.75,
531
+ "rewards/chosen": 0.03087867610156536,
532
+ "rewards/margins": 0.10804271697998047,
533
+ "rewards/rejected": -0.07716403901576996,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": 2.0935933589935303,
538
+ "debug/policy_chosen_logps": -291.4290466308594,
539
+ "debug/policy_rejected_logits": 2.211636543273926,
540
+ "debug/policy_rejected_logps": -322.15032958984375,
541
+ "debug/reference_chosen_logps": -290.2638854980469,
542
+ "debug/reference_rejected_logps": -320.8800048828125,
543
+ "epoch": 0.6190476190476191,
544
+ "grad_norm": 4.470923881317887,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": 2.0935933589935303,
547
+ "logits/rejected": 2.211636543273926,
548
+ "logps/chosen": -291.4290466308594,
549
+ "logps/rejected": -322.15032958984375,
550
+ "loss": 0.4664,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": -0.011651594191789627,
553
+ "rewards/margins": 0.0010515935719013214,
554
+ "rewards/rejected": -0.012703188695013523,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": 1.740605354309082,
559
+ "debug/policy_chosen_logps": -232.287841796875,
560
+ "debug/policy_rejected_logits": 1.9395997524261475,
561
+ "debug/policy_rejected_logps": -288.00982666015625,
562
+ "debug/reference_chosen_logps": -237.18685913085938,
563
+ "debug/reference_rejected_logps": -290.4499816894531,
564
+ "epoch": 0.6428571428571429,
565
+ "grad_norm": 3.5105118829949893,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": 1.740605354309082,
568
+ "logits/rejected": 1.9395997524261475,
569
+ "logps/chosen": -232.287841796875,
570
+ "logps/rejected": -288.00982666015625,
571
+ "loss": 0.4616,
572
+ "rewards/accuracies": 0.5,
573
+ "rewards/chosen": 0.04899021238088608,
574
+ "rewards/margins": 0.024588564410805702,
575
+ "rewards/rejected": 0.024401644244790077,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": 1.749884843826294,
580
+ "debug/policy_chosen_logps": -256.37811279296875,
581
+ "debug/policy_rejected_logits": 1.836501955986023,
582
+ "debug/policy_rejected_logps": -292.520751953125,
583
+ "debug/reference_chosen_logps": -258.786865234375,
584
+ "debug/reference_rejected_logps": -290.3285827636719,
585
+ "epoch": 0.6666666666666666,
586
+ "grad_norm": 3.9108599274749087,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": 1.749884843826294,
589
+ "logits/rejected": 1.836501955986023,
590
+ "logps/chosen": -256.37811279296875,
591
+ "logps/rejected": -292.520751953125,
592
+ "loss": 0.4599,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": 0.024087373167276382,
595
+ "rewards/margins": 0.0460088886320591,
596
+ "rewards/rejected": -0.021921521052718163,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": 1.9136433601379395,
601
+ "debug/policy_chosen_logps": -244.74368286132812,
602
+ "debug/policy_rejected_logits": 1.7372866868972778,
603
+ "debug/policy_rejected_logps": -231.080322265625,
604
+ "debug/reference_chosen_logps": -247.04966735839844,
605
+ "debug/reference_rejected_logps": -237.48455810546875,
606
+ "epoch": 0.6904761904761905,
607
+ "grad_norm": 3.4874983509246062,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": 1.9136433601379395,
610
+ "logits/rejected": 1.7372866868972778,
611
+ "logps/chosen": -244.74368286132812,
612
+ "logps/rejected": -231.080322265625,
613
+ "loss": 0.4688,
614
+ "rewards/accuracies": 0.375,
615
+ "rewards/chosen": 0.023059826344251633,
616
+ "rewards/margins": -0.040982604026794434,
617
+ "rewards/rejected": 0.06404243409633636,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": 1.843129277229309,
622
+ "debug/policy_chosen_logps": -253.27679443359375,
623
+ "debug/policy_rejected_logits": 1.9183768033981323,
624
+ "debug/policy_rejected_logps": -299.0466613769531,
625
+ "debug/reference_chosen_logps": -258.10870361328125,
626
+ "debug/reference_rejected_logps": -294.8966064453125,
627
+ "epoch": 0.7142857142857143,
628
+ "grad_norm": 3.6637277625193767,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": 1.843129277229309,
631
+ "logits/rejected": 1.9183768033981323,
632
+ "logps/chosen": -253.27679443359375,
633
+ "logps/rejected": -299.0466613769531,
634
+ "loss": 0.4543,
635
+ "rewards/accuracies": 0.5,
636
+ "rewards/chosen": 0.04831913113594055,
637
+ "rewards/margins": 0.08981965482234955,
638
+ "rewards/rejected": -0.041500527411699295,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": 1.9597934484481812,
643
+ "debug/policy_chosen_logps": -230.39987182617188,
644
+ "debug/policy_rejected_logits": 1.8873902559280396,
645
+ "debug/policy_rejected_logps": -306.35150146484375,
646
+ "debug/reference_chosen_logps": -236.17076110839844,
647
+ "debug/reference_rejected_logps": -293.26123046875,
648
+ "epoch": 0.7380952380952381,
649
+ "grad_norm": 4.055553525568484,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": 1.9597934484481812,
652
+ "logits/rejected": 1.8873902559280396,
653
+ "logps/chosen": -230.39987182617188,
654
+ "logps/rejected": -306.35150146484375,
655
+ "loss": 0.4559,
656
+ "rewards/accuracies": 0.75,
657
+ "rewards/chosen": 0.057708967477083206,
658
+ "rewards/margins": 0.1886114776134491,
659
+ "rewards/rejected": -0.1309025138616562,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": 1.5884038209915161,
664
+ "debug/policy_chosen_logps": -284.81689453125,
665
+ "debug/policy_rejected_logits": 1.5565730333328247,
666
+ "debug/policy_rejected_logps": -333.30059814453125,
667
+ "debug/reference_chosen_logps": -281.670166015625,
668
+ "debug/reference_rejected_logps": -328.1829528808594,
669
+ "epoch": 0.7619047619047619,
670
+ "grad_norm": 6.113113891791621,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": 1.5884038209915161,
673
+ "logits/rejected": 1.5565730333328247,
674
+ "logps/chosen": -284.81689453125,
675
+ "logps/rejected": -333.30059814453125,
676
+ "loss": 0.4904,
677
+ "rewards/accuracies": 0.75,
678
+ "rewards/chosen": -0.03146745637059212,
679
+ "rewards/margins": 0.01970868930220604,
680
+ "rewards/rejected": -0.051176149398088455,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": 1.8103567361831665,
685
+ "debug/policy_chosen_logps": -326.88555908203125,
686
+ "debug/policy_rejected_logits": 1.857008457183838,
687
+ "debug/policy_rejected_logps": -313.01300048828125,
688
+ "debug/reference_chosen_logps": -312.245849609375,
689
+ "debug/reference_rejected_logps": -309.3544921875,
690
+ "epoch": 0.7857142857142857,
691
+ "grad_norm": 5.345219796137713,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": 1.8103567361831665,
694
+ "logits/rejected": 1.857008457183838,
695
+ "logps/chosen": -326.88555908203125,
696
+ "logps/rejected": -313.01300048828125,
697
+ "loss": 0.4793,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": -0.14639724791049957,
700
+ "rewards/margins": -0.10981196165084839,
701
+ "rewards/rejected": -0.03658527135848999,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": 2.2111659049987793,
706
+ "debug/policy_chosen_logps": -270.5108642578125,
707
+ "debug/policy_rejected_logits": 2.1455953121185303,
708
+ "debug/policy_rejected_logps": -306.1610107421875,
709
+ "debug/reference_chosen_logps": -272.406982421875,
710
+ "debug/reference_rejected_logps": -304.00537109375,
711
+ "epoch": 0.8095238095238095,
712
+ "grad_norm": 4.096315029962809,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": 2.2111659049987793,
715
+ "logits/rejected": 2.1455953121185303,
716
+ "logps/chosen": -270.5108642578125,
717
+ "logps/rejected": -306.1610107421875,
718
+ "loss": 0.4473,
719
+ "rewards/accuracies": 0.5,
720
+ "rewards/chosen": 0.018961027264595032,
721
+ "rewards/margins": 0.04051744192838669,
722
+ "rewards/rejected": -0.021556414663791656,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": 1.6460078954696655,
727
+ "debug/policy_chosen_logps": -233.59458923339844,
728
+ "debug/policy_rejected_logits": 1.5349398851394653,
729
+ "debug/policy_rejected_logps": -266.34307861328125,
730
+ "debug/reference_chosen_logps": -242.21400451660156,
731
+ "debug/reference_rejected_logps": -272.995849609375,
732
+ "epoch": 0.8333333333333334,
733
+ "grad_norm": 3.974452548040394,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": 1.6460078954696655,
736
+ "logits/rejected": 1.5349398851394653,
737
+ "logps/chosen": -233.59458923339844,
738
+ "logps/rejected": -266.34307861328125,
739
+ "loss": 0.4712,
740
+ "rewards/accuracies": 0.75,
741
+ "rewards/chosen": 0.08619418740272522,
742
+ "rewards/margins": 0.019666405394673347,
743
+ "rewards/rejected": 0.06652778387069702,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": 1.9347574710845947,
748
+ "debug/policy_chosen_logps": -262.56744384765625,
749
+ "debug/policy_rejected_logits": 1.9021203517913818,
750
+ "debug/policy_rejected_logps": -330.6053466796875,
751
+ "debug/reference_chosen_logps": -268.55999755859375,
752
+ "debug/reference_rejected_logps": -324.86761474609375,
753
+ "epoch": 0.8571428571428571,
754
+ "grad_norm": 4.265267674670063,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": 1.9347574710845947,
757
+ "logits/rejected": 1.9021203517913818,
758
+ "logps/chosen": -262.56744384765625,
759
+ "logps/rejected": -330.6053466796875,
760
+ "loss": 0.48,
761
+ "rewards/accuracies": 0.875,
762
+ "rewards/chosen": 0.05992528796195984,
763
+ "rewards/margins": 0.1173023134469986,
764
+ "rewards/rejected": -0.057377032935619354,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": 1.627541422843933,
769
+ "debug/policy_chosen_logps": -271.72589111328125,
770
+ "debug/policy_rejected_logits": 1.6616926193237305,
771
+ "debug/policy_rejected_logps": -293.35400390625,
772
+ "debug/reference_chosen_logps": -279.6279296875,
773
+ "debug/reference_rejected_logps": -297.64208984375,
774
+ "epoch": 0.8809523809523809,
775
+ "grad_norm": 4.332181044842491,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": 1.627541422843933,
778
+ "logits/rejected": 1.6616926193237305,
779
+ "logps/chosen": -271.72589111328125,
780
+ "logps/rejected": -293.35400390625,
781
+ "loss": 0.4686,
782
+ "rewards/accuracies": 0.5,
783
+ "rewards/chosen": 0.07902045547962189,
784
+ "rewards/margins": 0.03613943234086037,
785
+ "rewards/rejected": 0.04288103058934212,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": 1.6821520328521729,
790
+ "debug/policy_chosen_logps": -265.2967834472656,
791
+ "debug/policy_rejected_logits": 2.1404964923858643,
792
+ "debug/policy_rejected_logps": -281.3368225097656,
793
+ "debug/reference_chosen_logps": -268.13934326171875,
794
+ "debug/reference_rejected_logps": -283.97998046875,
795
+ "epoch": 0.9047619047619048,
796
+ "grad_norm": 5.381447680651191,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": 1.6821520328521729,
799
+ "logits/rejected": 2.1404964923858643,
800
+ "logps/chosen": -265.2967834472656,
801
+ "logps/rejected": -281.3368225097656,
802
+ "loss": 0.479,
803
+ "rewards/accuracies": 0.5,
804
+ "rewards/chosen": 0.02842530980706215,
805
+ "rewards/margins": 0.0019937902688980103,
806
+ "rewards/rejected": 0.026431521400809288,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": 1.7588127851486206,
811
+ "debug/policy_chosen_logps": -255.28640747070312,
812
+ "debug/policy_rejected_logits": 1.9087820053100586,
813
+ "debug/policy_rejected_logps": -321.52362060546875,
814
+ "debug/reference_chosen_logps": -257.3486022949219,
815
+ "debug/reference_rejected_logps": -308.3670959472656,
816
+ "epoch": 0.9285714285714286,
817
+ "grad_norm": 3.886541281859103,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": 1.7588127851486206,
820
+ "logits/rejected": 1.9087820053100586,
821
+ "logps/chosen": -255.28640747070312,
822
+ "logps/rejected": -321.52362060546875,
823
+ "loss": 0.4353,
824
+ "rewards/accuracies": 0.75,
825
+ "rewards/chosen": 0.0206218920648098,
826
+ "rewards/margins": 0.15218740701675415,
827
+ "rewards/rejected": -0.13156552612781525,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": 1.9523510932922363,
832
+ "debug/policy_chosen_logps": -269.86749267578125,
833
+ "debug/policy_rejected_logits": 1.881919264793396,
834
+ "debug/policy_rejected_logps": -322.18280029296875,
835
+ "debug/reference_chosen_logps": -268.71282958984375,
836
+ "debug/reference_rejected_logps": -310.1279296875,
837
+ "epoch": 0.9523809523809523,
838
+ "grad_norm": 3.7307936710208236,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": 1.9523510932922363,
841
+ "logits/rejected": 1.881919264793396,
842
+ "logps/chosen": -269.86749267578125,
843
+ "logps/rejected": -322.18280029296875,
844
+ "loss": 0.4498,
845
+ "rewards/accuracies": 0.5,
846
+ "rewards/chosen": -0.011546440422534943,
847
+ "rewards/margins": 0.10900209844112396,
848
+ "rewards/rejected": -0.12054853141307831,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": 1.8507214784622192,
853
+ "debug/policy_chosen_logps": -251.6080322265625,
854
+ "debug/policy_rejected_logits": 1.9003260135650635,
855
+ "debug/policy_rejected_logps": -305.5479736328125,
856
+ "debug/reference_chosen_logps": -253.77499389648438,
857
+ "debug/reference_rejected_logps": -301.3538818359375,
858
+ "epoch": 0.9761904761904762,
859
+ "grad_norm": 4.724014063098657,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": 1.8507214784622192,
862
+ "logits/rejected": 1.9003260135650635,
863
+ "logps/chosen": -251.6080322265625,
864
+ "logps/rejected": -305.5479736328125,
865
+ "loss": 0.4489,
866
+ "rewards/accuracies": 0.875,
867
+ "rewards/chosen": 0.021669579669833183,
868
+ "rewards/margins": 0.0636105090379715,
869
+ "rewards/rejected": -0.04194093495607376,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": 2.058990955352783,
874
+ "debug/policy_chosen_logps": -246.43154907226562,
875
+ "debug/policy_rejected_logits": 2.1252870559692383,
876
+ "debug/policy_rejected_logps": -260.11871337890625,
877
+ "debug/reference_chosen_logps": -251.91116333007812,
878
+ "debug/reference_rejected_logps": -259.431884765625,
879
+ "epoch": 1.0,
880
+ "grad_norm": 4.008863560347053,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": 2.058990955352783,
883
+ "logits/rejected": 2.1252870559692383,
884
+ "logps/chosen": -246.43154907226562,
885
+ "logps/rejected": -260.11871337890625,
886
+ "loss": 0.449,
887
+ "rewards/accuracies": 0.625,
888
+ "rewards/chosen": 0.05479610711336136,
889
+ "rewards/margins": 0.061664048582315445,
890
+ "rewards/rejected": -0.006867942400276661,
891
+ "step": 42
892
+ },
893
+ {
894
+ "epoch": 1.0,
895
+ "step": 42,
896
+ "total_flos": 0.0,
897
+ "train_loss": 0.47520279742422555,
898
+ "train_runtime": 468.7452,
899
+ "train_samples_per_second": 5.722,
900
+ "train_steps_per_second": 0.09
901
+ }
902
+ ],
903
+ "logging_steps": 1,
904
+ "max_steps": 42,
905
+ "num_input_tokens_seen": 0,
906
+ "num_train_epochs": 1,
907
+ "save_steps": 500,
908
+ "stateful_callbacks": {
909
+ "TrainerControl": {
910
+ "args": {
911
+ "should_epoch_stop": false,
912
+ "should_evaluate": false,
913
+ "should_log": false,
914
+ "should_save": true,
915
+ "should_training_stop": true
916
+ },
917
+ "attributes": {}
918
+ }
919
+ },
920
+ "total_flos": 0.0,
921
+ "train_batch_size": 8,
922
+ "trial_name": null,
923
+ "trial_params": null
924
+ }
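The `log_history` in `trainer_state.json` above is the most informative artifact in the commit: it records per-step DPO loss, reward margins, and preference accuracies over the 42 optimizer steps. A small sketch for pulling those curves out once the file is downloaded locally (the local path is an assumption):

```python
# Summarize the per-step DPO metrics recorded in trainer_state.json.
# Assumes the file has been downloaded next to this script.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "step" not in entry or "loss" not in entry:
        continue  # skip the final aggregate record (it only has train_* keys)
    print(
        f"step {entry['step']:2d}  "
        f"loss {entry['loss']:.4f}  "
        f"margin {entry['rewards/margins']:+.4f}  "
        f"acc {entry['rewards/accuracies']:.3f}"
    )
```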