yiran-wang3 committed
Commit: bd4aa30
Parent: 694f076

End of training

README.md ADDED
@@ -0,0 +1,64 @@
+ ---
+ library_name: transformers
+ license: other
+ base_model: yiran-wang3/sigmoid_ds_chat_rmsprop_iter2
+ tags:
+ - alignment-handbook
+ - generated_from_trainer
+ - trl
+ - dpo
+ datasets:
+ - reflection-gen/ds_chat_rmsprop_iter2_sigmoid_cn_mining_oj_iter2-binarized
+ model-index:
+ - name: sigmoid_ds_chat_rmsprop_iter3
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # sigmoid_ds_chat_rmsprop_iter3
+
+ This model is a fine-tuned version of [yiran-wang3/sigmoid_ds_chat_rmsprop_iter2](https://huggingface.co/yiran-wang3/sigmoid_ds_chat_rmsprop_iter2) on the reflection-gen/ds_chat_rmsprop_iter2_sigmoid_cn_mining_oj_iter2-binarized dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-07
+ - train_batch_size: 8
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: constant
+ - lr_scheduler_warmup_ratio: 0.1
+ - lr_scheduler_warmup_steps: 100
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - Transformers 4.45.0
+ - Pytorch 2.4.0+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.20.3
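
Since the generated card stops at the framework versions, here is a minimal inference sketch for the new checkpoint. It is not part of the commit: the repo id `yiran-wang3/sigmoid_ds_chat_rmsprop_iter3` is assumed from the model-index name, and the chat-template call assumes the tokenizer inherited a template from the DeepSeek chat base; adjust either if they differ.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "yiran-wang3/sigmoid_ds_chat_rmsprop_iter3"  # assumed repo id, not stated in the card

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"  # device_map needs accelerate installed
)

messages = [{"role": "user", "content": "Write a function that reverses a linked list."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# With no overrides, generate() falls back to the sampling defaults committed in
# generation_config.json below (do_sample=True, temperature=0.7, top_p=0.95).
output = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```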
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 0.0,
+ "train_loss": 0.6828021520660037,
+ "train_runtime": 390.3052,
+ "train_samples": 2648,
+ "train_samples_per_second": 6.784,
+ "train_steps_per_second": 0.108
+ }
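
As a quick cross-check (my own arithmetic, not part of the commit), these throughput numbers are consistent with the hyperparameters in the README: 2648 samples at an effective batch size of 64 make 42 optimizer steps, and dividing samples and steps by the 390.3 s runtime reproduces the reported rates.

```python
import math

train_samples = 2648
total_train_batch_size = 64   # 8 per device x 8 GPUs, from the README
train_runtime = 390.3052      # seconds

steps = math.ceil(train_samples / total_train_batch_size)
print(steps)                                    # 42, matching global_step in trainer_state.json
print(round(train_samples / train_runtime, 3))  # 6.784 samples per second
print(round(steps / train_runtime, 3))          # 0.108 steps per second
```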
config.json CHANGED
@@ -25,6 +25,6 @@
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.0",
- "use_cache": false,
+ "use_cache": true,
  "vocab_size": 102400
  }
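
The only change to config.json is flipping `use_cache` back to `true`: the KV cache is normally switched off for training (it conflicts with gradient checkpointing and is unused in full-sequence forward passes) and re-enabled when the checkpoint is exported for inference. A sketch of doing the same toggle by hand, with the repo id assumed as above:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("yiran-wang3/sigmoid_ds_chat_rmsprop_iter3")  # assumed repo id
cfg.use_cache = False       # while fine-tuning, e.g. together with gradient checkpointing
# ... training happens here ...
cfg.use_cache = True        # restore the default before publishing the checkpoint
cfg.save_pretrained("./sigmoid_ds_chat_rmsprop_iter3")
```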
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 100000,
+ "do_sample": true,
+ "eos_token_id": 100001,
+ "temperature": 0.7,
+ "top_p": 0.95,
+ "transformers_version": "4.45.0"
+ }
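
These are the defaults `generate()` picks up when the caller passes no sampling arguments. They can also be inspected or overridden explicitly through `GenerationConfig`; a small sketch, again assuming the repo id:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("yiran-wang3/sigmoid_ds_chat_rmsprop_iter3")  # assumed repo id
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p)  # True 0.7 0.95

# Override the stored sampling defaults, e.g. with greedy decoding:
greedy = GenerationConfig(do_sample=False, max_new_tokens=256)
# outputs = model.generate(input_ids, generation_config=greedy)
```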
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 0.0,
+ "train_loss": 0.6828021520660037,
+ "train_runtime": 390.3052,
+ "train_samples": 2648,
+ "train_samples_per_second": 6.784,
+ "train_steps_per_second": 0.108
+ }
trainer_state.json ADDED
@@ -0,0 +1,924 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 42,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": 0.09027537703514099,
13
+ "debug/policy_chosen_logps": -162.730224609375,
14
+ "debug/policy_rejected_logits": 0.5158556699752808,
15
+ "debug/policy_rejected_logps": -184.16571044921875,
16
+ "debug/reference_chosen_logps": -162.730224609375,
17
+ "debug/reference_rejected_logps": -184.16571044921875,
18
+ "epoch": 0.023809523809523808,
19
+ "grad_norm": 4.685966665550777,
20
+ "learning_rate": 5e-07,
21
+ "logits/chosen": 0.09027537703514099,
22
+ "logits/rejected": 0.5158556699752808,
23
+ "logps/chosen": -162.730224609375,
24
+ "logps/rejected": -184.16571044921875,
25
+ "loss": 0.6931,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": 0.7074397802352905,
34
+ "debug/policy_chosen_logps": -150.46807861328125,
35
+ "debug/policy_rejected_logits": 0.43174317479133606,
36
+ "debug/policy_rejected_logps": -140.48440551757812,
37
+ "debug/reference_chosen_logps": -153.92564392089844,
38
+ "debug/reference_rejected_logps": -142.85406494140625,
39
+ "epoch": 0.047619047619047616,
40
+ "grad_norm": 15.044621835270732,
41
+ "learning_rate": 5e-07,
42
+ "logits/chosen": 0.7074397802352905,
43
+ "logits/rejected": 0.43174317479133606,
44
+ "logps/chosen": -150.46807861328125,
45
+ "logps/rejected": -140.48440551757812,
46
+ "loss": 0.6973,
47
+ "rewards/accuracies": 0.625,
48
+ "rewards/chosen": 0.034575510770082474,
49
+ "rewards/margins": 0.010879031382501125,
50
+ "rewards/rejected": 0.023696478456258774,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": 0.4280781149864197,
55
+ "debug/policy_chosen_logps": -153.80137634277344,
56
+ "debug/policy_rejected_logits": 1.081570029258728,
57
+ "debug/policy_rejected_logps": -173.27056884765625,
58
+ "debug/reference_chosen_logps": -151.54473876953125,
59
+ "debug/reference_rejected_logps": -169.95703125,
60
+ "epoch": 0.07142857142857142,
61
+ "grad_norm": 4.933146485934281,
62
+ "learning_rate": 5e-07,
63
+ "logits/chosen": 0.4280781149864197,
64
+ "logits/rejected": 1.081570029258728,
65
+ "logps/chosen": -153.80137634277344,
66
+ "logps/rejected": -173.27056884765625,
67
+ "loss": 0.6923,
68
+ "rewards/accuracies": 0.75,
69
+ "rewards/chosen": -0.022566460072994232,
70
+ "rewards/margins": 0.010568867437541485,
71
+ "rewards/rejected": -0.03313532844185829,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": 0.49108535051345825,
76
+ "debug/policy_chosen_logps": -155.66534423828125,
77
+ "debug/policy_rejected_logits": 0.4322296977043152,
78
+ "debug/policy_rejected_logps": -149.7783203125,
79
+ "debug/reference_chosen_logps": -153.9282989501953,
80
+ "debug/reference_rejected_logps": -148.75108337402344,
81
+ "epoch": 0.09523809523809523,
82
+ "grad_norm": 6.198349828531256,
83
+ "learning_rate": 5e-07,
84
+ "logits/chosen": 0.49108535051345825,
85
+ "logits/rejected": 0.4322296977043152,
86
+ "logps/chosen": -155.66534423828125,
87
+ "logps/rejected": -149.7783203125,
88
+ "loss": 0.6899,
89
+ "rewards/accuracies": 0.375,
90
+ "rewards/chosen": -0.0173704344779253,
91
+ "rewards/margins": -0.0070981215685606,
92
+ "rewards/rejected": -0.010272311978042126,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": 1.0142210721969604,
97
+ "debug/policy_chosen_logps": -158.026611328125,
98
+ "debug/policy_rejected_logits": 1.0418132543563843,
99
+ "debug/policy_rejected_logps": -199.23785400390625,
100
+ "debug/reference_chosen_logps": -155.4060516357422,
101
+ "debug/reference_rejected_logps": -194.74618530273438,
102
+ "epoch": 0.11904761904761904,
103
+ "grad_norm": 4.004423522491588,
104
+ "learning_rate": 5e-07,
105
+ "logits/chosen": 1.0142210721969604,
106
+ "logits/rejected": 1.0418132543563843,
107
+ "logps/chosen": -158.026611328125,
108
+ "logps/rejected": -199.23785400390625,
109
+ "loss": 0.6916,
110
+ "rewards/accuracies": 0.875,
111
+ "rewards/chosen": -0.026205480098724365,
112
+ "rewards/margins": 0.018711339682340622,
113
+ "rewards/rejected": -0.04491681978106499,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": 0.803213894367218,
118
+ "debug/policy_chosen_logps": -156.5592041015625,
119
+ "debug/policy_rejected_logits": 0.9607799649238586,
120
+ "debug/policy_rejected_logps": -173.58987426757812,
121
+ "debug/reference_chosen_logps": -146.985595703125,
122
+ "debug/reference_rejected_logps": -163.15786743164062,
123
+ "epoch": 0.14285714285714285,
124
+ "grad_norm": 10.059361542630269,
125
+ "learning_rate": 5e-07,
126
+ "logits/chosen": 0.803213894367218,
127
+ "logits/rejected": 0.9607799649238586,
128
+ "logps/chosen": -156.5592041015625,
129
+ "logps/rejected": -173.58987426757812,
130
+ "loss": 0.6869,
131
+ "rewards/accuracies": 0.75,
132
+ "rewards/chosen": -0.09573620557785034,
133
+ "rewards/margins": 0.008583765476942062,
134
+ "rewards/rejected": -0.1043199747800827,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": 0.5479093194007874,
139
+ "debug/policy_chosen_logps": -176.60946655273438,
140
+ "debug/policy_rejected_logits": 0.11254727840423584,
141
+ "debug/policy_rejected_logps": -175.84767150878906,
142
+ "debug/reference_chosen_logps": -174.020751953125,
143
+ "debug/reference_rejected_logps": -170.24949645996094,
144
+ "epoch": 0.16666666666666666,
145
+ "grad_norm": 20.07889018294013,
146
+ "learning_rate": 5e-07,
147
+ "logits/chosen": 0.5479093194007874,
148
+ "logits/rejected": 0.11254727840423584,
149
+ "logps/chosen": -176.60946655273438,
150
+ "logps/rejected": -175.84767150878906,
151
+ "loss": 0.6892,
152
+ "rewards/accuracies": 0.875,
153
+ "rewards/chosen": -0.025887146592140198,
154
+ "rewards/margins": 0.030094660818576813,
155
+ "rewards/rejected": -0.05598180741071701,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": 0.5916139483451843,
160
+ "debug/policy_chosen_logps": -164.50384521484375,
161
+ "debug/policy_rejected_logits": 0.5801162123680115,
162
+ "debug/policy_rejected_logps": -156.9475860595703,
163
+ "debug/reference_chosen_logps": -153.96173095703125,
164
+ "debug/reference_rejected_logps": -147.9217071533203,
165
+ "epoch": 0.19047619047619047,
166
+ "grad_norm": 4.5921375475915776,
167
+ "learning_rate": 5e-07,
168
+ "logits/chosen": 0.5916139483451843,
169
+ "logits/rejected": 0.5801162123680115,
170
+ "logps/chosen": -164.50384521484375,
171
+ "logps/rejected": -156.9475860595703,
172
+ "loss": 0.69,
173
+ "rewards/accuracies": 0.375,
174
+ "rewards/chosen": -0.10542111098766327,
175
+ "rewards/margins": -0.015162268653512001,
176
+ "rewards/rejected": -0.09025884419679642,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": 0.2481817752122879,
181
+ "debug/policy_chosen_logps": -169.9642333984375,
182
+ "debug/policy_rejected_logits": 0.48347601294517517,
183
+ "debug/policy_rejected_logps": -190.4657440185547,
184
+ "debug/reference_chosen_logps": -162.74264526367188,
185
+ "debug/reference_rejected_logps": -181.60940551757812,
186
+ "epoch": 0.21428571428571427,
187
+ "grad_norm": 5.023010276429253,
188
+ "learning_rate": 5e-07,
189
+ "logits/chosen": 0.2481817752122879,
190
+ "logits/rejected": 0.48347601294517517,
191
+ "logps/chosen": -169.9642333984375,
192
+ "logps/rejected": -190.4657440185547,
193
+ "loss": 0.6877,
194
+ "rewards/accuracies": 0.75,
195
+ "rewards/chosen": -0.07221580296754837,
196
+ "rewards/margins": 0.01634763740003109,
197
+ "rewards/rejected": -0.08856344223022461,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": 0.2575492560863495,
202
+ "debug/policy_chosen_logps": -184.9705810546875,
203
+ "debug/policy_rejected_logits": 0.5005592703819275,
204
+ "debug/policy_rejected_logps": -172.09518432617188,
205
+ "debug/reference_chosen_logps": -172.8156280517578,
206
+ "debug/reference_rejected_logps": -160.72515869140625,
207
+ "epoch": 0.23809523809523808,
208
+ "grad_norm": 8.326836844842857,
209
+ "learning_rate": 5e-07,
210
+ "logits/chosen": 0.2575492560863495,
211
+ "logits/rejected": 0.5005592703819275,
212
+ "logps/chosen": -184.9705810546875,
213
+ "logps/rejected": -172.09518432617188,
214
+ "loss": 0.6811,
215
+ "rewards/accuracies": 0.375,
216
+ "rewards/chosen": -0.12154942750930786,
217
+ "rewards/margins": -0.00784902460873127,
218
+ "rewards/rejected": -0.11370040476322174,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": 0.02329457737505436,
223
+ "debug/policy_chosen_logps": -151.5535888671875,
224
+ "debug/policy_rejected_logits": 0.5039985179901123,
225
+ "debug/policy_rejected_logps": -164.93890380859375,
226
+ "debug/reference_chosen_logps": -145.47381591796875,
227
+ "debug/reference_rejected_logps": -155.04107666015625,
228
+ "epoch": 0.2619047619047619,
229
+ "grad_norm": 8.956479762651878,
230
+ "learning_rate": 5e-07,
231
+ "logits/chosen": 0.02329457737505436,
232
+ "logits/rejected": 0.5039985179901123,
233
+ "logps/chosen": -151.5535888671875,
234
+ "logps/rejected": -164.93890380859375,
235
+ "loss": 0.6868,
236
+ "rewards/accuracies": 0.625,
237
+ "rewards/chosen": -0.06079769879579544,
238
+ "rewards/margins": 0.038180749863386154,
239
+ "rewards/rejected": -0.0989784449338913,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": 0.47942259907722473,
244
+ "debug/policy_chosen_logps": -159.60877990722656,
245
+ "debug/policy_rejected_logits": 0.5704939365386963,
246
+ "debug/policy_rejected_logps": -154.61744689941406,
247
+ "debug/reference_chosen_logps": -147.24301147460938,
248
+ "debug/reference_rejected_logps": -141.2715301513672,
249
+ "epoch": 0.2857142857142857,
250
+ "grad_norm": 11.992421788281984,
251
+ "learning_rate": 5e-07,
252
+ "logits/chosen": 0.47942259907722473,
253
+ "logits/rejected": 0.5704939365386963,
254
+ "logps/chosen": -159.60877990722656,
255
+ "logps/rejected": -154.61744689941406,
256
+ "loss": 0.6853,
257
+ "rewards/accuracies": 0.5,
258
+ "rewards/chosen": -0.12365761399269104,
259
+ "rewards/margins": 0.009801514446735382,
260
+ "rewards/rejected": -0.13345913589000702,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -0.20997528731822968,
265
+ "debug/policy_chosen_logps": -142.75146484375,
266
+ "debug/policy_rejected_logits": 0.3726802468299866,
267
+ "debug/policy_rejected_logps": -175.70962524414062,
268
+ "debug/reference_chosen_logps": -134.545166015625,
269
+ "debug/reference_rejected_logps": -164.0076141357422,
270
+ "epoch": 0.30952380952380953,
271
+ "grad_norm": 5.358671233182435,
272
+ "learning_rate": 5e-07,
273
+ "logits/chosen": -0.20997528731822968,
274
+ "logits/rejected": 0.3726802468299866,
275
+ "logps/chosen": -142.75146484375,
276
+ "logps/rejected": -175.70962524414062,
277
+ "loss": 0.689,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": -0.08206304907798767,
280
+ "rewards/margins": 0.0349571518599987,
281
+ "rewards/rejected": -0.11702020466327667,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": 0.20442210137844086,
286
+ "debug/policy_chosen_logps": -177.51991271972656,
287
+ "debug/policy_rejected_logits": 0.34923601150512695,
288
+ "debug/policy_rejected_logps": -161.62881469726562,
289
+ "debug/reference_chosen_logps": -164.69485473632812,
290
+ "debug/reference_rejected_logps": -150.70733642578125,
291
+ "epoch": 0.3333333333333333,
292
+ "grad_norm": 6.471200581198782,
293
+ "learning_rate": 5e-07,
294
+ "logits/chosen": 0.20442210137844086,
295
+ "logits/rejected": 0.34923601150512695,
296
+ "logps/chosen": -177.51991271972656,
297
+ "logps/rejected": -161.62881469726562,
298
+ "loss": 0.6899,
299
+ "rewards/accuracies": 0.375,
300
+ "rewards/chosen": -0.12825068831443787,
301
+ "rewards/margins": -0.019035786390304565,
302
+ "rewards/rejected": -0.1092148944735527,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": 0.20042140781879425,
307
+ "debug/policy_chosen_logps": -198.75933837890625,
308
+ "debug/policy_rejected_logits": 0.24180738627910614,
309
+ "debug/policy_rejected_logps": -177.14825439453125,
310
+ "debug/reference_chosen_logps": -178.46697998046875,
311
+ "debug/reference_rejected_logps": -158.2596435546875,
312
+ "epoch": 0.35714285714285715,
313
+ "grad_norm": 15.202276429910315,
314
+ "learning_rate": 5e-07,
315
+ "logits/chosen": 0.20042140781879425,
316
+ "logits/rejected": 0.24180738627910614,
317
+ "logps/chosen": -198.75933837890625,
318
+ "logps/rejected": -177.14825439453125,
319
+ "loss": 0.6823,
320
+ "rewards/accuracies": 0.25,
321
+ "rewards/chosen": -0.2029237002134323,
322
+ "rewards/margins": -0.014037556946277618,
323
+ "rewards/rejected": -0.1888861358165741,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": 0.3575197160243988,
328
+ "debug/policy_chosen_logps": -159.78720092773438,
329
+ "debug/policy_rejected_logits": 0.6836833953857422,
330
+ "debug/policy_rejected_logps": -167.8487548828125,
331
+ "debug/reference_chosen_logps": -148.35433959960938,
332
+ "debug/reference_rejected_logps": -153.20465087890625,
333
+ "epoch": 0.38095238095238093,
334
+ "grad_norm": 5.011772584899409,
335
+ "learning_rate": 5e-07,
336
+ "logits/chosen": 0.3575197160243988,
337
+ "logits/rejected": 0.6836833953857422,
338
+ "logps/chosen": -159.78720092773438,
339
+ "logps/rejected": -167.8487548828125,
340
+ "loss": 0.6846,
341
+ "rewards/accuracies": 0.625,
342
+ "rewards/chosen": -0.11432872712612152,
343
+ "rewards/margins": 0.0321124903857708,
344
+ "rewards/rejected": -0.14644122123718262,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": 0.3418176472187042,
349
+ "debug/policy_chosen_logps": -153.46934509277344,
350
+ "debug/policy_rejected_logits": 0.33436495065689087,
351
+ "debug/policy_rejected_logps": -164.7410888671875,
352
+ "debug/reference_chosen_logps": -145.3973388671875,
353
+ "debug/reference_rejected_logps": -149.6763458251953,
354
+ "epoch": 0.40476190476190477,
355
+ "grad_norm": 20.39016346970483,
356
+ "learning_rate": 5e-07,
357
+ "logits/chosen": 0.3418176472187042,
358
+ "logits/rejected": 0.33436495065689087,
359
+ "logps/chosen": -153.46934509277344,
360
+ "logps/rejected": -164.7410888671875,
361
+ "loss": 0.6823,
362
+ "rewards/accuracies": 0.875,
363
+ "rewards/chosen": -0.08072000741958618,
364
+ "rewards/margins": 0.06992734968662262,
365
+ "rewards/rejected": -0.1506473571062088,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": 0.21861077845096588,
370
+ "debug/policy_chosen_logps": -184.25680541992188,
371
+ "debug/policy_rejected_logits": 0.3139030635356903,
372
+ "debug/policy_rejected_logps": -186.92042541503906,
373
+ "debug/reference_chosen_logps": -167.13250732421875,
374
+ "debug/reference_rejected_logps": -167.22145080566406,
375
+ "epoch": 0.42857142857142855,
376
+ "grad_norm": 9.604003163834712,
377
+ "learning_rate": 5e-07,
378
+ "logits/chosen": 0.21861077845096588,
379
+ "logits/rejected": 0.3139030635356903,
380
+ "logps/chosen": -184.25680541992188,
381
+ "logps/rejected": -186.92042541503906,
382
+ "loss": 0.6871,
383
+ "rewards/accuracies": 0.5,
384
+ "rewards/chosen": -0.17124298214912415,
385
+ "rewards/margins": 0.025746773928403854,
386
+ "rewards/rejected": -0.1969897449016571,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": 0.4868623614311218,
391
+ "debug/policy_chosen_logps": -164.40745544433594,
392
+ "debug/policy_rejected_logits": 0.407953143119812,
393
+ "debug/policy_rejected_logps": -192.24801635742188,
394
+ "debug/reference_chosen_logps": -149.22422790527344,
395
+ "debug/reference_rejected_logps": -176.274658203125,
396
+ "epoch": 0.4523809523809524,
397
+ "grad_norm": 5.467573091749328,
398
+ "learning_rate": 5e-07,
399
+ "logits/chosen": 0.4868623614311218,
400
+ "logits/rejected": 0.407953143119812,
401
+ "logps/chosen": -164.40745544433594,
402
+ "logps/rejected": -192.24801635742188,
403
+ "loss": 0.6788,
404
+ "rewards/accuracies": 0.625,
405
+ "rewards/chosen": -0.1518322378396988,
406
+ "rewards/margins": 0.007901255041360855,
407
+ "rewards/rejected": -0.15973350405693054,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": 0.059103213250637054,
412
+ "debug/policy_chosen_logps": -174.8370361328125,
413
+ "debug/policy_rejected_logits": 0.5786897540092468,
414
+ "debug/policy_rejected_logps": -199.56698608398438,
415
+ "debug/reference_chosen_logps": -161.7840118408203,
416
+ "debug/reference_rejected_logps": -185.17050170898438,
417
+ "epoch": 0.47619047619047616,
418
+ "grad_norm": 5.234500875119642,
419
+ "learning_rate": 5e-07,
420
+ "logits/chosen": 0.059103213250637054,
421
+ "logits/rejected": 0.5786897540092468,
422
+ "logps/chosen": -174.8370361328125,
423
+ "logps/rejected": -199.56698608398438,
424
+ "loss": 0.6848,
425
+ "rewards/accuracies": 0.5,
426
+ "rewards/chosen": -0.13053017854690552,
427
+ "rewards/margins": 0.013434587977826595,
428
+ "rewards/rejected": -0.1439647674560547,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": 0.5743213891983032,
433
+ "debug/policy_chosen_logps": -178.57594299316406,
434
+ "debug/policy_rejected_logits": 0.20286375284194946,
435
+ "debug/policy_rejected_logps": -179.84762573242188,
436
+ "debug/reference_chosen_logps": -158.3350067138672,
437
+ "debug/reference_rejected_logps": -160.45053100585938,
438
+ "epoch": 0.5,
439
+ "grad_norm": 6.102878645835385,
440
+ "learning_rate": 5e-07,
441
+ "logits/chosen": 0.5743213891983032,
442
+ "logits/rejected": 0.20286375284194946,
443
+ "logps/chosen": -178.57594299316406,
444
+ "logps/rejected": -179.84762573242188,
445
+ "loss": 0.6845,
446
+ "rewards/accuracies": 0.625,
447
+ "rewards/chosen": -0.20240947604179382,
448
+ "rewards/margins": -0.008438415825366974,
449
+ "rewards/rejected": -0.19397103786468506,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": 0.3122093677520752,
454
+ "debug/policy_chosen_logps": -164.51431274414062,
455
+ "debug/policy_rejected_logits": 0.3530707061290741,
456
+ "debug/policy_rejected_logps": -176.93601989746094,
457
+ "debug/reference_chosen_logps": -147.92042541503906,
458
+ "debug/reference_rejected_logps": -156.74851989746094,
459
+ "epoch": 0.5238095238095238,
460
+ "grad_norm": 6.315550057559611,
461
+ "learning_rate": 5e-07,
462
+ "logits/chosen": 0.3122093677520752,
463
+ "logits/rejected": 0.3530707061290741,
464
+ "logps/chosen": -164.51431274414062,
465
+ "logps/rejected": -176.93601989746094,
466
+ "loss": 0.6815,
467
+ "rewards/accuracies": 0.625,
468
+ "rewards/chosen": -0.16593879461288452,
469
+ "rewards/margins": 0.03593616560101509,
470
+ "rewards/rejected": -0.20187495648860931,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -0.044268831610679626,
475
+ "debug/policy_chosen_logps": -157.81039428710938,
476
+ "debug/policy_rejected_logits": 0.3559872508049011,
477
+ "debug/policy_rejected_logps": -147.35525512695312,
478
+ "debug/reference_chosen_logps": -150.7872314453125,
479
+ "debug/reference_rejected_logps": -138.82229614257812,
480
+ "epoch": 0.5476190476190477,
481
+ "grad_norm": 23.620495593781406,
482
+ "learning_rate": 5e-07,
483
+ "logits/chosen": -0.044268831610679626,
484
+ "logits/rejected": 0.3559872508049011,
485
+ "logps/chosen": -157.81039428710938,
486
+ "logps/rejected": -147.35525512695312,
487
+ "loss": 0.6842,
488
+ "rewards/accuracies": 0.5,
489
+ "rewards/chosen": -0.07023164629936218,
490
+ "rewards/margins": 0.015097856521606445,
491
+ "rewards/rejected": -0.08532950282096863,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": 0.018966468051075935,
496
+ "debug/policy_chosen_logps": -159.8534698486328,
497
+ "debug/policy_rejected_logits": 0.1428254395723343,
498
+ "debug/policy_rejected_logps": -164.7286834716797,
499
+ "debug/reference_chosen_logps": -147.0450439453125,
500
+ "debug/reference_rejected_logps": -149.85382080078125,
501
+ "epoch": 0.5714285714285714,
502
+ "grad_norm": 4.677349017880479,
503
+ "learning_rate": 5e-07,
504
+ "logits/chosen": 0.018966468051075935,
505
+ "logits/rejected": 0.1428254395723343,
506
+ "logps/chosen": -159.8534698486328,
507
+ "logps/rejected": -164.7286834716797,
508
+ "loss": 0.6819,
509
+ "rewards/accuracies": 0.75,
510
+ "rewards/chosen": -0.1280841827392578,
511
+ "rewards/margins": 0.02066453918814659,
512
+ "rewards/rejected": -0.1487487256526947,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": 0.2853807806968689,
517
+ "debug/policy_chosen_logps": -176.42611694335938,
518
+ "debug/policy_rejected_logits": 0.13074414432048798,
519
+ "debug/policy_rejected_logps": -155.15415954589844,
520
+ "debug/reference_chosen_logps": -158.4669647216797,
521
+ "debug/reference_rejected_logps": -136.29364013671875,
522
+ "epoch": 0.5952380952380952,
523
+ "grad_norm": 11.079802876278416,
524
+ "learning_rate": 5e-07,
525
+ "logits/chosen": 0.2853807806968689,
526
+ "logits/rejected": 0.13074414432048798,
527
+ "logps/chosen": -176.42611694335938,
528
+ "logps/rejected": -155.15415954589844,
529
+ "loss": 0.6821,
530
+ "rewards/accuracies": 0.375,
531
+ "rewards/chosen": -0.179591566324234,
532
+ "rewards/margins": 0.00901371892541647,
533
+ "rewards/rejected": -0.18860529363155365,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": 0.40262606739997864,
538
+ "debug/policy_chosen_logps": -153.15701293945312,
539
+ "debug/policy_rejected_logits": 0.7936873435974121,
540
+ "debug/policy_rejected_logps": -169.62689208984375,
541
+ "debug/reference_chosen_logps": -140.70889282226562,
542
+ "debug/reference_rejected_logps": -152.8561248779297,
543
+ "epoch": 0.6190476190476191,
544
+ "grad_norm": 5.32694876840012,
545
+ "learning_rate": 5e-07,
546
+ "logits/chosen": 0.40262606739997864,
547
+ "logits/rejected": 0.7936873435974121,
548
+ "logps/chosen": -153.15701293945312,
549
+ "logps/rejected": -169.62689208984375,
550
+ "loss": 0.6733,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": -0.12448111921548843,
553
+ "rewards/margins": 0.04322664067149162,
554
+ "rewards/rejected": -0.16770777106285095,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": 0.4326091408729553,
559
+ "debug/policy_chosen_logps": -157.39126586914062,
560
+ "debug/policy_rejected_logits": 0.607225239276886,
561
+ "debug/policy_rejected_logps": -188.34918212890625,
562
+ "debug/reference_chosen_logps": -145.33380126953125,
563
+ "debug/reference_rejected_logps": -162.48890686035156,
564
+ "epoch": 0.6428571428571429,
565
+ "grad_norm": 4.700163204340666,
566
+ "learning_rate": 5e-07,
567
+ "logits/chosen": 0.4326091408729553,
568
+ "logits/rejected": 0.607225239276886,
569
+ "logps/chosen": -157.39126586914062,
570
+ "logps/rejected": -188.34918212890625,
571
+ "loss": 0.6642,
572
+ "rewards/accuracies": 0.625,
573
+ "rewards/chosen": -0.12057456374168396,
574
+ "rewards/margins": 0.13802826404571533,
575
+ "rewards/rejected": -0.2586028277873993,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": 0.031958021223545074,
580
+ "debug/policy_chosen_logps": -159.67169189453125,
581
+ "debug/policy_rejected_logits": 0.1899116486310959,
582
+ "debug/policy_rejected_logps": -187.28189086914062,
583
+ "debug/reference_chosen_logps": -141.6376953125,
584
+ "debug/reference_rejected_logps": -162.19659423828125,
585
+ "epoch": 0.6666666666666666,
586
+ "grad_norm": 6.569512152499291,
587
+ "learning_rate": 5e-07,
588
+ "logits/chosen": 0.031958021223545074,
589
+ "logits/rejected": 0.1899116486310959,
590
+ "logps/chosen": -159.67169189453125,
591
+ "logps/rejected": -187.28189086914062,
592
+ "loss": 0.6824,
593
+ "rewards/accuracies": 0.625,
594
+ "rewards/chosen": -0.180339977145195,
595
+ "rewards/margins": 0.07051312178373337,
596
+ "rewards/rejected": -0.2508530914783478,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": 0.10809233784675598,
601
+ "debug/policy_chosen_logps": -176.20724487304688,
602
+ "debug/policy_rejected_logits": 0.41801854968070984,
603
+ "debug/policy_rejected_logps": -197.15541076660156,
604
+ "debug/reference_chosen_logps": -158.1036834716797,
605
+ "debug/reference_rejected_logps": -176.26634216308594,
606
+ "epoch": 0.6904761904761905,
607
+ "grad_norm": 6.541965417685165,
608
+ "learning_rate": 5e-07,
609
+ "logits/chosen": 0.10809233784675598,
610
+ "logits/rejected": 0.41801854968070984,
611
+ "logps/chosen": -176.20724487304688,
612
+ "logps/rejected": -197.15541076660156,
613
+ "loss": 0.6681,
614
+ "rewards/accuracies": 0.75,
615
+ "rewards/chosen": -0.181035578250885,
616
+ "rewards/margins": 0.02785516157746315,
617
+ "rewards/rejected": -0.20889073610305786,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -0.15116974711418152,
622
+ "debug/policy_chosen_logps": -164.75247192382812,
623
+ "debug/policy_rejected_logits": 0.12009341269731522,
624
+ "debug/policy_rejected_logps": -175.12867736816406,
625
+ "debug/reference_chosen_logps": -146.2644805908203,
626
+ "debug/reference_rejected_logps": -149.75460815429688,
627
+ "epoch": 0.7142857142857143,
628
+ "grad_norm": 4.855645858354973,
629
+ "learning_rate": 5e-07,
630
+ "logits/chosen": -0.15116974711418152,
631
+ "logits/rejected": 0.12009341269731522,
632
+ "logps/chosen": -164.75247192382812,
633
+ "logps/rejected": -175.12867736816406,
634
+ "loss": 0.6786,
635
+ "rewards/accuracies": 0.625,
636
+ "rewards/chosen": -0.18488001823425293,
637
+ "rewards/margins": 0.06886060535907745,
638
+ "rewards/rejected": -0.2537406086921692,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": 0.17149780690670013,
643
+ "debug/policy_chosen_logps": -188.273193359375,
644
+ "debug/policy_rejected_logits": 0.3072517216205597,
645
+ "debug/policy_rejected_logps": -187.52651977539062,
646
+ "debug/reference_chosen_logps": -165.44805908203125,
647
+ "debug/reference_rejected_logps": -158.7125244140625,
648
+ "epoch": 0.7380952380952381,
649
+ "grad_norm": 6.097067478609718,
650
+ "learning_rate": 5e-07,
651
+ "logits/chosen": 0.17149780690670013,
652
+ "logits/rejected": 0.3072517216205597,
653
+ "logps/chosen": -188.273193359375,
654
+ "logps/rejected": -187.52651977539062,
655
+ "loss": 0.6829,
656
+ "rewards/accuracies": 0.625,
657
+ "rewards/chosen": -0.22825142741203308,
658
+ "rewards/margins": 0.059888482093811035,
659
+ "rewards/rejected": -0.2881399095058441,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": 0.46266281604766846,
664
+ "debug/policy_chosen_logps": -169.596435546875,
665
+ "debug/policy_rejected_logits": 0.4477306604385376,
666
+ "debug/policy_rejected_logps": -173.21981811523438,
667
+ "debug/reference_chosen_logps": -152.52273559570312,
668
+ "debug/reference_rejected_logps": -151.03050231933594,
669
+ "epoch": 0.7619047619047619,
670
+ "grad_norm": 10.520633954833,
671
+ "learning_rate": 5e-07,
672
+ "logits/chosen": 0.46266281604766846,
673
+ "logits/rejected": 0.4477306604385376,
674
+ "logps/chosen": -169.596435546875,
675
+ "logps/rejected": -173.21981811523438,
676
+ "loss": 0.6878,
677
+ "rewards/accuracies": 0.625,
678
+ "rewards/chosen": -0.17073702812194824,
679
+ "rewards/margins": 0.05115606635808945,
680
+ "rewards/rejected": -0.2218931019306183,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": 0.7084560394287109,
685
+ "debug/policy_chosen_logps": -161.38735961914062,
686
+ "debug/policy_rejected_logits": 0.6357196569442749,
687
+ "debug/policy_rejected_logps": -164.42718505859375,
688
+ "debug/reference_chosen_logps": -154.86412048339844,
689
+ "debug/reference_rejected_logps": -157.79238891601562,
690
+ "epoch": 0.7857142857142857,
691
+ "grad_norm": 15.939572697711974,
692
+ "learning_rate": 5e-07,
693
+ "logits/chosen": 0.7084560394287109,
694
+ "logits/rejected": 0.6357196569442749,
695
+ "logps/chosen": -161.38735961914062,
696
+ "logps/rejected": -164.42718505859375,
697
+ "loss": 0.6803,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": -0.06523235142230988,
700
+ "rewards/margins": 0.001115655992180109,
701
+ "rewards/rejected": -0.06634800881147385,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": 0.12151144444942474,
706
+ "debug/policy_chosen_logps": -161.50515747070312,
707
+ "debug/policy_rejected_logits": 1.0420786142349243,
708
+ "debug/policy_rejected_logps": -164.21615600585938,
709
+ "debug/reference_chosen_logps": -150.59689331054688,
710
+ "debug/reference_rejected_logps": -152.6244354248047,
711
+ "epoch": 0.8095238095238095,
712
+ "grad_norm": 9.868780868712001,
713
+ "learning_rate": 5e-07,
714
+ "logits/chosen": 0.12151144444942474,
715
+ "logits/rejected": 1.0420786142349243,
716
+ "logps/chosen": -161.50515747070312,
717
+ "logps/rejected": -164.21615600585938,
718
+ "loss": 0.6786,
719
+ "rewards/accuracies": 0.375,
720
+ "rewards/chosen": -0.10908253490924835,
721
+ "rewards/margins": 0.006834707222878933,
722
+ "rewards/rejected": -0.11591724306344986,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": 0.027947237715125084,
727
+ "debug/policy_chosen_logps": -157.77456665039062,
728
+ "debug/policy_rejected_logits": 0.31475889682769775,
729
+ "debug/policy_rejected_logps": -159.06002807617188,
730
+ "debug/reference_chosen_logps": -136.69996643066406,
731
+ "debug/reference_rejected_logps": -138.58349609375,
732
+ "epoch": 0.8333333333333334,
733
+ "grad_norm": 8.177811902764494,
734
+ "learning_rate": 5e-07,
735
+ "logits/chosen": 0.027947237715125084,
736
+ "logits/rejected": 0.31475889682769775,
737
+ "logps/chosen": -157.77456665039062,
738
+ "logps/rejected": -159.06002807617188,
739
+ "loss": 0.6874,
740
+ "rewards/accuracies": 0.5,
741
+ "rewards/chosen": -0.21074604988098145,
742
+ "rewards/margins": -0.005980661138892174,
743
+ "rewards/rejected": -0.20476537942886353,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": 0.2593013346195221,
748
+ "debug/policy_chosen_logps": -165.31509399414062,
749
+ "debug/policy_rejected_logits": -0.04938528686761856,
750
+ "debug/policy_rejected_logps": -168.42660522460938,
751
+ "debug/reference_chosen_logps": -145.93374633789062,
752
+ "debug/reference_rejected_logps": -145.29168701171875,
753
+ "epoch": 0.8571428571428571,
754
+ "grad_norm": 12.91232630873052,
755
+ "learning_rate": 5e-07,
756
+ "logits/chosen": 0.2593013346195221,
757
+ "logits/rejected": -0.04938528686761856,
758
+ "logps/chosen": -165.31509399414062,
759
+ "logps/rejected": -168.42660522460938,
760
+ "loss": 0.6712,
761
+ "rewards/accuracies": 0.5,
762
+ "rewards/chosen": -0.19381342828273773,
763
+ "rewards/margins": 0.03753571957349777,
764
+ "rewards/rejected": -0.2313491404056549,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": 1.0012390613555908,
769
+ "debug/policy_chosen_logps": -165.09347534179688,
770
+ "debug/policy_rejected_logits": 1.0178093910217285,
771
+ "debug/policy_rejected_logps": -171.16152954101562,
772
+ "debug/reference_chosen_logps": -150.7286834716797,
773
+ "debug/reference_rejected_logps": -150.47354125976562,
774
+ "epoch": 0.8809523809523809,
775
+ "grad_norm": 4.747576829430422,
776
+ "learning_rate": 5e-07,
777
+ "logits/chosen": 1.0012390613555908,
778
+ "logits/rejected": 1.0178093910217285,
779
+ "logps/chosen": -165.09347534179688,
780
+ "logps/rejected": -171.16152954101562,
781
+ "loss": 0.6849,
782
+ "rewards/accuracies": 0.75,
783
+ "rewards/chosen": -0.1436479538679123,
784
+ "rewards/margins": 0.06323190778493881,
785
+ "rewards/rejected": -0.2068798542022705,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -0.11002390831708908,
790
+ "debug/policy_chosen_logps": -173.82029724121094,
791
+ "debug/policy_rejected_logits": 0.446510910987854,
792
+ "debug/policy_rejected_logps": -212.50643920898438,
793
+ "debug/reference_chosen_logps": -150.2817840576172,
794
+ "debug/reference_rejected_logps": -185.846923828125,
795
+ "epoch": 0.9047619047619048,
796
+ "grad_norm": 5.476549746954031,
797
+ "learning_rate": 5e-07,
798
+ "logits/chosen": -0.11002390831708908,
799
+ "logits/rejected": 0.446510910987854,
800
+ "logps/chosen": -173.82029724121094,
801
+ "logps/rejected": -212.50643920898438,
802
+ "loss": 0.6724,
803
+ "rewards/accuracies": 0.5,
804
+ "rewards/chosen": -0.23538516461849213,
805
+ "rewards/margins": 0.031209895387291908,
806
+ "rewards/rejected": -0.2665950655937195,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -0.20052273571491241,
811
+ "debug/policy_chosen_logps": -163.867431640625,
812
+ "debug/policy_rejected_logits": 0.5886087417602539,
813
+ "debug/policy_rejected_logps": -185.58941650390625,
814
+ "debug/reference_chosen_logps": -141.18801879882812,
815
+ "debug/reference_rejected_logps": -159.76058959960938,
816
+ "epoch": 0.9285714285714286,
817
+ "grad_norm": 12.5233373844755,
818
+ "learning_rate": 5e-07,
819
+ "logits/chosen": -0.20052273571491241,
820
+ "logits/rejected": 0.5886087417602539,
821
+ "logps/chosen": -163.867431640625,
822
+ "logps/rejected": -185.58941650390625,
823
+ "loss": 0.6685,
824
+ "rewards/accuracies": 0.5,
825
+ "rewards/chosen": -0.22679391503334045,
826
+ "rewards/margins": 0.0314943790435791,
827
+ "rewards/rejected": -0.25828829407691956,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": 0.25073912739753723,
832
+ "debug/policy_chosen_logps": -157.51876831054688,
833
+ "debug/policy_rejected_logits": 0.30381596088409424,
834
+ "debug/policy_rejected_logps": -176.589599609375,
835
+ "debug/reference_chosen_logps": -138.98110961914062,
836
+ "debug/reference_rejected_logps": -154.03880310058594,
837
+ "epoch": 0.9523809523809523,
838
+ "grad_norm": 5.529655141820795,
839
+ "learning_rate": 5e-07,
840
+ "logits/chosen": 0.25073912739753723,
841
+ "logits/rejected": 0.30381596088409424,
842
+ "logps/chosen": -157.51876831054688,
843
+ "logps/rejected": -176.589599609375,
844
+ "loss": 0.6694,
845
+ "rewards/accuracies": 0.625,
846
+ "rewards/chosen": -0.1853766143321991,
847
+ "rewards/margins": 0.040131378918886185,
848
+ "rewards/rejected": -0.22550798952579498,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": 0.23757268488407135,
853
+ "debug/policy_chosen_logps": -167.54112243652344,
854
+ "debug/policy_rejected_logits": 0.40399065613746643,
855
+ "debug/policy_rejected_logps": -191.4041748046875,
856
+ "debug/reference_chosen_logps": -145.42752075195312,
857
+ "debug/reference_rejected_logps": -164.51507568359375,
858
+ "epoch": 0.9761904761904762,
859
+ "grad_norm": 5.959237498635192,
860
+ "learning_rate": 5e-07,
861
+ "logits/chosen": 0.23757268488407135,
862
+ "logits/rejected": 0.40399065613746643,
863
+ "logps/chosen": -167.54112243652344,
864
+ "logps/rejected": -191.4041748046875,
865
+ "loss": 0.6706,
866
+ "rewards/accuracies": 0.75,
867
+ "rewards/chosen": -0.22113589942455292,
868
+ "rewards/margins": 0.047754913568496704,
869
+ "rewards/rejected": -0.2688907980918884,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": 0.029359659180045128,
874
+ "debug/policy_chosen_logps": -178.32794189453125,
875
+ "debug/policy_rejected_logits": 0.20693586766719818,
876
+ "debug/policy_rejected_logps": -185.86007690429688,
877
+ "debug/reference_chosen_logps": -151.25294494628906,
878
+ "debug/reference_rejected_logps": -157.61886596679688,
879
+ "epoch": 1.0,
880
+ "grad_norm": 15.740573011032756,
881
+ "learning_rate": 5e-07,
882
+ "logits/chosen": 0.029359659180045128,
883
+ "logits/rejected": 0.20693586766719818,
884
+ "logps/chosen": -178.32794189453125,
885
+ "logps/rejected": -185.86007690429688,
886
+ "loss": 0.6925,
887
+ "rewards/accuracies": 0.625,
888
+ "rewards/chosen": -0.27075010538101196,
889
+ "rewards/margins": 0.01166202500462532,
890
+ "rewards/rejected": -0.2824121117591858,
891
+ "step": 42
892
+ },
893
+ {
894
+ "epoch": 1.0,
895
+ "step": 42,
896
+ "total_flos": 0.0,
897
+ "train_loss": 0.6828021520660037,
898
+ "train_runtime": 390.3052,
899
+ "train_samples_per_second": 6.784,
900
+ "train_steps_per_second": 0.108
901
+ }
902
+ ],
903
+ "logging_steps": 1,
904
+ "max_steps": 42,
905
+ "num_input_tokens_seen": 0,
906
+ "num_train_epochs": 1,
907
+ "save_steps": 500,
908
+ "stateful_callbacks": {
909
+ "TrainerControl": {
910
+ "args": {
911
+ "should_epoch_stop": false,
912
+ "should_evaluate": false,
913
+ "should_log": false,
914
+ "should_save": true,
915
+ "should_training_stop": true
916
+ },
917
+ "attributes": {}
918
+ }
919
+ },
920
+ "total_flos": 0.0,
921
+ "train_batch_size": 8,
922
+ "trial_name": null,
923
+ "trial_params": null
924
+ }
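
trainer_state.json is the most detailed artifact in the commit: with `logging_steps: 1`, `log_history` holds one entry per optimizer step with the DPO loss, reward margins, and accuracies. A small sketch for pulling those curves out of a local copy of the file (the local filename is assumed; download it first, e.g. with `huggingface_hub.hf_hub_download`):

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step entries; the final summary entry has no "loss" key.
entries = [e for e in state["log_history"] if "loss" in e]

steps   = [e["step"] for e in entries]
loss    = [e["loss"] for e in entries]
margins = [e["rewards/margins"] for e in entries]

print(f"{len(steps)} logged steps, final loss {loss[-1]:.4f}, "
      f"mean reward margin {sum(margins) / len(margins):.4f}")
```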