yiran-wang3 committed on
Commit
86b32f7
1 Parent(s): 8cd4618

End of training

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: yiran-wang3/qwen1_chat_adamw_iter6
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/qw1_sppo_hard_new_cn_mining_oj_iter6-binarized
12
+ model-index:
13
+ - name: qwen1_chat_adamw_iter7
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # qwen1_chat_adamw_iter7
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/qwen1_chat_adamw_iter6](https://huggingface.co/yiran-wang3/qwen1_chat_adamw_iter6) on the self-generate/qw1_sppo_hard_new_cn_mining_oj_iter6-binarized dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.49694290051334783,
5
+ "train_runtime": 142.4117,
6
+ "train_samples": 2410,
7
+ "train_samples_per_second": 16.923,
8
+ "train_steps_per_second": 0.267
9
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.45.0"
14
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.49694290051334783,
5
+ "train_runtime": 142.4117,
6
+ "train_samples": 2410,
7
+ "train_samples_per_second": 16.923,
8
+ "train_steps_per_second": 0.267
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 38,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -0.7997890710830688,
13
+ "debug/policy_chosen_logps": -225.56033325195312,
14
+ "debug/policy_rejected_logits": -0.7811033725738525,
15
+ "debug/policy_rejected_logps": -194.36915588378906,
16
+ "debug/reference_chosen_logps": -225.56033325195312,
17
+ "debug/reference_rejected_logps": -194.36915588378906,
18
+ "epoch": 0.02631578947368421,
19
+ "grad_norm": 10.059155945927795,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -0.7997890710830688,
22
+ "logits/rejected": -0.7811033725738525,
23
+ "logps/chosen": -225.56033325195312,
24
+ "logps/rejected": -194.36915588378906,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -0.787833571434021,
34
+ "debug/policy_chosen_logps": -192.9886474609375,
35
+ "debug/policy_rejected_logits": -0.8347375988960266,
36
+ "debug/policy_rejected_logps": -183.05389404296875,
37
+ "debug/reference_chosen_logps": -193.40695190429688,
38
+ "debug/reference_rejected_logps": -182.5074920654297,
39
+ "epoch": 0.05263157894736842,
40
+ "grad_norm": 9.106220560523727,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -0.787833571434021,
43
+ "logits/rejected": -0.8347375988960266,
44
+ "logps/chosen": -192.9886474609375,
45
+ "logps/rejected": -183.05389404296875,
46
+ "loss": 0.4992,
47
+ "rewards/accuracies": 0.75,
48
+ "rewards/chosen": 0.004183177836239338,
49
+ "rewards/margins": 0.009647198021411896,
50
+ "rewards/rejected": -0.0054640197195112705,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -0.7686588168144226,
55
+ "debug/policy_chosen_logps": -166.56585693359375,
56
+ "debug/policy_rejected_logits": -0.823331892490387,
57
+ "debug/policy_rejected_logps": -206.15472412109375,
58
+ "debug/reference_chosen_logps": -166.86782836914062,
59
+ "debug/reference_rejected_logps": -206.29469299316406,
60
+ "epoch": 0.07894736842105263,
61
+ "grad_norm": 9.89971860615226,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -0.7686588168144226,
64
+ "logits/rejected": -0.823331892490387,
65
+ "logps/chosen": -166.56585693359375,
66
+ "logps/rejected": -206.15472412109375,
67
+ "loss": 0.4972,
68
+ "rewards/accuracies": 0.625,
69
+ "rewards/chosen": 0.0030196956358850002,
70
+ "rewards/margins": 0.0016200444661080837,
71
+ "rewards/rejected": 0.0013996504712849855,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -0.8045904040336609,
76
+ "debug/policy_chosen_logps": -167.75006103515625,
77
+ "debug/policy_rejected_logits": -0.9390885829925537,
78
+ "debug/policy_rejected_logps": -183.21987915039062,
79
+ "debug/reference_chosen_logps": -168.35263061523438,
80
+ "debug/reference_rejected_logps": -183.38255310058594,
81
+ "epoch": 0.10526315789473684,
82
+ "grad_norm": 9.923126638306748,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -0.8045904040336609,
85
+ "logits/rejected": -0.9390885829925537,
86
+ "logps/chosen": -167.75006103515625,
87
+ "logps/rejected": -183.21987915039062,
88
+ "loss": 0.498,
89
+ "rewards/accuracies": 0.375,
90
+ "rewards/chosen": 0.006025714334100485,
91
+ "rewards/margins": 0.004398994147777557,
92
+ "rewards/rejected": 0.001626720535568893,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -0.8231886625289917,
97
+ "debug/policy_chosen_logps": -179.09014892578125,
98
+ "debug/policy_rejected_logits": -0.8653745055198669,
99
+ "debug/policy_rejected_logps": -173.03317260742188,
100
+ "debug/reference_chosen_logps": -179.02658081054688,
101
+ "debug/reference_rejected_logps": -172.79678344726562,
102
+ "epoch": 0.13157894736842105,
103
+ "grad_norm": 10.543939600237236,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -0.8231886625289917,
106
+ "logits/rejected": -0.8653745055198669,
107
+ "logps/chosen": -179.09014892578125,
108
+ "logps/rejected": -173.03317260742188,
109
+ "loss": 0.4976,
110
+ "rewards/accuracies": 0.625,
111
+ "rewards/chosen": -0.0006354906363412738,
112
+ "rewards/margins": 0.0017283153720200062,
113
+ "rewards/rejected": -0.002363805891945958,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -0.9182553887367249,
118
+ "debug/policy_chosen_logps": -175.53131103515625,
119
+ "debug/policy_rejected_logits": -0.9559266567230225,
120
+ "debug/policy_rejected_logps": -208.5589141845703,
121
+ "debug/reference_chosen_logps": -176.2925567626953,
122
+ "debug/reference_rejected_logps": -208.50613403320312,
123
+ "epoch": 0.15789473684210525,
124
+ "grad_norm": 11.060840140231088,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -0.9182553887367249,
127
+ "logits/rejected": -0.9559266567230225,
128
+ "logps/chosen": -175.53131103515625,
129
+ "logps/rejected": -208.5589141845703,
130
+ "loss": 0.4986,
131
+ "rewards/accuracies": 0.75,
132
+ "rewards/chosen": 0.0076125627383589745,
133
+ "rewards/margins": 0.00814034417271614,
134
+ "rewards/rejected": -0.0005277825985103846,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -0.8945421576499939,
139
+ "debug/policy_chosen_logps": -154.65753173828125,
140
+ "debug/policy_rejected_logits": -0.9487111568450928,
141
+ "debug/policy_rejected_logps": -207.63418579101562,
142
+ "debug/reference_chosen_logps": -153.6591796875,
143
+ "debug/reference_rejected_logps": -208.0142364501953,
144
+ "epoch": 0.18421052631578946,
145
+ "grad_norm": 10.113024121054902,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -0.8945421576499939,
148
+ "logits/rejected": -0.9487111568450928,
149
+ "logps/chosen": -154.65753173828125,
150
+ "logps/rejected": -207.63418579101562,
151
+ "loss": 0.4995,
152
+ "rewards/accuracies": 0.5,
153
+ "rewards/chosen": -0.009983415715396404,
154
+ "rewards/margins": -0.013783845119178295,
155
+ "rewards/rejected": 0.0038004303351044655,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -0.8738376498222351,
160
+ "debug/policy_chosen_logps": -183.52603149414062,
161
+ "debug/policy_rejected_logits": -0.9855321645736694,
162
+ "debug/policy_rejected_logps": -162.13426208496094,
163
+ "debug/reference_chosen_logps": -182.94473266601562,
164
+ "debug/reference_rejected_logps": -161.57327270507812,
165
+ "epoch": 0.21052631578947367,
166
+ "grad_norm": 11.253835109468124,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -0.8738376498222351,
169
+ "logits/rejected": -0.9855321645736694,
170
+ "logps/chosen": -183.52603149414062,
171
+ "logps/rejected": -162.13426208496094,
172
+ "loss": 0.4972,
173
+ "rewards/accuracies": 0.75,
174
+ "rewards/chosen": -0.005812949500977993,
175
+ "rewards/margins": -0.00020312238484621048,
176
+ "rewards/rejected": -0.005609826650470495,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -0.7741842269897461,
181
+ "debug/policy_chosen_logps": -155.53382873535156,
182
+ "debug/policy_rejected_logits": -0.8124099373817444,
183
+ "debug/policy_rejected_logps": -194.9011993408203,
184
+ "debug/reference_chosen_logps": -155.30392456054688,
185
+ "debug/reference_rejected_logps": -194.22259521484375,
186
+ "epoch": 0.23684210526315788,
187
+ "grad_norm": 10.505846566132414,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -0.7741842269897461,
190
+ "logits/rejected": -0.8124099373817444,
191
+ "logps/chosen": -155.53382873535156,
192
+ "logps/rejected": -194.9011993408203,
193
+ "loss": 0.4954,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": -0.0022989464923739433,
196
+ "rewards/margins": 0.00448720995336771,
197
+ "rewards/rejected": -0.006786155980080366,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -0.8215097188949585,
202
+ "debug/policy_chosen_logps": -169.93019104003906,
203
+ "debug/policy_rejected_logits": -1.0346907377243042,
204
+ "debug/policy_rejected_logps": -170.85223388671875,
205
+ "debug/reference_chosen_logps": -171.34588623046875,
206
+ "debug/reference_rejected_logps": -169.4602508544922,
207
+ "epoch": 0.2631578947368421,
208
+ "grad_norm": 10.830589104143412,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -0.8215097188949585,
211
+ "logits/rejected": -1.0346907377243042,
212
+ "logps/chosen": -169.93019104003906,
213
+ "logps/rejected": -170.85223388671875,
214
+ "loss": 0.4913,
215
+ "rewards/accuracies": 0.75,
216
+ "rewards/chosen": 0.014157085679471493,
217
+ "rewards/margins": 0.02807692438364029,
218
+ "rewards/rejected": -0.013919839635491371,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -0.979742705821991,
223
+ "debug/policy_chosen_logps": -150.38729858398438,
224
+ "debug/policy_rejected_logits": -0.8156647086143494,
225
+ "debug/policy_rejected_logps": -194.0210418701172,
226
+ "debug/reference_chosen_logps": -149.69064331054688,
227
+ "debug/reference_rejected_logps": -192.29783630371094,
228
+ "epoch": 0.2894736842105263,
229
+ "grad_norm": 11.460987827797899,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -0.979742705821991,
232
+ "logits/rejected": -0.8156647086143494,
233
+ "logps/chosen": -150.38729858398438,
234
+ "logps/rejected": -194.0210418701172,
235
+ "loss": 0.494,
236
+ "rewards/accuracies": 0.625,
237
+ "rewards/chosen": -0.006966524291783571,
238
+ "rewards/margins": 0.010265503078699112,
239
+ "rewards/rejected": -0.017232026904821396,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -0.9658709764480591,
244
+ "debug/policy_chosen_logps": -156.47149658203125,
245
+ "debug/policy_rejected_logits": -0.9509150385856628,
246
+ "debug/policy_rejected_logps": -190.25225830078125,
247
+ "debug/reference_chosen_logps": -156.70692443847656,
248
+ "debug/reference_rejected_logps": -187.9228515625,
249
+ "epoch": 0.3157894736842105,
250
+ "grad_norm": 12.5990794235455,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -0.9658709764480591,
253
+ "logits/rejected": -0.9509150385856628,
254
+ "logps/chosen": -156.47149658203125,
255
+ "logps/rejected": -190.25225830078125,
256
+ "loss": 0.5009,
257
+ "rewards/accuracies": 0.875,
258
+ "rewards/chosen": 0.002354326192289591,
259
+ "rewards/margins": 0.02564830705523491,
260
+ "rewards/rejected": -0.023293981328606606,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -0.9551741480827332,
265
+ "debug/policy_chosen_logps": -159.93605041503906,
266
+ "debug/policy_rejected_logits": -1.077520489692688,
267
+ "debug/policy_rejected_logps": -175.4044189453125,
268
+ "debug/reference_chosen_logps": -162.68727111816406,
269
+ "debug/reference_rejected_logps": -176.97789001464844,
270
+ "epoch": 0.34210526315789475,
271
+ "grad_norm": 11.764195807799357,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -0.9551741480827332,
274
+ "logits/rejected": -1.077520489692688,
275
+ "logps/chosen": -159.93605041503906,
276
+ "logps/rejected": -175.4044189453125,
277
+ "loss": 0.4988,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": 0.02751227281987667,
280
+ "rewards/margins": 0.011777523905038834,
281
+ "rewards/rejected": 0.015734750777482986,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -0.8166245222091675,
286
+ "debug/policy_chosen_logps": -192.5528106689453,
287
+ "debug/policy_rejected_logits": -0.7194980382919312,
288
+ "debug/policy_rejected_logps": -193.76840209960938,
289
+ "debug/reference_chosen_logps": -192.17568969726562,
290
+ "debug/reference_rejected_logps": -193.7100372314453,
291
+ "epoch": 0.3684210526315789,
292
+ "grad_norm": 13.966553473864488,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -0.8166245222091675,
295
+ "logits/rejected": -0.7194980382919312,
296
+ "logps/chosen": -192.5528106689453,
297
+ "logps/rejected": -193.76840209960938,
298
+ "loss": 0.4966,
299
+ "rewards/accuracies": 0.625,
300
+ "rewards/chosen": -0.0037712082266807556,
301
+ "rewards/margins": -0.0031875791028141975,
302
+ "rewards/rejected": -0.000583629822358489,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -0.8419144749641418,
307
+ "debug/policy_chosen_logps": -156.88470458984375,
308
+ "debug/policy_rejected_logits": -0.7425439357757568,
309
+ "debug/policy_rejected_logps": -156.3319091796875,
310
+ "debug/reference_chosen_logps": -157.73068237304688,
311
+ "debug/reference_rejected_logps": -156.30401611328125,
312
+ "epoch": 0.39473684210526316,
313
+ "grad_norm": 12.533768329750274,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -0.8419144749641418,
316
+ "logits/rejected": -0.7425439357757568,
317
+ "logps/chosen": -156.88470458984375,
318
+ "logps/rejected": -156.3319091796875,
319
+ "loss": 0.5016,
320
+ "rewards/accuracies": 0.375,
321
+ "rewards/chosen": 0.008459766395390034,
322
+ "rewards/margins": 0.008738689124584198,
323
+ "rewards/rejected": -0.00027891993522644043,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -0.9986197352409363,
328
+ "debug/policy_chosen_logps": -178.00741577148438,
329
+ "debug/policy_rejected_logits": -1.0260645151138306,
330
+ "debug/policy_rejected_logps": -168.3215789794922,
331
+ "debug/reference_chosen_logps": -175.48635864257812,
332
+ "debug/reference_rejected_logps": -159.8695068359375,
333
+ "epoch": 0.42105263157894735,
334
+ "grad_norm": 13.68956407505082,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -0.9986197352409363,
337
+ "logits/rejected": -1.0260645151138306,
338
+ "logps/chosen": -178.00741577148438,
339
+ "logps/rejected": -168.3215789794922,
340
+ "loss": 0.5002,
341
+ "rewards/accuracies": 0.875,
342
+ "rewards/chosen": -0.02521066553890705,
343
+ "rewards/margins": 0.059310123324394226,
344
+ "rewards/rejected": -0.08452078700065613,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -0.9570299983024597,
349
+ "debug/policy_chosen_logps": -155.34194946289062,
350
+ "debug/policy_rejected_logits": -0.8540157079696655,
351
+ "debug/policy_rejected_logps": -158.7489013671875,
352
+ "debug/reference_chosen_logps": -153.97943115234375,
353
+ "debug/reference_rejected_logps": -161.26361083984375,
354
+ "epoch": 0.4473684210526316,
355
+ "grad_norm": 11.796187058429084,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -0.9570299983024597,
358
+ "logits/rejected": -0.8540157079696655,
359
+ "logps/chosen": -155.34194946289062,
360
+ "logps/rejected": -158.7489013671875,
361
+ "loss": 0.4944,
362
+ "rewards/accuracies": 0.25,
363
+ "rewards/chosen": -0.013625269755721092,
364
+ "rewards/margins": -0.03877229616045952,
365
+ "rewards/rejected": 0.025147024542093277,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -0.9488164186477661,
370
+ "debug/policy_chosen_logps": -158.7830047607422,
371
+ "debug/policy_rejected_logits": -0.6659660935401917,
372
+ "debug/policy_rejected_logps": -177.89169311523438,
373
+ "debug/reference_chosen_logps": -160.00875854492188,
374
+ "debug/reference_rejected_logps": -178.63717651367188,
375
+ "epoch": 0.47368421052631576,
376
+ "grad_norm": 12.509178210406022,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -0.9488164186477661,
379
+ "logits/rejected": -0.6659660935401917,
380
+ "logps/chosen": -158.7830047607422,
381
+ "logps/rejected": -177.89169311523438,
382
+ "loss": 0.5023,
383
+ "rewards/accuracies": 0.625,
384
+ "rewards/chosen": 0.012257632799446583,
385
+ "rewards/margins": 0.004802837502211332,
386
+ "rewards/rejected": 0.007454794831573963,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -1.0241601467132568,
391
+ "debug/policy_chosen_logps": -146.33169555664062,
392
+ "debug/policy_rejected_logits": -0.9203600883483887,
393
+ "debug/policy_rejected_logps": -185.3747100830078,
394
+ "debug/reference_chosen_logps": -144.0775146484375,
395
+ "debug/reference_rejected_logps": -182.81109619140625,
396
+ "epoch": 0.5,
397
+ "grad_norm": 14.193595912188917,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -1.0241601467132568,
400
+ "logits/rejected": -0.9203600883483887,
401
+ "logps/chosen": -146.33169555664062,
402
+ "logps/rejected": -185.3747100830078,
403
+ "loss": 0.4921,
404
+ "rewards/accuracies": 0.625,
405
+ "rewards/chosen": -0.022541627287864685,
406
+ "rewards/margins": 0.003094470128417015,
407
+ "rewards/rejected": -0.02563609927892685,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -0.7719177603721619,
412
+ "debug/policy_chosen_logps": -170.9295196533203,
413
+ "debug/policy_rejected_logits": -0.8293173909187317,
414
+ "debug/policy_rejected_logps": -213.50628662109375,
415
+ "debug/reference_chosen_logps": -173.196044921875,
416
+ "debug/reference_rejected_logps": -216.7070770263672,
417
+ "epoch": 0.5263157894736842,
418
+ "grad_norm": 12.254973925319538,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -0.7719177603721619,
421
+ "logits/rejected": -0.8293173909187317,
422
+ "logps/chosen": -170.9295196533203,
423
+ "logps/rejected": -213.50628662109375,
424
+ "loss": 0.5029,
425
+ "rewards/accuracies": 0.25,
426
+ "rewards/chosen": 0.02266528084874153,
427
+ "rewards/margins": -0.009342546574771404,
428
+ "rewards/rejected": 0.03200782835483551,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -0.8476613163948059,
433
+ "debug/policy_chosen_logps": -176.2569122314453,
434
+ "debug/policy_rejected_logits": -0.8481642007827759,
435
+ "debug/policy_rejected_logps": -157.81607055664062,
436
+ "debug/reference_chosen_logps": -175.54322814941406,
437
+ "debug/reference_rejected_logps": -157.49563598632812,
438
+ "epoch": 0.5526315789473685,
439
+ "grad_norm": 13.463741700626029,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -0.8476613163948059,
442
+ "logits/rejected": -0.8481642007827759,
443
+ "logps/chosen": -176.2569122314453,
444
+ "logps/rejected": -157.81607055664062,
445
+ "loss": 0.5035,
446
+ "rewards/accuracies": 0.375,
447
+ "rewards/chosen": -0.007136850152164698,
448
+ "rewards/margins": -0.00393272377550602,
449
+ "rewards/rejected": -0.0032041254453361034,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -0.9509191513061523,
454
+ "debug/policy_chosen_logps": -163.81971740722656,
455
+ "debug/policy_rejected_logits": -1.03162682056427,
456
+ "debug/policy_rejected_logps": -166.19873046875,
457
+ "debug/reference_chosen_logps": -163.58975219726562,
458
+ "debug/reference_rejected_logps": -163.93687438964844,
459
+ "epoch": 0.5789473684210527,
460
+ "grad_norm": 12.346043910937565,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -0.9509191513061523,
463
+ "logits/rejected": -1.03162682056427,
464
+ "logps/chosen": -163.81971740722656,
465
+ "logps/rejected": -166.19873046875,
466
+ "loss": 0.5033,
467
+ "rewards/accuracies": 0.5,
468
+ "rewards/chosen": -0.0022995760664343834,
469
+ "rewards/margins": 0.020319033414125443,
470
+ "rewards/rejected": -0.02261860854923725,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -0.747829794883728,
475
+ "debug/policy_chosen_logps": -205.87387084960938,
476
+ "debug/policy_rejected_logits": -0.7931165099143982,
477
+ "debug/policy_rejected_logps": -202.55450439453125,
478
+ "debug/reference_chosen_logps": -207.89743041992188,
479
+ "debug/reference_rejected_logps": -202.11566162109375,
480
+ "epoch": 0.6052631578947368,
481
+ "grad_norm": 13.406646451993275,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -0.747829794883728,
484
+ "logits/rejected": -0.7931165099143982,
485
+ "logps/chosen": -205.87387084960938,
486
+ "logps/rejected": -202.55450439453125,
487
+ "loss": 0.5008,
488
+ "rewards/accuracies": 0.75,
489
+ "rewards/chosen": 0.020235728472471237,
490
+ "rewards/margins": 0.024624040350317955,
491
+ "rewards/rejected": -0.004388311877846718,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -0.7797695994377136,
496
+ "debug/policy_chosen_logps": -183.11962890625,
497
+ "debug/policy_rejected_logits": -0.8020097017288208,
498
+ "debug/policy_rejected_logps": -179.82424926757812,
499
+ "debug/reference_chosen_logps": -182.20074462890625,
500
+ "debug/reference_rejected_logps": -179.76992797851562,
501
+ "epoch": 0.631578947368421,
502
+ "grad_norm": 13.008691124944571,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -0.7797695994377136,
505
+ "logits/rejected": -0.8020097017288208,
506
+ "logps/chosen": -183.11962890625,
507
+ "logps/rejected": -179.82424926757812,
508
+ "loss": 0.5058,
509
+ "rewards/accuracies": 0.375,
510
+ "rewards/chosen": -0.00918867252767086,
511
+ "rewards/margins": -0.008645400404930115,
512
+ "rewards/rejected": -0.0005432693287730217,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -0.9057678580284119,
517
+ "debug/policy_chosen_logps": -176.46397399902344,
518
+ "debug/policy_rejected_logits": -0.7624039649963379,
519
+ "debug/policy_rejected_logps": -191.3661346435547,
520
+ "debug/reference_chosen_logps": -175.11502075195312,
521
+ "debug/reference_rejected_logps": -189.20126342773438,
522
+ "epoch": 0.6578947368421053,
523
+ "grad_norm": 12.793107149155997,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -0.9057678580284119,
526
+ "logits/rejected": -0.7624039649963379,
527
+ "logps/chosen": -176.46397399902344,
528
+ "logps/rejected": -191.3661346435547,
529
+ "loss": 0.4976,
530
+ "rewards/accuracies": 0.75,
531
+ "rewards/chosen": -0.01348949410021305,
532
+ "rewards/margins": 0.008159112185239792,
533
+ "rewards/rejected": -0.021648606285452843,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -0.8435552716255188,
538
+ "debug/policy_chosen_logps": -168.30003356933594,
539
+ "debug/policy_rejected_logits": -0.8235043883323669,
540
+ "debug/policy_rejected_logps": -181.6990203857422,
541
+ "debug/reference_chosen_logps": -170.16717529296875,
542
+ "debug/reference_rejected_logps": -178.15789794921875,
543
+ "epoch": 0.6842105263157895,
544
+ "grad_norm": 16.18283932437879,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -0.8435552716255188,
547
+ "logits/rejected": -0.8235043883323669,
548
+ "logps/chosen": -168.30003356933594,
549
+ "logps/rejected": -181.6990203857422,
550
+ "loss": 0.5103,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": 0.018671445548534393,
553
+ "rewards/margins": 0.05408259481191635,
554
+ "rewards/rejected": -0.03541114926338196,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -0.89864182472229,
559
+ "debug/policy_chosen_logps": -190.04278564453125,
560
+ "debug/policy_rejected_logits": -0.765967845916748,
561
+ "debug/policy_rejected_logps": -206.12942504882812,
562
+ "debug/reference_chosen_logps": -186.39132690429688,
563
+ "debug/reference_rejected_logps": -204.19808959960938,
564
+ "epoch": 0.7105263157894737,
565
+ "grad_norm": 11.950723408804826,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -0.89864182472229,
568
+ "logits/rejected": -0.765967845916748,
569
+ "logps/chosen": -190.04278564453125,
570
+ "logps/rejected": -206.12942504882812,
571
+ "loss": 0.4956,
572
+ "rewards/accuracies": 0.25,
573
+ "rewards/chosen": -0.0365147590637207,
574
+ "rewards/margins": -0.01720167137682438,
575
+ "rewards/rejected": -0.019313087686896324,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -0.9197551608085632,
580
+ "debug/policy_chosen_logps": -159.85580444335938,
581
+ "debug/policy_rejected_logits": -0.8922968506813049,
582
+ "debug/policy_rejected_logps": -154.68045043945312,
583
+ "debug/reference_chosen_logps": -157.56961059570312,
584
+ "debug/reference_rejected_logps": -153.11502075195312,
585
+ "epoch": 0.7368421052631579,
586
+ "grad_norm": 11.128732665194095,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -0.9197551608085632,
589
+ "logits/rejected": -0.8922968506813049,
590
+ "logps/chosen": -159.85580444335938,
591
+ "logps/rejected": -154.68045043945312,
592
+ "loss": 0.491,
593
+ "rewards/accuracies": 0.375,
594
+ "rewards/chosen": -0.022861871868371964,
595
+ "rewards/margins": -0.007207621354609728,
596
+ "rewards/rejected": -0.015654249116778374,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -0.9484947919845581,
601
+ "debug/policy_chosen_logps": -152.24813842773438,
602
+ "debug/policy_rejected_logits": -0.8805264234542847,
603
+ "debug/policy_rejected_logps": -168.650634765625,
604
+ "debug/reference_chosen_logps": -152.11996459960938,
605
+ "debug/reference_rejected_logps": -169.3289794921875,
606
+ "epoch": 0.7631578947368421,
607
+ "grad_norm": 12.54768054735427,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -0.9484947919845581,
610
+ "logits/rejected": -0.8805264234542847,
611
+ "logps/chosen": -152.24813842773438,
612
+ "logps/rejected": -168.650634765625,
613
+ "loss": 0.5024,
614
+ "rewards/accuracies": 0.375,
615
+ "rewards/chosen": -0.001281691249459982,
616
+ "rewards/margins": -0.008065233007073402,
617
+ "rewards/rejected": 0.0067835417576134205,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -0.9098101258277893,
622
+ "debug/policy_chosen_logps": -144.8257598876953,
623
+ "debug/policy_rejected_logits": -0.9026345014572144,
624
+ "debug/policy_rejected_logps": -178.5587921142578,
625
+ "debug/reference_chosen_logps": -146.36148071289062,
626
+ "debug/reference_rejected_logps": -177.2120361328125,
627
+ "epoch": 0.7894736842105263,
628
+ "grad_norm": 11.052110219658221,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -0.9098101258277893,
631
+ "logits/rejected": -0.9026345014572144,
632
+ "logps/chosen": -144.8257598876953,
633
+ "logps/rejected": -178.5587921142578,
634
+ "loss": 0.4892,
635
+ "rewards/accuracies": 0.75,
636
+ "rewards/chosen": 0.015357255935668945,
637
+ "rewards/margins": 0.028824787586927414,
638
+ "rewards/rejected": -0.013467530719935894,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -0.8571977615356445,
643
+ "debug/policy_chosen_logps": -149.52532958984375,
644
+ "debug/policy_rejected_logits": -0.8092418313026428,
645
+ "debug/policy_rejected_logps": -160.9595947265625,
646
+ "debug/reference_chosen_logps": -151.14004516601562,
647
+ "debug/reference_rejected_logps": -160.58103942871094,
648
+ "epoch": 0.8157894736842105,
649
+ "grad_norm": 11.744991657971045,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -0.8571977615356445,
652
+ "logits/rejected": -0.8092418313026428,
653
+ "logps/chosen": -149.52532958984375,
654
+ "logps/rejected": -160.9595947265625,
655
+ "loss": 0.5024,
656
+ "rewards/accuracies": 0.625,
657
+ "rewards/chosen": 0.016147155314683914,
658
+ "rewards/margins": 0.019932862371206284,
659
+ "rewards/rejected": -0.00378570519387722,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -0.8337549567222595,
664
+ "debug/policy_chosen_logps": -145.87689208984375,
665
+ "debug/policy_rejected_logits": -0.774303138256073,
666
+ "debug/policy_rejected_logps": -159.3026123046875,
667
+ "debug/reference_chosen_logps": -147.57308959960938,
668
+ "debug/reference_rejected_logps": -158.9627685546875,
669
+ "epoch": 0.8421052631578947,
670
+ "grad_norm": 10.910880747605056,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -0.8337549567222595,
673
+ "logits/rejected": -0.774303138256073,
674
+ "logps/chosen": -145.87689208984375,
675
+ "logps/rejected": -159.3026123046875,
676
+ "loss": 0.491,
677
+ "rewards/accuracies": 0.75,
678
+ "rewards/chosen": 0.01696179248392582,
679
+ "rewards/margins": 0.020359963178634644,
680
+ "rewards/rejected": -0.0033981711603701115,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -0.9494245648384094,
685
+ "debug/policy_chosen_logps": -162.24545288085938,
686
+ "debug/policy_rejected_logits": -1.0418776273727417,
687
+ "debug/policy_rejected_logps": -194.7138671875,
688
+ "debug/reference_chosen_logps": -162.0795135498047,
689
+ "debug/reference_rejected_logps": -193.1683807373047,
690
+ "epoch": 0.868421052631579,
691
+ "grad_norm": 10.621881858417531,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -0.9494245648384094,
694
+ "logits/rejected": -1.0418776273727417,
695
+ "logps/chosen": -162.24545288085938,
696
+ "logps/rejected": -194.7138671875,
697
+ "loss": 0.5043,
698
+ "rewards/accuracies": 0.75,
699
+ "rewards/chosen": -0.0016593635082244873,
700
+ "rewards/margins": 0.013795491307973862,
701
+ "rewards/rejected": -0.015454854816198349,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -0.7892115712165833,
706
+ "debug/policy_chosen_logps": -153.41824340820312,
707
+ "debug/policy_rejected_logits": -0.7181702852249146,
708
+ "debug/policy_rejected_logps": -175.0258026123047,
709
+ "debug/reference_chosen_logps": -152.8340606689453,
710
+ "debug/reference_rejected_logps": -174.9475555419922,
711
+ "epoch": 0.8947368421052632,
712
+ "grad_norm": 11.510187964278074,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -0.7892115712165833,
715
+ "logits/rejected": -0.7181702852249146,
716
+ "logps/chosen": -153.41824340820312,
717
+ "logps/rejected": -175.0258026123047,
718
+ "loss": 0.4869,
719
+ "rewards/accuracies": 0.625,
720
+ "rewards/chosen": -0.005841732025146484,
721
+ "rewards/margins": -0.005059261806309223,
722
+ "rewards/rejected": -0.0007824706844985485,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -0.6696067452430725,
727
+ "debug/policy_chosen_logps": -184.62283325195312,
728
+ "debug/policy_rejected_logits": -0.6000730395317078,
729
+ "debug/policy_rejected_logps": -208.19822692871094,
730
+ "debug/reference_chosen_logps": -184.1222686767578,
731
+ "debug/reference_rejected_logps": -207.0794677734375,
732
+ "epoch": 0.9210526315789473,
733
+ "grad_norm": 10.959485144211554,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -0.6696067452430725,
736
+ "logits/rejected": -0.6000730395317078,
737
+ "logps/chosen": -184.62283325195312,
738
+ "logps/rejected": -208.19822692871094,
739
+ "loss": 0.4963,
740
+ "rewards/accuracies": 0.75,
741
+ "rewards/chosen": -0.005005750805139542,
742
+ "rewards/margins": 0.006181859411299229,
743
+ "rewards/rejected": -0.011187611147761345,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -0.8848521113395691,
748
+ "debug/policy_chosen_logps": -160.16763305664062,
749
+ "debug/policy_rejected_logits": -0.8665605187416077,
750
+ "debug/policy_rejected_logps": -213.43040466308594,
751
+ "debug/reference_chosen_logps": -160.1614990234375,
752
+ "debug/reference_rejected_logps": -211.8297882080078,
753
+ "epoch": 0.9473684210526315,
754
+ "grad_norm": 11.865862578065203,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -0.8848521113395691,
757
+ "logits/rejected": -0.8665605187416077,
758
+ "logps/chosen": -160.16763305664062,
759
+ "logps/rejected": -213.43040466308594,
760
+ "loss": 0.5022,
761
+ "rewards/accuracies": 0.625,
762
+ "rewards/chosen": -6.138812750577927e-05,
763
+ "rewards/margins": 0.015944700688123703,
764
+ "rewards/rejected": -0.016006087884306908,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -0.6179525256156921,
769
+ "debug/policy_chosen_logps": -215.09716796875,
770
+ "debug/policy_rejected_logits": -0.8251385688781738,
771
+ "debug/policy_rejected_logps": -191.83261108398438,
772
+ "debug/reference_chosen_logps": -214.5953369140625,
773
+ "debug/reference_rejected_logps": -192.8199462890625,
774
+ "epoch": 0.9736842105263158,
775
+ "grad_norm": 12.158709477313657,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -0.6179525256156921,
778
+ "logits/rejected": -0.8251385688781738,
779
+ "logps/chosen": -215.09716796875,
780
+ "logps/rejected": -191.83261108398438,
781
+ "loss": 0.4844,
782
+ "rewards/accuracies": 0.375,
783
+ "rewards/chosen": -0.005018271971493959,
784
+ "rewards/margins": -0.014891558326780796,
785
+ "rewards/rejected": 0.00987328588962555,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -0.794685959815979,
790
+ "debug/policy_chosen_logps": -220.12693786621094,
791
+ "debug/policy_rejected_logits": -0.8812568187713623,
792
+ "debug/policy_rejected_logps": -166.08612060546875,
793
+ "debug/reference_chosen_logps": -219.10116577148438,
794
+ "debug/reference_rejected_logps": -165.18740844726562,
795
+ "epoch": 1.0,
796
+ "grad_norm": 10.941646073739074,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -0.794685959815979,
799
+ "logits/rejected": -0.8812568187713623,
800
+ "logps/chosen": -220.12693786621094,
801
+ "logps/rejected": -166.08612060546875,
802
+ "loss": 0.4589,
803
+ "rewards/accuracies": 0.375,
804
+ "rewards/chosen": -0.010257730260491371,
805
+ "rewards/margins": -0.001270495355129242,
806
+ "rewards/rejected": -0.00898723490536213,
807
+ "step": 38
808
+ },
809
+ {
810
+ "epoch": 1.0,
811
+ "step": 38,
812
+ "total_flos": 0.0,
813
+ "train_loss": 0.49694290051334783,
814
+ "train_runtime": 142.4117,
815
+ "train_samples_per_second": 16.923,
816
+ "train_steps_per_second": 0.267
817
+ }
818
+ ],
819
+ "logging_steps": 1,
820
+ "max_steps": 38,
821
+ "num_input_tokens_seen": 0,
822
+ "num_train_epochs": 1,
823
+ "save_steps": 500,
824
+ "stateful_callbacks": {
825
+ "TrainerControl": {
826
+ "args": {
827
+ "should_epoch_stop": false,
828
+ "should_evaluate": false,
829
+ "should_log": false,
830
+ "should_save": true,
831
+ "should_training_stop": true
832
+ },
833
+ "attributes": {}
834
+ }
835
+ },
836
+ "total_flos": 0.0,
837
+ "train_batch_size": 8,
838
+ "trial_name": null,
839
+ "trial_params": null
840
+ }