wzhouad committed on
Commit
6312bba
1 Parent(s): c475147

Model save

Browse files
README.md CHANGED
@@ -17,18 +17,18 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.1314
21
- - Rewards/chosen: -1.5200
22
- - Rewards/rejected: -2.4344
23
- - Rewards/accuracies: 0.75
24
- - Rewards/margins: 0.9144
25
- - Logps/rejected: -500.7934
26
- - Logps/chosen: -409.0388
27
- - Logits/rejected: -2.1508
28
- - Logits/chosen: -2.1830
29
- - Debug/policy Weights: 0.2589
30
- - Debug/losses: 0.1297
31
- - Debug/raw Losses: 0.4817
32
 
33
  ## Model description
34
 
@@ -65,15 +65,15 @@ The following hyperparameters were used during training:
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Debug/policy Weights | Debug/losses | Debug/raw Losses |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:--------------------:|:------------:|:----------------:|
68
- | 0.2168 | 0.21 | 100 | 0.2150 | -0.5440 | -1.0580 | 0.7383 | 0.5141 | -363.1571 | -311.4377 | -2.6827 | -2.6979 | 0.3735 | 0.2082 | 0.5529 |
69
- | 0.1396 | 0.42 | 200 | 0.1416 | -1.3480 | -2.1286 | 0.7656 | 0.7807 | -470.2158 | -391.8350 | -2.2733 | -2.2968 | 0.2687 | 0.1390 | 0.5030 |
70
- | 0.1294 | 0.63 | 300 | 0.1309 | -1.6003 | -2.4486 | 0.7383 | 0.8483 | -502.2112 | -417.0714 | -2.1589 | -2.1885 | 0.2545 | 0.1284 | 0.4935 |
71
- | 0.1329 | 0.84 | 400 | 0.1314 | -1.5200 | -2.4344 | 0.75 | 0.9144 | -500.7934 | -409.0388 | -2.1508 | -2.1830 | 0.2589 | 0.1297 | 0.4817 |
72
 
73
 
74
  ### Framework versions
75
 
76
- - Transformers 4.35.2
77
  - Pytorch 2.1.2+cu121
78
  - Datasets 2.14.6
79
- - Tokenizers 0.14.1
 
17
 
18
  This model is a fine-tuned version of [HuggingFaceH4/mistral-7b-sft-beta](https://huggingface.co/HuggingFaceH4/mistral-7b-sft-beta) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.1422
21
+ - Rewards/chosen: -1.3154
22
+ - Rewards/rejected: -2.2768
23
+ - Rewards/accuracies: 0.7617
24
+ - Rewards/margins: 0.9613
25
+ - Logps/rejected: -483.9327
26
+ - Logps/chosen: -386.7366
27
+ - Logits/rejected: -2.1695
28
+ - Logits/chosen: -2.2036
29
+ - Debug/policy Weights: 0.2815
30
+ - Debug/losses: 0.1397
31
+ - Debug/raw Losses: 0.4727
32
 
33
  ## Model description
34
 
 
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Debug/policy Weights | Debug/losses | Debug/raw Losses |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:--------------------:|:------------:|:----------------:|
68
+ | 0.1781 | 0.21 | 100 | 0.2007 | -0.6478 | -1.1693 | 0.7344 | 0.5214 | -373.1867 | -319.9806 | -2.6910 | -2.7080 | 0.3512 | 0.1953 | 0.5590 |
69
+ | 0.1616 | 0.42 | 200 | 0.1669 | -0.8830 | -1.6003 | 0.7109 | 0.7173 | -416.2844 | -343.4914 | -2.4277 | -2.4499 | 0.3174 | 0.1671 | 0.5079 |
70
+ | 0.1343 | 0.63 | 300 | 0.1368 | -1.5021 | -2.3715 | 0.7578 | 0.8695 | -493.4114 | -405.4042 | -2.2283 | -2.2618 | 0.2666 | 0.1365 | 0.4953 |
71
+ | 0.1398 | 0.84 | 400 | 0.1422 | -1.3154 | -2.2768 | 0.7617 | 0.9613 | -483.9327 | -386.7366 | -2.1695 | -2.2036 | 0.2815 | 0.1397 | 0.4727 |
72
 
73
 
74
  ### Framework versions
75
 
76
+ - Transformers 4.39.3
77
  - Pytorch 2.1.2+cu121
78
  - Datasets 2.14.6
79
+ - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.17621936496331603,
4
- "train_runtime": 4510.4366,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 13.554,
7
- "train_steps_per_second": 0.106
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17903520272865456,
4
+ "train_runtime": 4631.245,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.2,
7
+ "train_steps_per_second": 0.103
8
  }
config.json CHANGED
@@ -3,6 +3,7 @@
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
 
6
  "bos_token_id": 1,
7
  "eos_token_id": 2,
8
  "hidden_act": "silu",
@@ -19,7 +20,7 @@
19
  "sliding_window": 4096,
20
  "tie_word_embeddings": false,
21
  "torch_dtype": "bfloat16",
22
- "transformers_version": "4.35.2",
23
  "use_cache": false,
24
  "vocab_size": 32000
25
  }
 
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
6
+ "attention_dropout": 0.0,
7
  "bos_token_id": 1,
8
  "eos_token_id": 2,
9
  "hidden_act": "silu",
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.39.3",
24
  "use_cache": false,
25
  "vocab_size": 32000
26
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.35.2"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.39.3"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09cc7ca4a2419236436e6c007340b593bba2a32c8ae2632430bedae57caa0e7d
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d00ba6a8e8636533d5f9f026b67bade132d932eb7e25cf20a5471d14cf4938
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a18ed3e61ca7e185538cfad27e95e49182c44b8e7cf83e0a3e9e75dfa4402b4
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a726c7d9b0e02053f892980f475aa8bdf053ba38518c5ec8f29a48cd43f1e728
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe431dd8a696ee319a6385010068d99a7950c45d95d27f41443264941966e681
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb64bdbf0bdca4349e17eb7d2424fa102b61d7e91947dde62627f2995cbf4b8
3
  size 4540516344
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.17621936496331603,
4
- "train_runtime": 4510.4366,
5
  "train_samples": 61134,
6
- "train_samples_per_second": 13.554,
7
- "train_steps_per_second": 0.106
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17903520272865456,
4
+ "train_runtime": 4631.245,
5
  "train_samples": 61134,
6
+ "train_samples_per_second": 13.2,
7
+ "train_steps_per_second": 0.103
8
  }
trainer_state.json CHANGED
@@ -9,16 +9,17 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "debug/losses": 0.34217238426208496,
13
- "debug/policy_weights": 0.4936503767967224,
14
  "debug/raw_losses": 0.6931471824645996,
15
  "epoch": 0.0,
 
16
  "learning_rate": 1.0416666666666666e-08,
17
- "logits/chosen": -2.8099329471588135,
18
- "logits/rejected": -2.7572641372680664,
19
- "logps/chosen": -241.48843383789062,
20
- "logps/rejected": -197.4517822265625,
21
- "loss": 0.3561,
22
  "rewards/accuracies": 0.0,
23
  "rewards/chosen": 0.0,
24
  "rewards/margins": 0.0,
@@ -26,895 +27,944 @@
26
  "step": 1
27
  },
28
  {
29
- "debug/losses": 0.3613118529319763,
30
- "debug/policy_weights": 0.5213115215301514,
31
- "debug/raw_losses": 0.6931909918785095,
32
  "epoch": 0.02,
 
33
  "learning_rate": 1.0416666666666667e-07,
34
- "logits/chosen": -2.8320045471191406,
35
- "logits/rejected": -2.8085670471191406,
36
- "logps/chosen": -292.685546875,
37
- "logps/rejected": -278.5729064941406,
38
- "loss": 0.3674,
39
  "rewards/accuracies": 0.4236111044883728,
40
- "rewards/chosen": 5.248460729490034e-05,
41
- "rewards/margins": -7.99686458776705e-05,
42
- "rewards/rejected": 0.00013245324953459203,
43
  "step": 10
44
  },
45
  {
46
- "debug/losses": 0.3490375578403473,
47
- "debug/policy_weights": 0.5044432878494263,
48
- "debug/raw_losses": 0.6918557286262512,
49
  "epoch": 0.04,
 
50
  "learning_rate": 2.0833333333333333e-07,
51
- "logits/chosen": -2.811972141265869,
52
- "logits/rejected": -2.78340482711792,
53
- "logps/chosen": -290.2806396484375,
54
- "logps/rejected": -290.8512268066406,
55
- "loss": 0.3549,
56
- "rewards/accuracies": 0.612500011920929,
57
- "rewards/chosen": 0.0011896035866811872,
58
- "rewards/margins": 0.0026031401939690113,
59
- "rewards/rejected": -0.0014135364908725023,
60
  "step": 20
61
  },
62
  {
63
- "debug/losses": 0.3571945130825043,
64
- "debug/policy_weights": 0.518287181854248,
65
- "debug/raw_losses": 0.6891354322433472,
66
  "epoch": 0.06,
 
67
  "learning_rate": 3.1249999999999997e-07,
68
- "logits/chosen": -2.759937047958374,
69
- "logits/rejected": -2.7286112308502197,
70
- "logps/chosen": -246.35159301757812,
71
- "logps/rejected": -227.08651733398438,
72
- "loss": 0.3602,
73
- "rewards/accuracies": 0.637499988079071,
74
- "rewards/chosen": 0.00010563675459707156,
75
- "rewards/margins": 0.008171903900802135,
76
- "rewards/rejected": -0.008066266775131226,
77
  "step": 30
78
  },
79
  {
80
- "debug/losses": 0.3552504777908325,
81
- "debug/policy_weights": 0.52684086561203,
82
- "debug/raw_losses": 0.6755487322807312,
83
  "epoch": 0.08,
 
84
  "learning_rate": 4.1666666666666667e-07,
85
- "logits/chosen": -2.7985873222351074,
86
- "logits/rejected": -2.7698562145233154,
87
- "logps/chosen": -298.9928283691406,
88
- "logps/rejected": -264.44781494140625,
89
- "loss": 0.3544,
90
- "rewards/accuracies": 0.7124999761581421,
91
- "rewards/chosen": 0.005288169719278812,
92
- "rewards/margins": 0.03713225945830345,
93
- "rewards/rejected": -0.031844086945056915,
94
  "step": 40
95
  },
96
  {
97
- "debug/losses": 0.3372410833835602,
98
- "debug/policy_weights": 0.50788414478302,
99
- "debug/raw_losses": 0.6635575890541077,
100
  "epoch": 0.1,
 
101
  "learning_rate": 4.999733114418725e-07,
102
- "logits/chosen": -2.719874143600464,
103
- "logits/rejected": -2.698538064956665,
104
- "logps/chosen": -279.17694091796875,
105
- "logps/rejected": -272.10687255859375,
106
- "loss": 0.3418,
107
- "rewards/accuracies": 0.65625,
108
- "rewards/chosen": -0.017464958131313324,
109
- "rewards/margins": 0.06663360446691513,
110
- "rewards/rejected": -0.08409856259822845,
111
  "step": 50
112
  },
113
  {
114
- "debug/losses": 0.32032984495162964,
115
- "debug/policy_weights": 0.49015456438064575,
116
- "debug/raw_losses": 0.6468743085861206,
117
  "epoch": 0.13,
 
118
  "learning_rate": 4.990398100856366e-07,
119
- "logits/chosen": -2.741781234741211,
120
- "logits/rejected": -2.718924045562744,
121
- "logps/chosen": -264.25994873046875,
122
- "logps/rejected": -249.49978637695312,
123
- "loss": 0.3195,
124
- "rewards/accuracies": 0.6499999761581421,
125
- "rewards/chosen": -0.05183681100606918,
126
- "rewards/margins": 0.11701079457998276,
127
- "rewards/rejected": -0.16884759068489075,
128
  "step": 60
129
  },
130
  {
131
- "debug/losses": 0.2948753237724304,
132
- "debug/policy_weights": 0.4617387652397156,
133
- "debug/raw_losses": 0.6441112756729126,
134
  "epoch": 0.15,
 
135
  "learning_rate": 4.967775735898179e-07,
136
- "logits/chosen": -2.7274184226989746,
137
- "logits/rejected": -2.697922706604004,
138
- "logps/chosen": -283.56719970703125,
139
- "logps/rejected": -265.3944091796875,
140
- "loss": 0.2944,
141
- "rewards/accuracies": 0.6499999761581421,
142
- "rewards/chosen": -0.15144629776477814,
143
- "rewards/margins": 0.15366096794605255,
144
- "rewards/rejected": -0.3051072657108307,
145
  "step": 70
146
  },
147
  {
148
- "debug/losses": 0.24030272662639618,
149
- "debug/policy_weights": 0.39300116896629333,
150
- "debug/raw_losses": 0.61235511302948,
151
  "epoch": 0.17,
 
152
  "learning_rate": 4.931986719649298e-07,
153
- "logits/chosen": -2.6943717002868652,
154
- "logits/rejected": -2.6917672157287598,
155
- "logps/chosen": -306.8209533691406,
156
- "logps/rejected": -331.4168701171875,
157
- "loss": 0.2366,
158
  "rewards/accuracies": 0.6875,
159
- "rewards/chosen": -0.36089134216308594,
160
- "rewards/margins": 0.27803000807762146,
161
- "rewards/rejected": -0.638921320438385,
162
  "step": 80
163
  },
164
  {
165
- "debug/losses": 0.21284589171409607,
166
- "debug/policy_weights": 0.36208364367485046,
167
- "debug/raw_losses": 0.6084356307983398,
168
  "epoch": 0.19,
 
169
  "learning_rate": 4.883222001996351e-07,
170
- "logits/chosen": -2.742755889892578,
171
- "logits/rejected": -2.7089426517486572,
172
- "logps/chosen": -306.11700439453125,
173
- "logps/rejected": -313.5643005371094,
174
- "loss": 0.2153,
175
- "rewards/accuracies": 0.675000011920929,
176
- "rewards/chosen": -0.5115253925323486,
177
- "rewards/margins": 0.27717387676239014,
178
- "rewards/rejected": -0.7886992692947388,
179
  "step": 90
180
  },
181
  {
182
- "debug/losses": 0.23982000350952148,
183
- "debug/policy_weights": 0.39243510365486145,
184
- "debug/raw_losses": 0.625305712223053,
185
  "epoch": 0.21,
 
186
  "learning_rate": 4.821741763807186e-07,
187
- "logits/chosen": -2.754852533340454,
188
- "logits/rejected": -2.7216262817382812,
189
- "logps/chosen": -346.60498046875,
190
- "logps/rejected": -359.0435485839844,
191
- "loss": 0.2168,
192
- "rewards/accuracies": 0.637499988079071,
193
- "rewards/chosen": -0.5716595649719238,
194
- "rewards/margins": 0.3401753306388855,
195
- "rewards/rejected": -0.9118350148200989,
196
  "step": 100
197
  },
198
  {
199
  "epoch": 0.21,
200
- "eval_debug/losses": 0.2082262486219406,
201
- "eval_debug/policy_weights": 0.37346428632736206,
202
- "eval_debug/raw_losses": 0.5528886318206787,
203
- "eval_logits/chosen": -2.697880268096924,
204
- "eval_logits/rejected": -2.6826982498168945,
205
- "eval_logps/chosen": -311.4377136230469,
206
- "eval_logps/rejected": -363.1571350097656,
207
- "eval_loss": 0.21503373980522156,
208
- "eval_rewards/accuracies": 0.73828125,
209
- "eval_rewards/chosen": -0.5439806580543518,
210
- "eval_rewards/margins": 0.5140582323074341,
211
- "eval_rewards/rejected": -1.0580389499664307,
212
- "eval_runtime": 53.0291,
213
- "eval_samples_per_second": 37.715,
214
- "eval_steps_per_second": 0.603,
215
  "step": 100
216
  },
217
  {
218
- "debug/losses": 0.1748097836971283,
219
- "debug/policy_weights": 0.3250483572483063,
220
- "debug/raw_losses": 0.5473231077194214,
221
  "epoch": 0.23,
 
222
  "learning_rate": 4.747874028753375e-07,
223
- "logits/chosen": -2.5498530864715576,
224
- "logits/rejected": -2.5195746421813965,
225
- "logps/chosen": -338.24639892578125,
226
- "logps/rejected": -351.0300598144531,
227
- "loss": 0.2041,
228
- "rewards/accuracies": 0.6875,
229
- "rewards/chosen": -0.6814893484115601,
230
- "rewards/margins": 0.6107165813446045,
231
- "rewards/rejected": -1.292205810546875,
232
  "step": 110
233
  },
234
  {
235
- "debug/losses": 0.1997809112071991,
236
- "debug/policy_weights": 0.35038530826568604,
237
- "debug/raw_losses": 0.5690494775772095,
238
  "epoch": 0.25,
 
239
  "learning_rate": 4.662012913161997e-07,
240
- "logits/chosen": -2.6230885982513428,
241
- "logits/rejected": -2.5906565189361572,
242
- "logps/chosen": -337.9828186035156,
243
- "logps/rejected": -383.8708801269531,
244
- "loss": 0.1885,
245
- "rewards/accuracies": 0.7250000238418579,
246
- "rewards/chosen": -0.6913173794746399,
247
- "rewards/margins": 0.5188819169998169,
248
- "rewards/rejected": -1.2101994752883911,
249
  "step": 120
250
  },
251
  {
252
- "debug/losses": 0.17902129888534546,
253
- "debug/policy_weights": 0.3178775906562805,
254
- "debug/raw_losses": 0.5685083866119385,
255
  "epoch": 0.27,
 
256
  "learning_rate": 4.5646165232345103e-07,
257
- "logits/chosen": -2.591015100479126,
258
- "logits/rejected": -2.576317548751831,
259
- "logps/chosen": -350.18768310546875,
260
- "logps/rejected": -370.1181640625,
261
- "loss": 0.1792,
262
- "rewards/accuracies": 0.6937500238418579,
263
- "rewards/chosen": -0.7569286227226257,
264
- "rewards/margins": 0.4377259314060211,
265
- "rewards/rejected": -1.1946544647216797,
266
  "step": 130
267
  },
268
  {
269
- "debug/losses": 0.18843333423137665,
270
- "debug/policy_weights": 0.3299049437046051,
271
- "debug/raw_losses": 0.550617516040802,
272
  "epoch": 0.29,
 
273
  "learning_rate": 4.456204510851956e-07,
274
- "logits/chosen": -2.570383071899414,
275
- "logits/rejected": -2.5405478477478027,
276
- "logps/chosen": -350.4234924316406,
277
- "logps/rejected": -398.32403564453125,
278
- "loss": 0.1848,
279
  "rewards/accuracies": 0.7250000238418579,
280
- "rewards/chosen": -0.69789057970047,
281
- "rewards/margins": 0.577714741230011,
282
- "rewards/rejected": -1.275605320930481,
283
  "step": 140
284
  },
285
  {
286
- "debug/losses": 0.18008050322532654,
287
- "debug/policy_weights": 0.34716594219207764,
288
- "debug/raw_losses": 0.5200980305671692,
289
  "epoch": 0.31,
 
290
  "learning_rate": 4.337355301007335e-07,
291
- "logits/chosen": -2.520066738128662,
292
- "logits/rejected": -2.5489494800567627,
293
- "logps/chosen": -280.36163330078125,
294
- "logps/rejected": -348.47869873046875,
295
- "loss": 0.1859,
296
- "rewards/accuracies": 0.793749988079071,
297
- "rewards/chosen": -0.5368272066116333,
298
- "rewards/margins": 0.585098147392273,
299
- "rewards/rejected": -1.1219253540039062,
300
  "step": 150
301
  },
302
  {
303
- "debug/losses": 0.17896616458892822,
304
- "debug/policy_weights": 0.3156106472015381,
305
- "debug/raw_losses": 0.599802553653717,
306
  "epoch": 0.33,
 
307
  "learning_rate": 4.2087030056579986e-07,
308
- "logits/chosen": -2.5032947063446045,
309
- "logits/rejected": -2.476680278778076,
310
- "logps/chosen": -341.430908203125,
311
- "logps/rejected": -376.94244384765625,
312
- "loss": 0.1759,
313
- "rewards/accuracies": 0.675000011920929,
314
- "rewards/chosen": -0.8545902967453003,
315
- "rewards/margins": 0.4560604691505432,
316
- "rewards/rejected": -1.3106508255004883,
317
  "step": 160
318
  },
319
  {
320
- "debug/losses": 0.12902560830116272,
321
- "debug/policy_weights": 0.25470516085624695,
322
- "debug/raw_losses": 0.5344475507736206,
323
  "epoch": 0.36,
 
324
  "learning_rate": 4.070934040463998e-07,
325
- "logits/chosen": -2.3815720081329346,
326
- "logits/rejected": -2.365286350250244,
327
- "logps/chosen": -394.91192626953125,
328
- "logps/rejected": -446.85321044921875,
329
- "loss": 0.1496,
330
- "rewards/accuracies": 0.7437499761581421,
331
- "rewards/chosen": -1.1274508237838745,
332
- "rewards/margins": 0.7393988966941833,
333
- "rewards/rejected": -1.866849660873413,
334
  "step": 170
335
  },
336
  {
337
- "debug/losses": 0.12488824129104614,
338
- "debug/policy_weights": 0.24605941772460938,
339
- "debug/raw_losses": 0.5098173022270203,
340
  "epoch": 0.38,
 
341
  "learning_rate": 3.9247834624635404e-07,
342
- "logits/chosen": -2.3833303451538086,
343
- "logits/rejected": -2.389314889907837,
344
- "logps/chosen": -389.9188232421875,
345
- "logps/rejected": -450.25140380859375,
346
- "loss": 0.1436,
347
- "rewards/accuracies": 0.768750011920929,
348
- "rewards/chosen": -1.2074089050292969,
349
- "rewards/margins": 0.7412688136100769,
350
- "rewards/rejected": -1.9486777782440186,
351
  "step": 180
352
  },
353
  {
354
- "debug/losses": 0.1292407363653183,
355
- "debug/policy_weights": 0.26770901679992676,
356
- "debug/raw_losses": 0.48357778787612915,
357
  "epoch": 0.4,
 
358
  "learning_rate": 3.7710310482256523e-07,
359
- "logits/chosen": -2.3599636554718018,
360
- "logits/rejected": -2.3223559856414795,
361
- "logps/chosen": -414.10699462890625,
362
- "logps/rejected": -469.5603942871094,
363
- "loss": 0.1421,
364
- "rewards/accuracies": 0.793749988079071,
365
- "rewards/chosen": -1.227112054824829,
366
- "rewards/margins": 0.8277386426925659,
367
- "rewards/rejected": -2.0548505783081055,
368
  "step": 190
369
  },
370
  {
371
- "debug/losses": 0.15382704138755798,
372
- "debug/policy_weights": 0.2786335349082947,
373
- "debug/raw_losses": 0.5533261299133301,
374
  "epoch": 0.42,
 
375
  "learning_rate": 3.610497133404795e-07,
376
- "logits/chosen": -2.2979869842529297,
377
- "logits/rejected": -2.312802791595459,
378
- "logps/chosen": -419.5997009277344,
379
- "logps/rejected": -456.7002868652344,
380
- "loss": 0.1396,
381
  "rewards/accuracies": 0.731249988079071,
382
- "rewards/chosen": -1.2343220710754395,
383
- "rewards/margins": 0.6895232200622559,
384
- "rewards/rejected": -1.9238452911376953,
385
  "step": 200
386
  },
387
  {
388
  "epoch": 0.42,
389
- "eval_debug/losses": 0.13899114727973938,
390
- "eval_debug/policy_weights": 0.2686985731124878,
391
- "eval_debug/raw_losses": 0.5029721260070801,
392
- "eval_logits/chosen": -2.2968499660491943,
393
- "eval_logits/rejected": -2.273340940475464,
394
- "eval_logps/chosen": -391.8349609375,
395
- "eval_logps/rejected": -470.2158203125,
396
- "eval_loss": 0.14160528779029846,
397
- "eval_rewards/accuracies": 0.765625,
398
- "eval_rewards/chosen": -1.3479530811309814,
399
- "eval_rewards/margins": 0.7806724309921265,
400
- "eval_rewards/rejected": -2.1286253929138184,
401
- "eval_runtime": 52.9895,
402
- "eval_samples_per_second": 37.743,
403
- "eval_steps_per_second": 0.604,
404
  "step": 200
405
  },
406
  {
407
- "debug/losses": 0.12412895262241364,
408
- "debug/policy_weights": 0.26891231536865234,
409
- "debug/raw_losses": 0.47236162424087524,
410
  "epoch": 0.44,
 
411
  "learning_rate": 3.4440382358952115e-07,
412
- "logits/chosen": -2.2376811504364014,
413
- "logits/rejected": -2.2135262489318848,
414
- "logps/chosen": -395.38421630859375,
415
- "logps/rejected": -459.84210205078125,
416
- "loss": 0.1418,
417
- "rewards/accuracies": 0.762499988079071,
418
- "rewards/chosen": -1.2235102653503418,
419
- "rewards/margins": 0.8555533289909363,
420
- "rewards/rejected": -2.0790634155273438,
421
  "step": 210
422
  },
423
  {
424
- "debug/losses": 0.13483984768390656,
425
- "debug/policy_weights": 0.2588108479976654,
426
- "debug/raw_losses": 0.5093666911125183,
427
  "epoch": 0.46,
 
428
  "learning_rate": 3.272542485937368e-07,
429
- "logits/chosen": -2.2282791137695312,
430
- "logits/rejected": -2.1731905937194824,
431
- "logps/chosen": -391.88104248046875,
432
- "logps/rejected": -457.2334899902344,
433
- "loss": 0.1418,
434
- "rewards/accuracies": 0.7562500238418579,
435
- "rewards/chosen": -1.1691535711288452,
436
- "rewards/margins": 0.7801142930984497,
437
- "rewards/rejected": -1.9492677450180054,
438
  "step": 220
439
  },
440
  {
441
- "debug/losses": 0.14488555490970612,
442
- "debug/policy_weights": 0.26247432827949524,
443
- "debug/raw_losses": 0.5395208597183228,
444
  "epoch": 0.48,
 
445
  "learning_rate": 3.096924887558854e-07,
446
- "logits/chosen": -2.2395882606506348,
447
- "logits/rejected": -2.1690821647644043,
448
- "logps/chosen": -417.9415588378906,
449
- "logps/rejected": -489.2323303222656,
450
- "loss": 0.1451,
451
- "rewards/accuracies": 0.699999988079071,
452
- "rewards/chosen": -1.433936595916748,
453
- "rewards/margins": 0.758230984210968,
454
- "rewards/rejected": -2.1921677589416504,
455
  "step": 230
456
  },
457
  {
458
- "debug/losses": 0.1316806524991989,
459
- "debug/policy_weights": 0.2545274794101715,
460
- "debug/raw_losses": 0.5126517415046692,
461
  "epoch": 0.5,
 
462
  "learning_rate": 2.9181224366319943e-07,
463
- "logits/chosen": -2.171607732772827,
464
- "logits/rejected": -2.1234748363494873,
465
- "logps/chosen": -426.31719970703125,
466
- "logps/rejected": -475.4518127441406,
467
- "loss": 0.1325,
468
- "rewards/accuracies": 0.75,
469
- "rewards/chosen": -1.4014716148376465,
470
- "rewards/margins": 0.8200809359550476,
471
- "rewards/rejected": -2.2215523719787598,
472
  "step": 240
473
  },
474
  {
475
- "debug/losses": 0.14006611704826355,
476
- "debug/policy_weights": 0.2591710388660431,
477
- "debug/raw_losses": 0.5314095616340637,
478
  "epoch": 0.52,
 
479
  "learning_rate": 2.7370891215954565e-07,
480
- "logits/chosen": -2.1962928771972656,
481
- "logits/rejected": -2.1328587532043457,
482
- "logps/chosen": -403.8672790527344,
483
- "logps/rejected": -458.2118225097656,
484
- "loss": 0.1446,
485
- "rewards/accuracies": 0.762499988079071,
486
- "rewards/chosen": -1.3229501247406006,
487
- "rewards/margins": 0.7149368524551392,
488
- "rewards/rejected": -2.03788685798645,
489
  "step": 250
490
  },
491
  {
492
- "debug/losses": 0.14360225200653076,
493
- "debug/policy_weights": 0.26045817136764526,
494
- "debug/raw_losses": 0.5265286564826965,
495
  "epoch": 0.54,
 
496
  "learning_rate": 2.55479083351317e-07,
497
- "logits/chosen": -2.1760964393615723,
498
- "logits/rejected": -2.150712013244629,
499
- "logps/chosen": -401.54754638671875,
500
- "logps/rejected": -479.643310546875,
501
- "loss": 0.1352,
502
- "rewards/accuracies": 0.7124999761581421,
503
- "rewards/chosen": -1.4240009784698486,
504
- "rewards/margins": 0.7899759411811829,
505
- "rewards/rejected": -2.2139768600463867,
506
  "step": 260
507
  },
508
  {
509
- "debug/losses": 0.11131677776575089,
510
- "debug/policy_weights": 0.24807500839233398,
511
- "debug/raw_losses": 0.47175589203834534,
512
  "epoch": 0.56,
 
513
  "learning_rate": 2.3722002126275822e-07,
514
- "logits/chosen": -2.186707019805908,
515
- "logits/rejected": -2.1590161323547363,
516
- "logps/chosen": -427.5020446777344,
517
- "logps/rejected": -495.8419494628906,
518
- "loss": 0.1267,
519
- "rewards/accuracies": 0.75,
520
- "rewards/chosen": -1.3904914855957031,
521
- "rewards/margins": 0.9172846078872681,
522
- "rewards/rejected": -2.3077759742736816,
523
  "step": 270
524
  },
525
  {
526
- "debug/losses": 0.11321704089641571,
527
- "debug/policy_weights": 0.24243195354938507,
528
- "debug/raw_losses": 0.4841908812522888,
529
  "epoch": 0.59,
 
530
  "learning_rate": 2.19029145890313e-07,
531
- "logits/chosen": -2.1000428199768066,
532
- "logits/rejected": -2.089947462081909,
533
- "logps/chosen": -469.45867919921875,
534
- "logps/rejected": -520.9786987304688,
535
- "loss": 0.1249,
536
- "rewards/accuracies": 0.8187500238418579,
537
- "rewards/chosen": -1.617157220840454,
538
- "rewards/margins": 0.9369556307792664,
539
- "rewards/rejected": -2.554112672805786,
540
  "step": 280
541
  },
542
  {
543
- "debug/losses": 0.11086218059062958,
544
- "debug/policy_weights": 0.22184400260448456,
545
- "debug/raw_losses": 0.5123878717422485,
546
  "epoch": 0.61,
 
547
  "learning_rate": 2.0100351342479216e-07,
548
- "logits/chosen": -2.1559338569641113,
549
- "logits/rejected": -2.1183762550354004,
550
- "logps/chosen": -447.17138671875,
551
- "logps/rejected": -499.66143798828125,
552
- "loss": 0.1214,
553
- "rewards/accuracies": 0.71875,
554
- "rewards/chosen": -1.6940768957138062,
555
- "rewards/margins": 0.7695088386535645,
556
- "rewards/rejected": -2.46358585357666,
557
  "step": 290
558
  },
559
  {
560
- "debug/losses": 0.13476888835430145,
561
- "debug/policy_weights": 0.23854057490825653,
562
- "debug/raw_losses": 0.5516811013221741,
563
  "epoch": 0.63,
 
564
  "learning_rate": 1.8323929841460178e-07,
565
- "logits/chosen": -2.199371099472046,
566
- "logits/rejected": -2.178723096847534,
567
- "logps/chosen": -422.80450439453125,
568
- "logps/rejected": -485.37890625,
569
- "loss": 0.1294,
570
- "rewards/accuracies": 0.7250000238418579,
571
- "rewards/chosen": -1.7119791507720947,
572
- "rewards/margins": 0.7184675335884094,
573
- "rewards/rejected": -2.4304463863372803,
574
  "step": 300
575
  },
576
  {
577
  "epoch": 0.63,
578
- "eval_debug/losses": 0.12840886414051056,
579
- "eval_debug/policy_weights": 0.25453710556030273,
580
- "eval_debug/raw_losses": 0.4935261309146881,
581
- "eval_logits/chosen": -2.1884968280792236,
582
- "eval_logits/rejected": -2.158949851989746,
583
- "eval_logps/chosen": -417.07135009765625,
584
- "eval_logps/rejected": -502.21124267578125,
585
- "eval_loss": 0.13086578249931335,
586
- "eval_rewards/accuracies": 0.73828125,
587
- "eval_rewards/chosen": -1.6003175973892212,
588
- "eval_rewards/margins": 0.8482623100280762,
589
- "eval_rewards/rejected": -2.448579788208008,
590
- "eval_runtime": 53.0489,
591
- "eval_samples_per_second": 37.701,
592
- "eval_steps_per_second": 0.603,
593
  "step": 300
594
  },
595
  {
596
- "debug/losses": 0.12721626460552216,
597
- "debug/policy_weights": 0.25444597005844116,
598
- "debug/raw_losses": 0.49146708846092224,
599
  "epoch": 0.65,
 
600
  "learning_rate": 1.6583128063291573e-07,
601
- "logits/chosen": -2.169189453125,
602
- "logits/rejected": -2.12001895904541,
603
- "logps/chosen": -451.0543518066406,
604
- "logps/rejected": -501.00830078125,
605
- "loss": 0.1339,
606
- "rewards/accuracies": 0.7749999761581421,
607
- "rewards/chosen": -1.6334816217422485,
608
- "rewards/margins": 0.8188700675964355,
609
- "rewards/rejected": -2.4523518085479736,
610
  "step": 310
611
  },
612
  {
613
- "debug/losses": 0.1101643294095993,
614
- "debug/policy_weights": 0.24654574692249298,
615
- "debug/raw_losses": 0.46620503067970276,
616
  "epoch": 0.67,
 
617
  "learning_rate": 1.488723393865766e-07,
618
- "logits/chosen": -2.1372463703155518,
619
- "logits/rejected": -2.0980920791625977,
620
- "logps/chosen": -443.265625,
621
- "logps/rejected": -513.0931396484375,
622
- "loss": 0.1228,
623
- "rewards/accuracies": 0.78125,
624
- "rewards/chosen": -1.5740420818328857,
625
- "rewards/margins": 0.9036803245544434,
626
- "rewards/rejected": -2.477722644805908,
627
  "step": 320
628
  },
629
  {
630
- "debug/losses": 0.12689927220344543,
631
- "debug/policy_weights": 0.25536665320396423,
632
- "debug/raw_losses": 0.49713826179504395,
633
  "epoch": 0.69,
 
634
  "learning_rate": 1.3245295796480788e-07,
635
- "logits/chosen": -2.1500933170318604,
636
- "logits/rejected": -2.1095988750457764,
637
- "logps/chosen": -424.4521484375,
638
- "logps/rejected": -503.61163330078125,
639
- "loss": 0.1324,
640
- "rewards/accuracies": 0.793749988079071,
641
- "rewards/chosen": -1.5042273998260498,
642
- "rewards/margins": 0.8895484805107117,
643
- "rewards/rejected": -2.3937759399414062,
644
  "step": 330
645
  },
646
  {
647
- "debug/losses": 0.11528172343969345,
648
- "debug/policy_weights": 0.24339346587657928,
649
- "debug/raw_losses": 0.48817843198776245,
650
  "epoch": 0.71,
 
651
  "learning_rate": 1.1666074087171627e-07,
652
- "logits/chosen": -2.1724162101745605,
653
- "logits/rejected": -2.1479129791259766,
654
- "logps/chosen": -403.76806640625,
655
- "logps/rejected": -496.89703369140625,
656
- "loss": 0.1239,
657
- "rewards/accuracies": 0.75,
658
- "rewards/chosen": -1.4917190074920654,
659
- "rewards/margins": 0.9479940533638,
660
- "rewards/rejected": -2.4397130012512207,
661
  "step": 340
662
  },
663
  {
664
- "debug/losses": 0.11798451095819473,
665
- "debug/policy_weights": 0.23595662415027618,
666
- "debug/raw_losses": 0.5302962064743042,
667
  "epoch": 0.73,
 
668
  "learning_rate": 1.0157994641835734e-07,
669
- "logits/chosen": -2.148632526397705,
670
- "logits/rejected": -2.1165192127227783,
671
- "logps/chosen": -442.53057861328125,
672
- "logps/rejected": -513.9363403320312,
673
- "loss": 0.1374,
674
  "rewards/accuracies": 0.7124999761581421,
675
- "rewards/chosen": -1.6607694625854492,
676
- "rewards/margins": 0.8479129672050476,
677
- "rewards/rejected": -2.5086822509765625,
678
  "step": 350
679
  },
680
  {
681
- "debug/losses": 0.10292885452508926,
682
- "debug/policy_weights": 0.22015142440795898,
683
- "debug/raw_losses": 0.4573485255241394,
684
  "epoch": 0.75,
 
685
  "learning_rate": 8.729103716819111e-08,
686
- "logits/chosen": -2.1121697425842285,
687
- "logits/rejected": -2.0813915729522705,
688
- "logps/chosen": -401.754150390625,
689
- "logps/rejected": -494.1514587402344,
690
- "loss": 0.1311,
691
- "rewards/accuracies": 0.762499988079071,
692
- "rewards/chosen": -1.4336955547332764,
693
- "rewards/margins": 1.074181079864502,
694
- "rewards/rejected": -2.5078768730163574,
695
  "step": 360
696
  },
697
  {
698
- "debug/losses": 0.13746492564678192,
699
- "debug/policy_weights": 0.25476521253585815,
700
- "debug/raw_losses": 0.5358820557594299,
701
  "epoch": 0.77,
 
702
  "learning_rate": 7.387025063449081e-08,
703
- "logits/chosen": -2.175912380218506,
704
- "logits/rejected": -2.1576006412506104,
705
- "logps/chosen": -416.6788024902344,
706
- "logps/rejected": -477.13153076171875,
707
- "loss": 0.1253,
708
- "rewards/accuracies": 0.7124999761581421,
709
- "rewards/chosen": -1.3861879110336304,
710
- "rewards/margins": 0.7486017346382141,
711
- "rewards/rejected": -2.1347897052764893,
712
  "step": 370
713
  },
714
  {
715
- "debug/losses": 0.14446747303009033,
716
- "debug/policy_weights": 0.2367408275604248,
717
- "debug/raw_losses": 0.5707719326019287,
718
  "epoch": 0.79,
 
719
  "learning_rate": 6.138919252022435e-08,
720
- "logits/chosen": -2.084808111190796,
721
- "logits/rejected": -2.070844888687134,
722
- "logps/chosen": -394.0305480957031,
723
- "logps/rejected": -481.216552734375,
724
- "loss": 0.1285,
725
- "rewards/accuracies": 0.699999988079071,
726
- "rewards/chosen": -1.7118937969207764,
727
- "rewards/margins": 0.6706421375274658,
728
- "rewards/rejected": -2.3825364112854004,
729
  "step": 380
730
  },
731
  {
732
- "debug/losses": 0.12546047568321228,
733
- "debug/policy_weights": 0.24170584976673126,
734
- "debug/raw_losses": 0.4937317967414856,
735
  "epoch": 0.82,
 
736
  "learning_rate": 4.991445467064689e-08,
737
- "logits/chosen": -2.1555511951446533,
738
- "logits/rejected": -2.1209189891815186,
739
- "logps/chosen": -406.2367248535156,
740
- "logps/rejected": -479.50567626953125,
741
- "loss": 0.1302,
742
- "rewards/accuracies": 0.7875000238418579,
743
- "rewards/chosen": -1.5467679500579834,
744
- "rewards/margins": 0.8932439684867859,
745
- "rewards/rejected": -2.440011501312256,
746
  "step": 390
747
  },
748
  {
749
- "debug/losses": 0.13418573141098022,
750
- "debug/policy_weights": 0.24948246777057648,
751
- "debug/raw_losses": 0.5159034132957458,
752
  "epoch": 0.84,
 
753
  "learning_rate": 3.9507259776993954e-08,
754
- "logits/chosen": -2.1299073696136475,
755
- "logits/rejected": -2.0702521800994873,
756
- "logps/chosen": -448.74298095703125,
757
- "logps/rejected": -487.4234313964844,
758
- "loss": 0.1329,
759
- "rewards/accuracies": 0.7562500238418579,
760
- "rewards/chosen": -1.6303924322128296,
761
- "rewards/margins": 0.7685388326644897,
762
- "rewards/rejected": -2.3989315032958984,
763
  "step": 400
764
  },
765
  {
766
  "epoch": 0.84,
767
- "eval_debug/losses": 0.12970629334449768,
768
- "eval_debug/policy_weights": 0.25886857509613037,
769
- "eval_debug/raw_losses": 0.48170554637908936,
770
- "eval_logits/chosen": -2.18296217918396,
771
- "eval_logits/rejected": -2.1507883071899414,
772
- "eval_logps/chosen": -409.0387878417969,
773
- "eval_logps/rejected": -500.7933654785156,
774
- "eval_loss": 0.1314304769039154,
775
- "eval_rewards/accuracies": 0.75,
776
- "eval_rewards/chosen": -1.519991159439087,
777
- "eval_rewards/margins": 0.9144098162651062,
778
- "eval_rewards/rejected": -2.434401035308838,
779
- "eval_runtime": 53.0316,
780
- "eval_samples_per_second": 37.713,
781
- "eval_steps_per_second": 0.603,
782
  "step": 400
783
  },
784
  {
785
- "debug/losses": 0.13898980617523193,
786
- "debug/policy_weights": 0.24470162391662598,
787
- "debug/raw_losses": 0.5698193907737732,
788
  "epoch": 0.86,
 
789
  "learning_rate": 3.022313472693447e-08,
790
- "logits/chosen": -2.1896605491638184,
791
- "logits/rejected": -2.1111254692077637,
792
- "logps/chosen": -431.5157165527344,
793
- "logps/rejected": -519.735595703125,
794
- "loss": 0.1335,
795
- "rewards/accuracies": 0.71875,
796
- "rewards/chosen": -1.67926025390625,
797
- "rewards/margins": 0.7034608125686646,
798
- "rewards/rejected": -2.382721185684204,
799
  "step": 410
800
  },
801
  {
802
- "debug/losses": 0.13722026348114014,
803
- "debug/policy_weights": 0.25542253255844116,
804
- "debug/raw_losses": 0.553850531578064,
805
  "epoch": 0.88,
 
806
  "learning_rate": 2.2111614344599684e-08,
807
- "logits/chosen": -2.1816725730895996,
808
- "logits/rejected": -2.1436891555786133,
809
- "logps/chosen": -450.2588806152344,
810
- "logps/rejected": -532.4340209960938,
811
- "loss": 0.132,
812
- "rewards/accuracies": 0.668749988079071,
813
- "rewards/chosen": -1.6344585418701172,
814
- "rewards/margins": 0.765791654586792,
815
- "rewards/rejected": -2.40024995803833,
816
  "step": 420
817
  },
818
  {
819
- "debug/losses": 0.11242847144603729,
820
- "debug/policy_weights": 0.23566405475139618,
821
- "debug/raw_losses": 0.474797785282135,
822
  "epoch": 0.9,
 
823
  "learning_rate": 1.521597710086439e-08,
824
- "logits/chosen": -2.1367993354797363,
825
- "logits/rejected": -2.101963520050049,
826
- "logps/chosen": -447.2886657714844,
827
- "logps/rejected": -528.4554443359375,
828
- "loss": 0.135,
829
- "rewards/accuracies": 0.768750011920929,
830
- "rewards/chosen": -1.57035493850708,
831
- "rewards/margins": 0.9703021049499512,
832
- "rewards/rejected": -2.5406570434570312,
833
  "step": 430
834
  },
835
  {
836
- "debug/losses": 0.14394986629486084,
837
- "debug/policy_weights": 0.2594669461250305,
838
- "debug/raw_losses": 0.551374614238739,
839
  "epoch": 0.92,
 
840
  "learning_rate": 9.57301420397924e-09,
841
- "logits/chosen": -2.1795363426208496,
842
- "logits/rejected": -2.1459131240844727,
843
- "logps/chosen": -479.01873779296875,
844
- "logps/rejected": -490.289794921875,
845
- "loss": 0.128,
846
- "rewards/accuracies": 0.7250000238418579,
847
- "rewards/chosen": -1.6396774053573608,
848
- "rewards/margins": 0.6072880625724792,
849
- "rewards/rejected": -2.2469656467437744,
850
  "step": 440
851
  },
852
  {
853
- "debug/losses": 0.12136085331439972,
854
- "debug/policy_weights": 0.25487110018730164,
855
- "debug/raw_losses": 0.5141120553016663,
856
  "epoch": 0.94,
 
857
  "learning_rate": 5.212833302556258e-09,
858
- "logits/chosen": -2.131383180618286,
859
- "logits/rejected": -2.101245880126953,
860
- "logps/chosen": -428.8694763183594,
861
- "logps/rejected": -482.65380859375,
862
- "loss": 0.1317,
863
- "rewards/accuracies": 0.762499988079071,
864
- "rewards/chosen": -1.4801188707351685,
865
- "rewards/margins": 0.8445402383804321,
866
- "rewards/rejected": -2.3246593475341797,
867
  "step": 450
868
  },
869
  {
870
- "debug/losses": 0.13340520858764648,
871
- "debug/policy_weights": 0.26456892490386963,
872
- "debug/raw_losses": 0.5068139433860779,
873
  "epoch": 0.96,
 
874
  "learning_rate": 2.158697848236607e-09,
875
- "logits/chosen": -2.1625778675079346,
876
- "logits/rejected": -2.118445873260498,
877
- "logps/chosen": -446.69012451171875,
878
- "logps/rejected": -497.6316833496094,
879
- "loss": 0.1415,
880
- "rewards/accuracies": 0.7749999761581421,
881
- "rewards/chosen": -1.46303391456604,
882
- "rewards/margins": 0.8069203495979309,
883
- "rewards/rejected": -2.2699544429779053,
884
  "step": 460
885
  },
886
  {
887
- "debug/losses": 0.13318563997745514,
888
- "debug/policy_weights": 0.2579984962940216,
889
- "debug/raw_losses": 0.5174868702888489,
890
  "epoch": 0.98,
 
891
  "learning_rate": 4.269029751107489e-10,
892
- "logits/chosen": -2.1812849044799805,
893
- "logits/rejected": -2.100320816040039,
894
- "logps/chosen": -435.08428955078125,
895
- "logps/rejected": -452.8404235839844,
896
- "loss": 0.1302,
897
- "rewards/accuracies": 0.699999988079071,
898
- "rewards/chosen": -1.3687784671783447,
899
- "rewards/margins": 0.8292155265808105,
900
- "rewards/rejected": -2.1979942321777344,
901
  "step": 470
902
  },
903
  {
904
  "epoch": 1.0,
905
  "step": 478,
906
  "total_flos": 0.0,
907
- "train_loss": 0.17621936496331603,
908
- "train_runtime": 4510.4366,
909
- "train_samples_per_second": 13.554,
910
- "train_steps_per_second": 0.106
911
  }
912
  ],
913
  "logging_steps": 10,
914
  "max_steps": 478,
 
915
  "num_train_epochs": 1,
916
  "save_steps": 100,
917
  "total_flos": 0.0,
 
918
  "trial_name": null,
919
  "trial_params": null
920
  }
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "debug/losses": 0.36003828048706055,
13
+ "debug/policy_weights": 0.5194254517555237,
14
  "debug/raw_losses": 0.6931471824645996,
15
  "epoch": 0.0,
16
+ "grad_norm": 5.3582401111914875,
17
  "learning_rate": 1.0416666666666666e-08,
18
+ "logits/chosen": -2.888590097427368,
19
+ "logits/rejected": -2.871957540512085,
20
+ "logps/chosen": -261.88616943359375,
21
+ "logps/rejected": -355.31793212890625,
22
+ "loss": 0.3648,
23
  "rewards/accuracies": 0.0,
24
  "rewards/chosen": 0.0,
25
  "rewards/margins": 0.0,
 
27
  "step": 1
28
  },
29
  {
30
+ "debug/losses": 0.3548344671726227,
31
+ "debug/policy_weights": 0.5118452906608582,
32
+ "debug/raw_losses": 0.6932389736175537,
33
  "epoch": 0.02,
34
+ "grad_norm": 5.014402110225361,
35
  "learning_rate": 1.0416666666666667e-07,
36
+ "logits/chosen": -2.7435038089752197,
37
+ "logits/rejected": -2.7258925437927246,
38
+ "logps/chosen": -256.5353088378906,
39
+ "logps/rejected": -244.49546813964844,
40
+ "loss": 0.3619,
41
  "rewards/accuracies": 0.4236111044883728,
42
+ "rewards/chosen": -8.211319800466299e-05,
43
+ "rewards/margins": -0.00017834355821833014,
44
+ "rewards/rejected": 9.623030200600624e-05,
45
  "step": 10
46
  },
47
  {
48
+ "debug/losses": 0.3572663962841034,
49
+ "debug/policy_weights": 0.5156484842300415,
50
+ "debug/raw_losses": 0.6928108930587769,
51
  "epoch": 0.04,
52
+ "grad_norm": 4.715098712970233,
53
  "learning_rate": 2.0833333333333333e-07,
54
+ "logits/chosen": -2.8007731437683105,
55
+ "logits/rejected": -2.7729618549346924,
56
+ "logps/chosen": -271.37860107421875,
57
+ "logps/rejected": -288.34271240234375,
58
+ "loss": 0.3584,
59
+ "rewards/accuracies": 0.543749988079071,
60
+ "rewards/chosen": 0.0006374852382577956,
61
+ "rewards/margins": 0.0006833201623521745,
62
+ "rewards/rejected": -4.583501868182793e-05,
63
  "step": 20
64
  },
65
  {
66
+ "debug/losses": 0.35951218008995056,
67
+ "debug/policy_weights": 0.521370530128479,
68
+ "debug/raw_losses": 0.6895479559898376,
69
  "epoch": 0.06,
70
+ "grad_norm": 4.7982052271779825,
71
  "learning_rate": 3.1249999999999997e-07,
72
+ "logits/chosen": -2.810387134552002,
73
+ "logits/rejected": -2.773030996322632,
74
+ "logps/chosen": -291.4071350097656,
75
+ "logps/rejected": -258.0267639160156,
76
+ "loss": 0.3579,
77
+ "rewards/accuracies": 0.6312500238418579,
78
+ "rewards/chosen": 0.004485988523811102,
79
+ "rewards/margins": 0.00730940094217658,
80
+ "rewards/rejected": -0.002823411952704191,
81
  "step": 30
82
  },
83
  {
84
+ "debug/losses": 0.35061347484588623,
85
+ "debug/policy_weights": 0.5189496278762817,
86
+ "debug/raw_losses": 0.6759483218193054,
87
  "epoch": 0.08,
88
+ "grad_norm": 4.841334528975048,
89
  "learning_rate": 4.1666666666666667e-07,
90
+ "logits/chosen": -2.7438368797302246,
91
+ "logits/rejected": -2.7376155853271484,
92
+ "logps/chosen": -252.50436401367188,
93
+ "logps/rejected": -240.8105010986328,
94
+ "loss": 0.3573,
95
+ "rewards/accuracies": 0.762499988079071,
96
+ "rewards/chosen": 0.015405787155032158,
97
+ "rewards/margins": 0.03552238270640373,
98
+ "rewards/rejected": -0.020116599276661873,
99
  "step": 40
100
  },
101
  {
102
+ "debug/losses": 0.34173911809921265,
103
+ "debug/policy_weights": 0.5125550627708435,
104
+ "debug/raw_losses": 0.666517436504364,
105
  "epoch": 0.1,
106
+ "grad_norm": 5.135483442753442,
107
  "learning_rate": 4.999733114418725e-07,
108
+ "logits/chosen": -2.6830456256866455,
109
+ "logits/rejected": -2.6514101028442383,
110
+ "logps/chosen": -269.3827819824219,
111
+ "logps/rejected": -287.2772216796875,
112
+ "loss": 0.3437,
113
+ "rewards/accuracies": 0.6812499761581421,
114
+ "rewards/chosen": 0.0019381232559680939,
115
+ "rewards/margins": 0.05897800251841545,
116
+ "rewards/rejected": -0.05703987926244736,
117
  "step": 50
118
  },
119
  {
120
+ "debug/losses": 0.3377828598022461,
121
+ "debug/policy_weights": 0.5224286317825317,
122
+ "debug/raw_losses": 0.6448591947555542,
123
  "epoch": 0.13,
124
+ "grad_norm": 5.5606740216520905,
125
  "learning_rate": 4.990398100856366e-07,
126
+ "logits/chosen": -2.745028018951416,
127
+ "logits/rejected": -2.713804244995117,
128
+ "logps/chosen": -266.05279541015625,
129
+ "logps/rejected": -305.28271484375,
130
+ "loss": 0.3305,
131
+ "rewards/accuracies": 0.6937500238418579,
132
+ "rewards/chosen": -0.02193317376077175,
133
+ "rewards/margins": 0.1177050918340683,
134
+ "rewards/rejected": -0.1396382749080658,
135
  "step": 60
136
  },
137
  {
138
+ "debug/losses": 0.28029459714889526,
139
+ "debug/policy_weights": 0.47101029753685,
140
+ "debug/raw_losses": 0.6030661463737488,
141
  "epoch": 0.15,
142
+ "grad_norm": 6.284287160526739,
143
  "learning_rate": 4.967775735898179e-07,
144
+ "logits/chosen": -2.7072081565856934,
145
+ "logits/rejected": -2.7113521099090576,
146
+ "logps/chosen": -272.530029296875,
147
+ "logps/rejected": -289.87713623046875,
148
+ "loss": 0.2826,
149
+ "rewards/accuracies": 0.7875000238418579,
150
+ "rewards/chosen": -0.1557193100452423,
151
+ "rewards/margins": 0.24685576558113098,
152
+ "rewards/rejected": -0.4025750756263733,
153
  "step": 70
154
  },
155
  {
156
+ "debug/losses": 0.26233458518981934,
157
+ "debug/policy_weights": 0.42553257942199707,
158
+ "debug/raw_losses": 0.6218484044075012,
159
  "epoch": 0.17,
160
+ "grad_norm": 6.8893773770621305,
161
  "learning_rate": 4.931986719649298e-07,
162
+ "logits/chosen": -2.800093412399292,
163
+ "logits/rejected": -2.7768216133117676,
164
+ "logps/chosen": -360.3621520996094,
165
+ "logps/rejected": -329.7616882324219,
166
+ "loss": 0.2516,
167
  "rewards/accuracies": 0.6875,
168
+ "rewards/chosen": -0.45188745856285095,
169
+ "rewards/margins": 0.26955321431159973,
170
+ "rewards/rejected": -0.7214406728744507,
171
  "step": 80
172
  },
173
  {
174
+ "debug/losses": 0.18998686969280243,
175
+ "debug/policy_weights": 0.34327536821365356,
176
+ "debug/raw_losses": 0.5527450442314148,
177
  "epoch": 0.19,
178
+ "grad_norm": 6.5870040213670435,
179
  "learning_rate": 4.883222001996351e-07,
180
+ "logits/chosen": -2.724050998687744,
181
+ "logits/rejected": -2.690390110015869,
182
+ "logps/chosen": -303.0259704589844,
183
+ "logps/rejected": -344.49505615234375,
184
+ "loss": 0.2198,
185
+ "rewards/accuracies": 0.768750011920929,
186
+ "rewards/chosen": -0.5087189078330994,
187
+ "rewards/margins": 0.4743797779083252,
188
+ "rewards/rejected": -0.9830986857414246,
189
  "step": 90
190
  },
191
  {
192
+ "debug/losses": 0.16577747464179993,
193
+ "debug/policy_weights": 0.29914408922195435,
194
+ "debug/raw_losses": 0.5552432537078857,
195
  "epoch": 0.21,
196
+ "grad_norm": 7.249323730073561,
197
  "learning_rate": 4.821741763807186e-07,
198
+ "logits/chosen": -2.6653432846069336,
199
+ "logits/rejected": -2.658141613006592,
200
+ "logps/chosen": -325.9325866699219,
201
+ "logps/rejected": -375.12127685546875,
202
+ "loss": 0.1781,
203
+ "rewards/accuracies": 0.706250011920929,
204
+ "rewards/chosen": -0.7297804951667786,
205
+ "rewards/margins": 0.511761486530304,
206
+ "rewards/rejected": -1.2415419816970825,
207
  "step": 100
208
  },
209
  {
210
  "epoch": 0.21,
211
+ "eval_debug/losses": 0.19525206089019775,
212
+ "eval_debug/policy_weights": 0.3511899709701538,
213
+ "eval_debug/raw_losses": 0.5590351819992065,
214
+ "eval_logits/chosen": -2.70804500579834,
215
+ "eval_logits/rejected": -2.6910133361816406,
216
+ "eval_logps/chosen": -319.9805603027344,
217
+ "eval_logps/rejected": -373.18670654296875,
218
+ "eval_loss": 0.20066915452480316,
219
+ "eval_rewards/accuracies": 0.734375,
220
+ "eval_rewards/chosen": -0.6478448510169983,
221
+ "eval_rewards/margins": 0.5214487910270691,
222
+ "eval_rewards/rejected": -1.1692936420440674,
223
+ "eval_runtime": 43.2006,
224
+ "eval_samples_per_second": 46.296,
225
+ "eval_steps_per_second": 0.741,
226
  "step": 100
227
  },
228
  {
229
+ "debug/losses": 0.1987285315990448,
230
+ "debug/policy_weights": 0.34678488969802856,
231
+ "debug/raw_losses": 0.5721240043640137,
232
  "epoch": 0.23,
233
+ "grad_norm": 6.9123559213202554,
234
  "learning_rate": 4.747874028753375e-07,
235
+ "logits/chosen": -2.7045469284057617,
236
+ "logits/rejected": -2.672999858856201,
237
+ "logps/chosen": -369.9187927246094,
238
+ "logps/rejected": -373.6562194824219,
239
+ "loss": 0.1986,
240
+ "rewards/accuracies": 0.706250011920929,
241
+ "rewards/chosen": -0.7313551902770996,
242
+ "rewards/margins": 0.47859472036361694,
243
+ "rewards/rejected": -1.2099498510360718,
244
  "step": 110
245
  },
246
  {
247
+ "debug/losses": 0.18334314227104187,
248
+ "debug/policy_weights": 0.3257436156272888,
249
+ "debug/raw_losses": 0.5537685751914978,
250
  "epoch": 0.25,
251
+ "grad_norm": 7.316158832496888,
252
  "learning_rate": 4.662012913161997e-07,
253
+ "logits/chosen": -2.6026859283447266,
254
+ "logits/rejected": -2.600971221923828,
255
+ "logps/chosen": -346.50555419921875,
256
+ "logps/rejected": -379.53887939453125,
257
+ "loss": 0.1846,
258
+ "rewards/accuracies": 0.731249988079071,
259
+ "rewards/chosen": -0.7285534739494324,
260
+ "rewards/margins": 0.5237767100334167,
261
+ "rewards/rejected": -1.2523301839828491,
262
  "step": 120
263
  },
264
  {
265
+ "debug/losses": 0.17840833961963654,
266
+ "debug/policy_weights": 0.3271704614162445,
267
+ "debug/raw_losses": 0.5529276132583618,
268
  "epoch": 0.27,
269
+ "grad_norm": 6.11630399118371,
270
  "learning_rate": 4.5646165232345103e-07,
271
+ "logits/chosen": -2.5919995307922363,
272
+ "logits/rejected": -2.598027467727661,
273
+ "logps/chosen": -346.44610595703125,
274
+ "logps/rejected": -391.03778076171875,
275
+ "loss": 0.1758,
276
+ "rewards/accuracies": 0.731249988079071,
277
+ "rewards/chosen": -0.7799400687217712,
278
+ "rewards/margins": 0.5067718029022217,
279
+ "rewards/rejected": -1.2867119312286377,
280
  "step": 130
281
  },
282
  {
283
+ "debug/losses": 0.18315494060516357,
284
+ "debug/policy_weights": 0.3452090620994568,
285
+ "debug/raw_losses": 0.56131511926651,
286
  "epoch": 0.29,
287
+ "grad_norm": 6.845717287191781,
288
  "learning_rate": 4.456204510851956e-07,
289
+ "logits/chosen": -2.572934627532959,
290
+ "logits/rejected": -2.5713508129119873,
291
+ "logps/chosen": -390.03253173828125,
292
+ "logps/rejected": -440.16461181640625,
293
+ "loss": 0.1668,
294
  "rewards/accuracies": 0.7250000238418579,
295
+ "rewards/chosen": -0.8578819036483765,
296
+ "rewards/margins": 0.5912094712257385,
297
+ "rewards/rejected": -1.4490914344787598,
298
  "step": 140
299
  },
300
  {
301
+ "debug/losses": 0.19362398982048035,
302
+ "debug/policy_weights": 0.33687275648117065,
303
+ "debug/raw_losses": 0.559166669845581,
304
  "epoch": 0.31,
305
+ "grad_norm": 7.029648911332738,
306
  "learning_rate": 4.337355301007335e-07,
307
+ "logits/chosen": -2.512932300567627,
308
+ "logits/rejected": -2.4797143936157227,
309
+ "logps/chosen": -357.82464599609375,
310
+ "logps/rejected": -402.2618713378906,
311
+ "loss": 0.1771,
312
+ "rewards/accuracies": 0.7124999761581421,
313
+ "rewards/chosen": -0.8432974815368652,
314
+ "rewards/margins": 0.48099932074546814,
315
+ "rewards/rejected": -1.3242968320846558,
316
  "step": 150
317
  },
318
  {
319
+ "debug/losses": 0.14652669429779053,
320
+ "debug/policy_weights": 0.28093475103378296,
321
+ "debug/raw_losses": 0.5318498611450195,
322
  "epoch": 0.33,
323
+ "grad_norm": 6.358610241024355,
324
  "learning_rate": 4.2087030056579986e-07,
325
+ "logits/chosen": -2.464831590652466,
326
+ "logits/rejected": -2.4293084144592285,
327
+ "logps/chosen": -370.14971923828125,
328
+ "logps/rejected": -408.4079284667969,
329
+ "loss": 0.1583,
330
+ "rewards/accuracies": 0.7124999761581421,
331
+ "rewards/chosen": -1.0606319904327393,
332
+ "rewards/margins": 0.596233606338501,
333
+ "rewards/rejected": -1.6568657159805298,
334
  "step": 160
335
  },
336
  {
337
+ "debug/losses": 0.1372167319059372,
338
+ "debug/policy_weights": 0.2504947781562805,
339
+ "debug/raw_losses": 0.6000548005104065,
340
  "epoch": 0.36,
341
+ "grad_norm": 7.75801911972225,
342
  "learning_rate": 4.070934040463998e-07,
343
+ "logits/chosen": -2.361786365509033,
344
+ "logits/rejected": -2.332592010498047,
345
+ "logps/chosen": -389.86767578125,
346
+ "logps/rejected": -423.97698974609375,
347
+ "loss": 0.1417,
348
+ "rewards/accuracies": 0.675000011920929,
349
+ "rewards/chosen": -1.4737131595611572,
350
+ "rewards/margins": 0.511743426322937,
351
+ "rewards/rejected": -1.9854564666748047,
352
  "step": 170
353
  },
354
  {
355
+ "debug/losses": 0.11748126894235611,
356
+ "debug/policy_weights": 0.2296670377254486,
357
+ "debug/raw_losses": 0.5410301089286804,
358
  "epoch": 0.38,
359
+ "grad_norm": 6.142786182484198,
360
  "learning_rate": 3.9247834624635404e-07,
361
+ "logits/chosen": -2.3254575729370117,
362
+ "logits/rejected": -2.3115153312683105,
363
+ "logps/chosen": -391.5261535644531,
364
+ "logps/rejected": -429.21875,
365
+ "loss": 0.1327,
366
+ "rewards/accuracies": 0.706250011920929,
367
+ "rewards/chosen": -1.446401834487915,
368
+ "rewards/margins": 0.6310542821884155,
369
+ "rewards/rejected": -2.07745623588562,
370
  "step": 180
371
  },
372
  {
373
+ "debug/losses": 0.16298121213912964,
374
+ "debug/policy_weights": 0.2831325829029083,
375
+ "debug/raw_losses": 0.5709558725357056,
376
  "epoch": 0.4,
377
+ "grad_norm": 6.941055961208667,
378
  "learning_rate": 3.7710310482256523e-07,
379
+ "logits/chosen": -2.378763198852539,
380
+ "logits/rejected": -2.3435521125793457,
381
+ "logps/chosen": -381.296630859375,
382
+ "logps/rejected": -435.13494873046875,
383
+ "loss": 0.1447,
384
+ "rewards/accuracies": 0.675000011920929,
385
+ "rewards/chosen": -1.178696870803833,
386
+ "rewards/margins": 0.6175613403320312,
387
+ "rewards/rejected": -1.7962583303451538,
388
  "step": 190
389
  },
390
  {
391
+ "debug/losses": 0.1531984657049179,
392
+ "debug/policy_weights": 0.27346447110176086,
393
+ "debug/raw_losses": 0.5578541159629822,
394
  "epoch": 0.42,
395
+ "grad_norm": 9.54639374222291,
396
  "learning_rate": 3.610497133404795e-07,
397
+ "logits/chosen": -2.3584351539611816,
398
+ "logits/rejected": -2.362753391265869,
399
+ "logps/chosen": -359.6307067871094,
400
+ "logps/rejected": -422.2249450683594,
401
+ "loss": 0.1616,
402
  "rewards/accuracies": 0.731249988079071,
403
+ "rewards/chosen": -1.0809690952301025,
404
+ "rewards/margins": 0.6879221796989441,
405
+ "rewards/rejected": -1.7688913345336914,
406
  "step": 200
407
  },
408
  {
409
  "epoch": 0.42,
410
+ "eval_debug/losses": 0.16714033484458923,
411
+ "eval_debug/policy_weights": 0.3173619508743286,
412
+ "eval_debug/raw_losses": 0.5079437494277954,
413
+ "eval_logits/chosen": -2.44987154006958,
414
+ "eval_logits/rejected": -2.4276504516601562,
415
+ "eval_logps/chosen": -343.49139404296875,
416
+ "eval_logps/rejected": -416.28436279296875,
417
+ "eval_loss": 0.16694626212120056,
418
+ "eval_rewards/accuracies": 0.7109375,
419
+ "eval_rewards/chosen": -0.8829529881477356,
420
+ "eval_rewards/margins": 0.7173169851303101,
421
+ "eval_rewards/rejected": -1.6002700328826904,
422
+ "eval_runtime": 43.2126,
423
+ "eval_samples_per_second": 46.283,
424
+ "eval_steps_per_second": 0.741,
425
  "step": 200
426
  },
427
  {
428
+ "debug/losses": 0.14734311401844025,
429
+ "debug/policy_weights": 0.2870820164680481,
430
+ "debug/raw_losses": 0.5041711330413818,
431
  "epoch": 0.44,
432
+ "grad_norm": 7.104709810051969,
433
  "learning_rate": 3.4440382358952115e-07,
434
+ "logits/chosen": -2.3574612140655518,
435
+ "logits/rejected": -2.326927900314331,
436
+ "logps/chosen": -381.2717590332031,
437
+ "logps/rejected": -413.890869140625,
438
+ "loss": 0.1636,
439
+ "rewards/accuracies": 0.7437499761581421,
440
+ "rewards/chosen": -0.9982665777206421,
441
+ "rewards/margins": 0.6929588317871094,
442
+ "rewards/rejected": -1.6912254095077515,
443
  "step": 210
444
  },
445
  {
446
+ "debug/losses": 0.13881739974021912,
447
+ "debug/policy_weights": 0.27268368005752563,
448
+ "debug/raw_losses": 0.5210752487182617,
449
  "epoch": 0.46,
450
+ "grad_norm": 8.340698654196919,
451
  "learning_rate": 3.272542485937368e-07,
452
+ "logits/chosen": -2.3073763847351074,
453
+ "logits/rejected": -2.288245677947998,
454
+ "logps/chosen": -370.5925598144531,
455
+ "logps/rejected": -423.53021240234375,
456
+ "loss": 0.1549,
457
+ "rewards/accuracies": 0.75,
458
+ "rewards/chosen": -1.1713535785675049,
459
+ "rewards/margins": 0.7361623644828796,
460
+ "rewards/rejected": -1.9075158834457397,
461
  "step": 220
462
  },
463
  {
464
+ "debug/losses": 0.15816743671894073,
465
+ "debug/policy_weights": 0.2902621626853943,
466
+ "debug/raw_losses": 0.509421706199646,
467
  "epoch": 0.48,
468
+ "grad_norm": 7.2612463171873225,
469
  "learning_rate": 3.096924887558854e-07,
470
+ "logits/chosen": -2.339087724685669,
471
+ "logits/rejected": -2.31463360786438,
472
+ "logps/chosen": -342.2999572753906,
473
+ "logps/rejected": -425.86181640625,
474
+ "loss": 0.169,
475
+ "rewards/accuracies": 0.731249988079071,
476
+ "rewards/chosen": -1.0157617330551147,
477
+ "rewards/margins": 0.8210276365280151,
478
+ "rewards/rejected": -1.8367893695831299,
479
  "step": 230
480
  },
481
  {
482
+ "debug/losses": 0.15576457977294922,
483
+ "debug/policy_weights": 0.29614073038101196,
484
+ "debug/raw_losses": 0.4828997552394867,
485
  "epoch": 0.5,
486
+ "grad_norm": 7.345094511114747,
487
  "learning_rate": 2.9181224366319943e-07,
488
+ "logits/chosen": -2.3611202239990234,
489
+ "logits/rejected": -2.317119836807251,
490
+ "logps/chosen": -368.0779113769531,
491
+ "logps/rejected": -434.31549072265625,
492
+ "loss": 0.1514,
493
+ "rewards/accuracies": 0.768750011920929,
494
+ "rewards/chosen": -1.1054710149765015,
495
+ "rewards/margins": 0.8346185684204102,
496
+ "rewards/rejected": -1.9400895833969116,
497
  "step": 240
498
  },
499
  {
500
+ "debug/losses": 0.12131881713867188,
501
+ "debug/policy_weights": 0.2461313009262085,
502
+ "debug/raw_losses": 0.46923789381980896,
503
  "epoch": 0.52,
504
+ "grad_norm": 6.902697741869489,
505
  "learning_rate": 2.7370891215954565e-07,
506
+ "logits/chosen": -2.2654645442962646,
507
+ "logits/rejected": -2.2106964588165283,
508
+ "logps/chosen": -434.3672790527344,
509
+ "logps/rejected": -491.00750732421875,
510
+ "loss": 0.1285,
511
+ "rewards/accuracies": 0.78125,
512
+ "rewards/chosen": -1.4400012493133545,
513
+ "rewards/margins": 0.9503369331359863,
514
+ "rewards/rejected": -2.390338182449341,
515
  "step": 250
516
  },
517
  {
518
+ "debug/losses": 0.1332876831293106,
519
+ "debug/policy_weights": 0.27705293893814087,
520
+ "debug/raw_losses": 0.5086624622344971,
521
  "epoch": 0.54,
522
+ "grad_norm": 7.179715362550834,
523
  "learning_rate": 2.55479083351317e-07,
524
+ "logits/chosen": -2.316641330718994,
525
+ "logits/rejected": -2.3014323711395264,
526
+ "logps/chosen": -410.114990234375,
527
+ "logps/rejected": -454.8689880371094,
528
+ "loss": 0.1406,
529
+ "rewards/accuracies": 0.762499988079071,
530
+ "rewards/chosen": -1.2414360046386719,
531
+ "rewards/margins": 0.8403455018997192,
532
+ "rewards/rejected": -2.0817813873291016,
533
  "step": 260
534
  },
535
  {
536
+ "debug/losses": 0.168529212474823,
537
+ "debug/policy_weights": 0.2988266348838806,
538
+ "debug/raw_losses": 0.5682334899902344,
539
  "epoch": 0.56,
540
+ "grad_norm": 7.216808591858915,
541
  "learning_rate": 2.3722002126275822e-07,
542
+ "logits/chosen": -2.3377718925476074,
543
+ "logits/rejected": -2.3152408599853516,
544
+ "logps/chosen": -379.34661865234375,
545
+ "logps/rejected": -430.7207946777344,
546
+ "loss": 0.1519,
547
+ "rewards/accuracies": 0.675000011920929,
548
+ "rewards/chosen": -1.0926826000213623,
549
+ "rewards/margins": 0.6951464414596558,
550
+ "rewards/rejected": -1.7878293991088867,
551
  "step": 270
552
  },
553
  {
554
+ "debug/losses": 0.15156733989715576,
555
+ "debug/policy_weights": 0.2692407965660095,
556
+ "debug/raw_losses": 0.5413981080055237,
557
  "epoch": 0.59,
558
+ "grad_norm": 9.464551455783996,
559
  "learning_rate": 2.19029145890313e-07,
560
+ "logits/chosen": -2.295574426651001,
561
+ "logits/rejected": -2.2587995529174805,
562
+ "logps/chosen": -383.4193420410156,
563
+ "logps/rejected": -444.91680908203125,
564
+ "loss": 0.1498,
565
+ "rewards/accuracies": 0.731249988079071,
566
+ "rewards/chosen": -1.302856683731079,
567
+ "rewards/margins": 0.7735105156898499,
568
+ "rewards/rejected": -2.076367139816284,
569
  "step": 280
570
  },
571
  {
572
+ "debug/losses": 0.14380133152008057,
573
+ "debug/policy_weights": 0.26454827189445496,
574
+ "debug/raw_losses": 0.5405210256576538,
575
  "epoch": 0.61,
576
+ "grad_norm": 7.542791313020671,
577
  "learning_rate": 2.0100351342479216e-07,
578
+ "logits/chosen": -2.3062846660614014,
579
+ "logits/rejected": -2.2651028633117676,
580
+ "logps/chosen": -386.890625,
581
+ "logps/rejected": -450.9478454589844,
582
+ "loss": 0.1352,
583
+ "rewards/accuracies": 0.737500011920929,
584
+ "rewards/chosen": -1.4489481449127197,
585
+ "rewards/margins": 0.7288294434547424,
586
+ "rewards/rejected": -2.1777775287628174,
587
  "step": 290
588
  },
589
  {
590
+ "debug/losses": 0.14622345566749573,
591
+ "debug/policy_weights": 0.2636463940143585,
592
+ "debug/raw_losses": 0.536266028881073,
593
  "epoch": 0.63,
594
+ "grad_norm": 7.687631612230176,
595
  "learning_rate": 1.8323929841460178e-07,
596
+ "logits/chosen": -2.2653095722198486,
597
+ "logits/rejected": -2.2190821170806885,
598
+ "logps/chosen": -444.52960205078125,
599
+ "logps/rejected": -489.06817626953125,
600
+ "loss": 0.1343,
601
+ "rewards/accuracies": 0.699999988079071,
602
+ "rewards/chosen": -1.5472103357315063,
603
+ "rewards/margins": 0.8187928199768066,
604
+ "rewards/rejected": -2.3660032749176025,
605
  "step": 300
606
  },
607
  {
608
  "epoch": 0.63,
609
+ "eval_debug/losses": 0.13651318848133087,
610
+ "eval_debug/policy_weights": 0.26656585931777954,
611
+ "eval_debug/raw_losses": 0.49531808495521545,
612
+ "eval_logits/chosen": -2.2618136405944824,
613
+ "eval_logits/rejected": -2.228300094604492,
614
+ "eval_logps/chosen": -405.40423583984375,
615
+ "eval_logps/rejected": -493.411376953125,
616
+ "eval_loss": 0.13676287233829498,
617
+ "eval_rewards/accuracies": 0.7578125,
618
+ "eval_rewards/chosen": -1.5020815134048462,
619
+ "eval_rewards/margins": 0.8694581985473633,
620
+ "eval_rewards/rejected": -2.371539831161499,
621
+ "eval_runtime": 43.5081,
622
+ "eval_samples_per_second": 45.968,
623
+ "eval_steps_per_second": 0.735,
624
  "step": 300
625
  },
626
  {
627
+ "debug/losses": 0.1321336030960083,
628
+ "debug/policy_weights": 0.2647426724433899,
629
+ "debug/raw_losses": 0.4956802725791931,
630
  "epoch": 0.65,
631
+ "grad_norm": 7.197171827503445,
632
  "learning_rate": 1.6583128063291573e-07,
633
+ "logits/chosen": -2.186096429824829,
634
+ "logits/rejected": -2.185868501663208,
635
+ "logps/chosen": -443.67803955078125,
636
+ "logps/rejected": -486.25091552734375,
637
+ "loss": 0.1268,
638
+ "rewards/accuracies": 0.768750011920929,
639
+ "rewards/chosen": -1.5781536102294922,
640
+ "rewards/margins": 0.7728461027145386,
641
+ "rewards/rejected": -2.3510000705718994,
642
  "step": 310
643
  },
644
  {
645
+ "debug/losses": 0.13546636700630188,
646
+ "debug/policy_weights": 0.2598869204521179,
647
+ "debug/raw_losses": 0.5187257528305054,
648
  "epoch": 0.67,
649
+ "grad_norm": 7.989288502873238,
650
  "learning_rate": 1.488723393865766e-07,
651
+ "logits/chosen": -2.2410237789154053,
652
+ "logits/rejected": -2.2062346935272217,
653
+ "logps/chosen": -433.8523864746094,
654
+ "logps/rejected": -467.3929138183594,
655
+ "loss": 0.1308,
656
+ "rewards/accuracies": 0.7124999761581421,
657
+ "rewards/chosen": -1.493381142616272,
658
+ "rewards/margins": 0.7878578901290894,
659
+ "rewards/rejected": -2.2812390327453613,
660
  "step": 320
661
  },
662
  {
663
+ "debug/losses": 0.14906349778175354,
664
+ "debug/policy_weights": 0.2727554440498352,
665
+ "debug/raw_losses": 0.5552006959915161,
666
  "epoch": 0.69,
667
+ "grad_norm": 6.170487059058624,
668
  "learning_rate": 1.3245295796480788e-07,
669
+ "logits/chosen": -2.236531972885132,
670
+ "logits/rejected": -2.1864519119262695,
671
+ "logps/chosen": -390.4073791503906,
672
+ "logps/rejected": -459.62347412109375,
673
+ "loss": 0.1411,
674
+ "rewards/accuracies": 0.7124999761581421,
675
+ "rewards/chosen": -1.373223066329956,
676
+ "rewards/margins": 0.6836794018745422,
677
+ "rewards/rejected": -2.0569024085998535,
678
  "step": 330
679
  },
680
  {
681
+ "debug/losses": 0.1494503617286682,
682
+ "debug/policy_weights": 0.28068724274635315,
683
+ "debug/raw_losses": 0.4996885359287262,
684
  "epoch": 0.71,
685
+ "grad_norm": 9.416700000227674,
686
  "learning_rate": 1.1666074087171627e-07,
687
+ "logits/chosen": -2.2197887897491455,
688
+ "logits/rejected": -2.1604628562927246,
689
+ "logps/chosen": -418.9659118652344,
690
+ "logps/rejected": -511.0225524902344,
691
+ "loss": 0.1446,
692
+ "rewards/accuracies": 0.800000011920929,
693
+ "rewards/chosen": -1.3679134845733643,
694
+ "rewards/margins": 1.0408259630203247,
695
+ "rewards/rejected": -2.4087395668029785,
696
  "step": 340
697
  },
698
  {
699
+ "debug/losses": 0.12328938394784927,
700
+ "debug/policy_weights": 0.24812527000904083,
701
+ "debug/raw_losses": 0.48738375306129456,
702
  "epoch": 0.73,
703
+ "grad_norm": 8.40666805347326,
704
  "learning_rate": 1.0157994641835734e-07,
705
+ "logits/chosen": -2.172837018966675,
706
+ "logits/rejected": -2.1325600147247314,
707
+ "logps/chosen": -398.5860900878906,
708
+ "logps/rejected": -471.38397216796875,
709
+ "loss": 0.1286,
710
  "rewards/accuracies": 0.7124999761581421,
711
+ "rewards/chosen": -1.4896047115325928,
712
+ "rewards/margins": 0.9131780862808228,
713
+ "rewards/rejected": -2.402782917022705,
714
  "step": 350
715
  },
716
  {
717
+ "debug/losses": 0.1280422806739807,
718
+ "debug/policy_weights": 0.2567124366760254,
719
+ "debug/raw_losses": 0.5079829692840576,
720
  "epoch": 0.75,
721
+ "grad_norm": 7.5652554505895955,
722
  "learning_rate": 8.729103716819111e-08,
723
+ "logits/chosen": -2.230677366256714,
724
+ "logits/rejected": -2.138909101486206,
725
+ "logps/chosen": -448.53179931640625,
726
+ "logps/rejected": -494.7422790527344,
727
+ "loss": 0.1317,
728
+ "rewards/accuracies": 0.7437499761581421,
729
+ "rewards/chosen": -1.5306211709976196,
730
+ "rewards/margins": 0.8633092641830444,
731
+ "rewards/rejected": -2.393930435180664,
732
  "step": 360
733
  },
734
  {
735
+ "debug/losses": 0.13079939782619476,
736
+ "debug/policy_weights": 0.23427358269691467,
737
+ "debug/raw_losses": 0.5890456438064575,
738
  "epoch": 0.77,
739
+ "grad_norm": 9.87793971826204,
740
  "learning_rate": 7.387025063449081e-08,
741
+ "logits/chosen": -2.167722225189209,
742
+ "logits/rejected": -2.1336138248443604,
743
+ "logps/chosen": -432.41314697265625,
744
+ "logps/rejected": -470.364501953125,
745
+ "loss": 0.1313,
746
+ "rewards/accuracies": 0.71875,
747
+ "rewards/chosen": -1.6386140584945679,
748
+ "rewards/margins": 0.7472943067550659,
749
+ "rewards/rejected": -2.385908365249634,
750
  "step": 370
751
  },
752
  {
753
+ "debug/losses": 0.1057448610663414,
754
+ "debug/policy_weights": 0.2194373905658722,
755
+ "debug/raw_losses": 0.5027541518211365,
756
  "epoch": 0.79,
757
+ "grad_norm": 7.002597773553437,
758
  "learning_rate": 6.138919252022435e-08,
759
+ "logits/chosen": -2.093229055404663,
760
+ "logits/rejected": -2.0845484733581543,
761
+ "logps/chosen": -409.2686462402344,
762
+ "logps/rejected": -525.614013671875,
763
+ "loss": 0.1212,
764
+ "rewards/accuracies": 0.75,
765
+ "rewards/chosen": -1.7782268524169922,
766
+ "rewards/margins": 0.96544349193573,
767
+ "rewards/rejected": -2.743670701980591,
768
  "step": 380
769
  },
770
  {
771
+ "debug/losses": 0.14649717509746552,
772
+ "debug/policy_weights": 0.27705007791519165,
773
+ "debug/raw_losses": 0.5246635675430298,
774
  "epoch": 0.82,
775
+ "grad_norm": 7.870379900484857,
776
  "learning_rate": 4.991445467064689e-08,
777
+ "logits/chosen": -2.1749279499053955,
778
+ "logits/rejected": -2.138232946395874,
779
+ "logps/chosen": -439.6005859375,
780
+ "logps/rejected": -510.23260498046875,
781
+ "loss": 0.1384,
782
+ "rewards/accuracies": 0.71875,
783
+ "rewards/chosen": -1.4645540714263916,
784
+ "rewards/margins": 0.8761545419692993,
785
+ "rewards/rejected": -2.3407082557678223,
786
  "step": 390
787
  },
788
  {
789
+ "debug/losses": 0.1481957882642746,
790
+ "debug/policy_weights": 0.25738272070884705,
791
+ "debug/raw_losses": 0.5208276510238647,
792
  "epoch": 0.84,
793
+ "grad_norm": 7.317947546801369,
794
  "learning_rate": 3.9507259776993954e-08,
795
+ "logits/chosen": -2.1648905277252197,
796
+ "logits/rejected": -2.1050162315368652,
797
+ "logps/chosen": -416.4757385253906,
798
+ "logps/rejected": -496.00927734375,
799
+ "loss": 0.1398,
800
+ "rewards/accuracies": 0.75,
801
+ "rewards/chosen": -1.536012887954712,
802
+ "rewards/margins": 0.8311750292778015,
803
+ "rewards/rejected": -2.367187976837158,
804
  "step": 400
805
  },
806
  {
807
  "epoch": 0.84,
808
+ "eval_debug/losses": 0.13974608480930328,
809
+ "eval_debug/policy_weights": 0.28152135014533997,
810
+ "eval_debug/raw_losses": 0.47273018956184387,
811
+ "eval_logits/chosen": -2.2036194801330566,
812
+ "eval_logits/rejected": -2.1694607734680176,
813
+ "eval_logps/chosen": -386.73663330078125,
814
+ "eval_logps/rejected": -483.9327392578125,
815
+ "eval_loss": 0.14222779870033264,
816
+ "eval_rewards/accuracies": 0.76171875,
817
+ "eval_rewards/chosen": -1.3154058456420898,
818
+ "eval_rewards/margins": 0.9613481163978577,
819
+ "eval_rewards/rejected": -2.2767536640167236,
820
+ "eval_runtime": 98.9464,
821
+ "eval_samples_per_second": 20.213,
822
+ "eval_steps_per_second": 0.323,
823
  "step": 400
824
  },
825
  {
826
+ "debug/losses": 0.13129359483718872,
827
+ "debug/policy_weights": 0.25963759422302246,
828
+ "debug/raw_losses": 0.5042656660079956,
829
  "epoch": 0.86,
830
+ "grad_norm": 7.093812921964164,
831
  "learning_rate": 3.022313472693447e-08,
832
+ "logits/chosen": -2.2121942043304443,
833
+ "logits/rejected": -2.1647019386291504,
834
+ "logps/chosen": -432.4383850097656,
835
+ "logps/rejected": -489.4990234375,
836
+ "loss": 0.1404,
837
+ "rewards/accuracies": 0.75,
838
+ "rewards/chosen": -1.4152858257293701,
839
+ "rewards/margins": 0.8453054428100586,
840
+ "rewards/rejected": -2.260591506958008,
841
  "step": 410
842
  },
843
  {
844
+ "debug/losses": 0.15901803970336914,
845
+ "debug/policy_weights": 0.2908507287502289,
846
+ "debug/raw_losses": 0.5441660284996033,
847
  "epoch": 0.88,
848
+ "grad_norm": 7.194889894590537,
849
  "learning_rate": 2.2111614344599684e-08,
850
+ "logits/chosen": -2.1668639183044434,
851
+ "logits/rejected": -2.1467199325561523,
852
+ "logps/chosen": -427.835693359375,
853
+ "logps/rejected": -492.3506774902344,
854
+ "loss": 0.1412,
855
+ "rewards/accuracies": 0.6937500238418579,
856
+ "rewards/chosen": -1.4115890264511108,
857
+ "rewards/margins": 0.840238094329834,
858
+ "rewards/rejected": -2.2518272399902344,
859
  "step": 420
860
  },
861
  {
862
+ "debug/losses": 0.1401083916425705,
863
+ "debug/policy_weights": 0.2716709077358246,
864
+ "debug/raw_losses": 0.49931907653808594,
865
  "epoch": 0.9,
866
+ "grad_norm": 6.971935536096578,
867
  "learning_rate": 1.521597710086439e-08,
868
+ "logits/chosen": -2.106959342956543,
869
+ "logits/rejected": -2.051405668258667,
870
+ "logps/chosen": -412.853759765625,
871
+ "logps/rejected": -481.4043884277344,
872
+ "loss": 0.1291,
873
+ "rewards/accuracies": 0.7875000238418579,
874
+ "rewards/chosen": -1.4160857200622559,
875
+ "rewards/margins": 0.8900654911994934,
876
+ "rewards/rejected": -2.3061509132385254,
877
  "step": 430
878
  },
879
  {
880
+ "debug/losses": 0.14032089710235596,
881
+ "debug/policy_weights": 0.28002962470054626,
882
+ "debug/raw_losses": 0.4943667948246002,
883
  "epoch": 0.92,
884
+ "grad_norm": 7.598363791526828,
885
  "learning_rate": 9.57301420397924e-09,
886
+ "logits/chosen": -2.1854939460754395,
887
+ "logits/rejected": -2.1297545433044434,
888
+ "logps/chosen": -409.72625732421875,
889
+ "logps/rejected": -484.60546875,
890
+ "loss": 0.1401,
891
+ "rewards/accuracies": 0.737500011920929,
892
+ "rewards/chosen": -1.338160753250122,
893
+ "rewards/margins": 0.8834611773490906,
894
+ "rewards/rejected": -2.2216219902038574,
895
  "step": 440
896
  },
897
  {
898
+ "debug/losses": 0.1532571017742157,
899
+ "debug/policy_weights": 0.28182727098464966,
900
+ "debug/raw_losses": 0.5403295159339905,
901
  "epoch": 0.94,
902
+ "grad_norm": 7.420969377583743,
903
  "learning_rate": 5.212833302556258e-09,
904
+ "logits/chosen": -2.108179807662964,
905
+ "logits/rejected": -2.0896925926208496,
906
+ "logps/chosen": -421.1512756347656,
907
+ "logps/rejected": -522.5540771484375,
908
+ "loss": 0.1398,
909
+ "rewards/accuracies": 0.699999988079071,
910
+ "rewards/chosen": -1.3998409509658813,
911
+ "rewards/margins": 0.8198366165161133,
912
+ "rewards/rejected": -2.219677448272705,
913
  "step": 450
914
  },
915
  {
916
+ "debug/losses": 0.14014151692390442,
917
+ "debug/policy_weights": 0.27090734243392944,
918
+ "debug/raw_losses": 0.5257419347763062,
919
  "epoch": 0.96,
920
+ "grad_norm": 6.950959627699968,
921
  "learning_rate": 2.158697848236607e-09,
922
+ "logits/chosen": -2.1658027172088623,
923
+ "logits/rejected": -2.1314563751220703,
924
+ "logps/chosen": -402.38995361328125,
925
+ "logps/rejected": -448.8475646972656,
926
+ "loss": 0.1362,
927
+ "rewards/accuracies": 0.71875,
928
+ "rewards/chosen": -1.3720080852508545,
929
+ "rewards/margins": 0.7829836010932922,
930
+ "rewards/rejected": -2.154991626739502,
931
  "step": 460
932
  },
933
  {
934
+ "debug/losses": 0.14437520503997803,
935
+ "debug/policy_weights": 0.27037832140922546,
936
+ "debug/raw_losses": 0.5343233942985535,
937
  "epoch": 0.98,
938
+ "grad_norm": 8.44671954335882,
939
  "learning_rate": 4.269029751107489e-10,
940
+ "logits/chosen": -2.151285171508789,
941
+ "logits/rejected": -2.119642972946167,
942
+ "logps/chosen": -403.640625,
943
+ "logps/rejected": -483.1455078125,
944
+ "loss": 0.1371,
945
+ "rewards/accuracies": 0.7124999761581421,
946
+ "rewards/chosen": -1.3858269453048706,
947
+ "rewards/margins": 0.757634699344635,
948
+ "rewards/rejected": -2.1434617042541504,
949
  "step": 470
950
  },
951
  {
952
  "epoch": 1.0,
953
  "step": 478,
954
  "total_flos": 0.0,
955
+ "train_loss": 0.17903520272865456,
956
+ "train_runtime": 4631.245,
957
+ "train_samples_per_second": 13.2,
958
+ "train_steps_per_second": 0.103
959
  }
960
  ],
961
  "logging_steps": 10,
962
  "max_steps": 478,
963
+ "num_input_tokens_seen": 0,
964
  "num_train_epochs": 1,
965
  "save_steps": 100,
966
  "total_flos": 0.0,
967
+ "train_batch_size": 8,
968
  "trial_name": null,
969
  "trial_params": null
970
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9719e3f0efcf7076e0c24a28a8de3b015157defd25e092d527d5719a985b314c
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399bbe17d43609818d6de82c511649d9abdfbf26f93b2196d4e24cacea1c61ee
3
+ size 6328