wzhouad commited on
Commit
f81992b
1 Parent(s): ba49b4e

Model save

Browse files
README.md CHANGED
@@ -14,16 +14,6 @@ should probably proofread and complete it, then remove this comment. -->
14
  # zephyr-7b-dpo-full
15
 
16
  This model was trained from scratch on the None dataset.
17
- It achieves the following results on the evaluation set:
18
- - Loss: 0.5261
19
- - Rewards/chosen: -2.4591
20
- - Rewards/rejected: -3.9221
21
- - Rewards/accuracies: 0.7773
22
- - Rewards/margins: 1.4631
23
- - Logps/rejected: -703.8400
24
- - Logps/chosen: -549.4910
25
- - Logits/rejected: 0.0289
26
- - Logits/chosen: 0.0663
27
 
28
  ## Model description
29
 
@@ -43,12 +33,12 @@ More information needed
43
 
44
  The following hyperparameters were used during training:
45
  - learning_rate: 1e-06
46
- - train_batch_size: 4
47
  - eval_batch_size: 8
48
- - seed: 2
49
  - distributed_type: multi-GPU
50
  - num_devices: 8
51
- - gradient_accumulation_steps: 4
52
  - total_train_batch_size: 128
53
  - total_eval_batch_size: 64
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
@@ -58,17 +48,6 @@ The following hyperparameters were used during training:
58
 
59
  ### Training results
60
 
61
- | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
- |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.6201 | 0.21 | 100 | 0.6253 | -0.2753 | -0.6662 | 0.7031 | 0.3909 | -378.2405 | -331.1124 | 0.4172 | 0.3706 |
64
- | 0.5547 | 0.42 | 200 | 0.5549 | -0.6988 | -1.4726 | 0.7656 | 0.7738 | -458.8863 | -373.4661 | 0.4261 | 0.3909 |
65
- | 0.5343 | 0.63 | 300 | 0.5316 | -0.8044 | -1.6474 | 0.7656 | 0.8430 | -476.3628 | -384.0199 | 0.2851 | 0.2449 |
66
- | 0.5323 | 0.84 | 400 | 0.5211 | -0.9068 | -1.8283 | 0.7812 | 0.9216 | -494.4600 | -394.2621 | 0.2834 | 0.2514 |
67
- | 0.352 | 1.05 | 500 | 0.5258 | -1.9533 | -3.4166 | 0.7969 | 1.4634 | -653.2899 | -498.9117 | -0.0846 | -0.0654 |
68
- | 0.3342 | 1.26 | 600 | 0.5268 | -2.3123 | -3.7246 | 0.7930 | 1.4124 | -684.0857 | -534.8101 | 0.1128 | 0.1344 |
69
- | 0.337 | 1.47 | 700 | 0.5290 | -2.3753 | -3.8837 | 0.7773 | 1.5084 | -699.9910 | -541.1116 | 0.0099 | 0.0414 |
70
- | 0.3398 | 1.67 | 800 | 0.5297 | -2.5097 | -4.0133 | 0.7734 | 1.5036 | -712.9506 | -554.5546 | 0.0381 | 0.0750 |
71
- | 0.307 | 1.88 | 900 | 0.5261 | -2.4591 | -3.9221 | 0.7773 | 1.4631 | -703.8400 | -549.4910 | 0.0289 | 0.0663 |
72
 
73
 
74
  ### Framework versions
 
14
  # zephyr-7b-dpo-full
15
 
16
  This model was trained from scratch on the None dataset.
 
 
 
 
 
 
 
 
 
 
17
 
18
  ## Model description
19
 
 
33
 
34
  The following hyperparameters were used during training:
35
  - learning_rate: 1e-06
36
+ - train_batch_size: 2
37
  - eval_batch_size: 8
38
+ - seed: 4
39
  - distributed_type: multi-GPU
40
  - num_devices: 8
41
+ - gradient_accumulation_steps: 8
42
  - total_train_batch_size: 128
43
  - total_eval_batch_size: 64
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 
48
 
49
  ### Training results
50
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
 
53
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.44779854000739333,
4
- "train_runtime": 8782.9823,
5
- "train_samples": 61134,
6
- "train_samples_per_second": 13.921,
7
- "train_steps_per_second": 0.109
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.5396277803770253,
4
+ "train_runtime": 10358.1577,
5
+ "train_samples": 45548,
6
+ "train_samples_per_second": 8.795,
7
+ "train_steps_per_second": 0.069
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac6928598b87aa9e7ace3c1f94d7df4eaa7164f92df0fda9603ca217542e9949
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20c73d37fa1abc7f882494b68d610836beb09ee5e90cce2d4b4576eb264bd637
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1832fd3b8ecfcb01bbcfbb4425fba1dbd3c57767b6fe8fe806acc6e01654a97b
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afe87beb2104fe94743a7aad2eaaf684b7d6d4a1bc13e5a8e968ee40e87d1576
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a888c6fcef84f2cb517c57068441aa0a6ba00c8bfc4bd4c7155e79556f8133b9
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f5ac4165fed09e88facc67c437f2ed004803890e8af5b9d37e433d9f3ae3649
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2d194b9549d875e2494134696a27103429d2e21501246a202a7415a40b5de72
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76643d477c833c3cd28dbb514282eada09b9314c8d0ae879cfa39e575f941a0c
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.44779854000739333,
4
- "train_runtime": 8782.9823,
5
- "train_samples": 61134,
6
- "train_samples_per_second": 13.921,
7
- "train_steps_per_second": 0.109
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.5396277803770253,
4
+ "train_runtime": 10358.1577,
5
+ "train_samples": 45548,
6
+ "train_samples_per_second": 8.795,
7
+ "train_steps_per_second": 0.069
8
  }
trainer_state.json CHANGED
@@ -1,1501 +1,1021 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9968602825745683,
5
- "eval_steps": 100,
6
- "global_step": 954,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "learning_rate": 1.0416666666666667e-07,
14
- "logits/chosen": 0.29368966817855835,
15
- "logits/rejected": 0.3178113102912903,
16
- "logps/chosen": -295.21783447265625,
17
- "logps/rejected": -290.84619140625,
18
- "loss": 0.6933,
19
- "rewards/accuracies": 0.4000000059604645,
20
- "rewards/chosen": 0.00023447822604794055,
21
- "rewards/margins": 0.00020264319027774036,
22
- "rewards/rejected": 3.183506123605184e-05,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.04,
27
- "learning_rate": 2.0833333333333333e-07,
28
- "logits/chosen": 0.25306791067123413,
29
- "logits/rejected": 0.3252382278442383,
30
- "logps/chosen": -318.19073486328125,
31
- "logps/rejected": -289.6706237792969,
32
- "loss": 0.6929,
33
- "rewards/accuracies": 0.53125,
34
- "rewards/chosen": 0.00048495858209207654,
35
- "rewards/margins": 0.0009848512709140778,
36
- "rewards/rejected": -0.0004998926888220012,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.06,
41
- "learning_rate": 3.1249999999999997e-07,
42
- "logits/chosen": 0.33226653933525085,
43
- "logits/rejected": 0.3872108459472656,
44
- "logps/chosen": -296.3697204589844,
45
- "logps/rejected": -283.0611877441406,
46
- "loss": 0.692,
47
- "rewards/accuracies": 0.550000011920929,
48
- "rewards/chosen": -0.0008094090735539794,
49
- "rewards/margins": 0.0026363185606896877,
50
- "rewards/rejected": -0.003445727750658989,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.08,
55
- "learning_rate": 4.1666666666666667e-07,
56
- "logits/chosen": 0.22801101207733154,
57
- "logits/rejected": 0.32900214195251465,
58
- "logps/chosen": -305.9015197753906,
59
- "logps/rejected": -293.1842346191406,
60
- "loss": 0.6883,
61
- "rewards/accuracies": 0.643750011920929,
62
- "rewards/chosen": -0.0027595984283834696,
63
- "rewards/margins": 0.006589935161173344,
64
- "rewards/rejected": -0.00934953335672617,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.1,
69
- "learning_rate": 5.208333333333334e-07,
70
- "logits/chosen": 0.27091675996780396,
71
- "logits/rejected": 0.31866759061813354,
72
- "logps/chosen": -314.2833557128906,
73
- "logps/rejected": -307.02532958984375,
74
- "loss": 0.683,
75
- "rewards/accuracies": 0.6312500238418579,
76
- "rewards/chosen": -0.0060192132368683815,
77
- "rewards/margins": 0.019419629126787186,
78
- "rewards/rejected": -0.025438839569687843,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.13,
83
- "learning_rate": 6.249999999999999e-07,
84
- "logits/chosen": 0.31704145669937134,
85
- "logits/rejected": 0.4334793984889984,
86
- "logps/chosen": -294.429931640625,
87
- "logps/rejected": -272.87994384765625,
88
- "loss": 0.6735,
89
- "rewards/accuracies": 0.6312500238418579,
90
- "rewards/chosen": -0.022902976721525192,
91
- "rewards/margins": 0.04408121109008789,
92
- "rewards/rejected": -0.06698418408632278,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.15,
97
- "learning_rate": 7.291666666666666e-07,
98
- "logits/chosen": 0.31964099407196045,
99
- "logits/rejected": 0.3377896547317505,
100
- "logps/chosen": -304.6803894042969,
101
- "logps/rejected": -311.32794189453125,
102
- "loss": 0.6602,
103
- "rewards/accuracies": 0.6312500238418579,
104
- "rewards/chosen": -0.05670947954058647,
105
- "rewards/margins": 0.05069820210337639,
106
- "rewards/rejected": -0.10740767419338226,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.17,
111
- "learning_rate": 8.333333333333333e-07,
112
- "logits/chosen": 0.35048729181289673,
113
- "logits/rejected": 0.4193252921104431,
114
- "logps/chosen": -306.3404541015625,
115
- "logps/rejected": -282.4783020019531,
116
- "loss": 0.6474,
117
- "rewards/accuracies": 0.643750011920929,
118
- "rewards/chosen": -0.10853584855794907,
119
- "rewards/margins": 0.13117292523384094,
120
- "rewards/rejected": -0.2397087812423706,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.19,
125
- "learning_rate": 9.374999999999999e-07,
126
- "logits/chosen": 0.32813602685928345,
127
- "logits/rejected": 0.4464220404624939,
128
- "logps/chosen": -341.1703186035156,
129
- "logps/rejected": -299.92340087890625,
130
- "loss": 0.6357,
131
- "rewards/accuracies": 0.6499999761581421,
132
- "rewards/chosen": -0.2554694712162018,
133
- "rewards/margins": 0.12284588813781738,
134
- "rewards/rejected": -0.37831538915634155,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.21,
139
- "learning_rate": 9.999463737538052e-07,
140
- "logits/chosen": 0.35799938440322876,
141
- "logits/rejected": 0.3899138271808624,
142
- "logps/chosen": -318.7712097167969,
143
- "logps/rejected": -348.5688781738281,
144
- "loss": 0.6201,
145
- "rewards/accuracies": 0.6312500238418579,
146
- "rewards/chosen": -0.3124231994152069,
147
- "rewards/margins": 0.19829413294792175,
148
- "rewards/rejected": -0.5107173323631287,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.21,
153
- "eval_logits/chosen": 0.3705582916736603,
154
- "eval_logits/rejected": 0.4172414541244507,
155
- "eval_logps/chosen": -331.11236572265625,
156
- "eval_logps/rejected": -378.240478515625,
157
- "eval_loss": 0.6252639293670654,
158
- "eval_rewards/accuracies": 0.703125,
159
- "eval_rewards/chosen": -0.27527713775634766,
160
- "eval_rewards/margins": 0.3908771872520447,
161
- "eval_rewards/rejected": -0.6661543846130371,
162
- "eval_runtime": 64.993,
163
- "eval_samples_per_second": 30.773,
164
- "eval_steps_per_second": 0.492,
165
- "step": 100
166
- },
167
- {
168
- "epoch": 0.23,
169
- "learning_rate": 9.993432105822034e-07,
170
- "logits/chosen": 0.3002661168575287,
171
- "logits/rejected": 0.3483879864215851,
172
- "logps/chosen": -350.9095458984375,
173
- "logps/rejected": -360.30963134765625,
174
- "loss": 0.6093,
175
- "rewards/accuracies": 0.706250011920929,
176
- "rewards/chosen": -0.37449145317077637,
177
- "rewards/margins": 0.3816668689250946,
178
- "rewards/rejected": -0.7561584115028381,
179
  "step": 110
180
  },
181
  {
182
- "epoch": 0.25,
183
- "learning_rate": 9.980706626858607e-07,
184
- "logits/chosen": 0.2088731825351715,
185
- "logits/rejected": 0.2899537980556488,
186
- "logps/chosen": -395.3984069824219,
187
- "logps/rejected": -433.7286682128906,
188
- "loss": 0.5905,
189
- "rewards/accuracies": 0.6875,
190
- "rewards/chosen": -0.4185329079627991,
191
- "rewards/margins": 0.4517739713191986,
192
- "rewards/rejected": -0.8703069686889648,
193
  "step": 120
194
  },
195
  {
196
- "epoch": 0.27,
197
- "learning_rate": 9.961304359538434e-07,
198
- "logits/chosen": 0.09751267731189728,
199
- "logits/rejected": 0.22797170281410217,
200
- "logps/chosen": -374.6502380371094,
201
- "logps/rejected": -373.14263916015625,
202
- "loss": 0.5998,
203
- "rewards/accuracies": 0.6312500238418579,
204
- "rewards/chosen": -0.4483565390110016,
205
- "rewards/margins": 0.3120550811290741,
206
- "rewards/rejected": -0.7604116201400757,
207
  "step": 130
208
  },
209
  {
210
- "epoch": 0.29,
211
- "learning_rate": 9.935251313189563e-07,
212
- "logits/chosen": 0.23218217492103577,
213
- "logits/rejected": 0.28826406598091125,
214
- "logps/chosen": -363.15338134765625,
215
- "logps/rejected": -381.83795166015625,
216
- "loss": 0.5847,
217
- "rewards/accuracies": 0.699999988079071,
218
- "rewards/chosen": -0.49422675371170044,
219
- "rewards/margins": 0.38399404287338257,
220
- "rewards/rejected": -0.8782208561897278,
221
  "step": 140
222
  },
223
  {
224
- "epoch": 0.31,
225
- "learning_rate": 9.902582412711118e-07,
226
- "logits/chosen": 0.29499703645706177,
227
- "logits/rejected": 0.49714046716690063,
228
- "logps/chosen": -342.9471130371094,
229
- "logps/rejected": -372.1763610839844,
230
- "loss": 0.5681,
231
- "rewards/accuracies": 0.699999988079071,
232
- "rewards/chosen": -0.6186890602111816,
233
- "rewards/margins": 0.5347500443458557,
234
- "rewards/rejected": -1.1534390449523926,
235
  "step": 150
236
  },
237
  {
238
- "epoch": 0.33,
239
- "learning_rate": 9.86334145175542e-07,
240
- "logits/chosen": 0.35773637890815735,
241
- "logits/rejected": 0.4941268861293793,
242
- "logps/chosen": -370.3203125,
243
- "logps/rejected": -428.2557067871094,
244
- "loss": 0.5792,
245
- "rewards/accuracies": 0.7437499761581421,
246
- "rewards/chosen": -0.7682583928108215,
247
- "rewards/margins": 0.6092099547386169,
248
- "rewards/rejected": -1.377468228340149,
249
  "step": 160
250
  },
251
  {
252
- "epoch": 0.36,
253
- "learning_rate": 9.817581034021272e-07,
254
- "logits/chosen": 0.19401590526103973,
255
- "logits/rejected": 0.3156794607639313,
256
- "logps/chosen": -415.65313720703125,
257
- "logps/rejected": -425.68963623046875,
258
- "loss": 0.5747,
259
- "rewards/accuracies": 0.731249988079071,
260
- "rewards/chosen": -0.5680743455886841,
261
- "rewards/margins": 0.4286484122276306,
262
- "rewards/rejected": -0.9967228174209595,
263
  "step": 170
264
  },
265
  {
266
- "epoch": 0.38,
267
- "learning_rate": 9.765362502737097e-07,
268
- "logits/chosen": 0.2572034001350403,
269
- "logits/rejected": 0.27328386902809143,
270
- "logps/chosen": -358.0533447265625,
271
- "logps/rejected": -398.5332946777344,
272
- "loss": 0.5634,
273
- "rewards/accuracies": 0.699999988079071,
274
- "rewards/chosen": -0.644694447517395,
275
- "rewards/margins": 0.5251447558403015,
276
- "rewards/rejected": -1.1698391437530518,
277
  "step": 180
278
  },
279
  {
280
- "epoch": 0.4,
281
- "learning_rate": 9.706755858428485e-07,
282
- "logits/chosen": 0.3962785303592682,
283
- "logits/rejected": 0.4539657235145569,
284
- "logps/chosen": -347.5164794921875,
285
- "logps/rejected": -378.83184814453125,
286
- "loss": 0.5423,
287
- "rewards/accuracies": 0.71875,
288
- "rewards/chosen": -0.7963652610778809,
289
- "rewards/margins": 0.5720622539520264,
290
- "rewards/rejected": -1.3684275150299072,
291
  "step": 190
292
  },
293
  {
294
- "epoch": 0.42,
295
- "learning_rate": 9.641839665080363e-07,
296
- "logits/chosen": 0.3198946714401245,
297
- "logits/rejected": 0.4063253402709961,
298
- "logps/chosen": -352.0743713378906,
299
- "logps/rejected": -419.7388610839844,
300
- "loss": 0.5547,
301
- "rewards/accuracies": 0.7250000238418579,
302
- "rewards/chosen": -0.7651049494743347,
303
- "rewards/margins": 0.6531444787979126,
304
- "rewards/rejected": -1.418249487876892,
305
- "step": 200
306
- },
307
- {
308
- "epoch": 0.42,
309
- "eval_logits/chosen": 0.3908616304397583,
310
- "eval_logits/rejected": 0.4261176884174347,
311
- "eval_logps/chosen": -373.46612548828125,
312
- "eval_logps/rejected": -458.88629150390625,
313
- "eval_loss": 0.5549300312995911,
314
- "eval_rewards/accuracies": 0.765625,
315
- "eval_rewards/chosen": -0.6988146305084229,
316
- "eval_rewards/margins": 0.7737974524497986,
317
- "eval_rewards/rejected": -1.4726121425628662,
318
- "eval_runtime": 65.2313,
319
- "eval_samples_per_second": 30.66,
320
- "eval_steps_per_second": 0.491,
321
  "step": 200
322
  },
323
  {
324
- "epoch": 0.44,
325
- "learning_rate": 9.570700944819582e-07,
326
- "logits/chosen": 0.3505176901817322,
327
- "logits/rejected": 0.42375579476356506,
328
- "logps/chosen": -397.61199951171875,
329
- "logps/rejected": -454.89776611328125,
330
- "loss": 0.5265,
331
- "rewards/accuracies": 0.762499988079071,
332
- "rewards/chosen": -0.7523492574691772,
333
- "rewards/margins": 0.7344074845314026,
334
- "rewards/rejected": -1.486756682395935,
335
  "step": 210
336
  },
337
  {
338
- "epoch": 0.46,
339
- "learning_rate": 9.493435061259129e-07,
340
- "logits/chosen": 0.2726442813873291,
341
- "logits/rejected": 0.4434526860713959,
342
- "logps/chosen": -410.77667236328125,
343
- "logps/rejected": -431.207275390625,
344
- "loss": 0.555,
345
- "rewards/accuracies": 0.731249988079071,
346
- "rewards/chosen": -0.8984044194221497,
347
- "rewards/margins": 0.5839776992797852,
348
- "rewards/rejected": -1.48238205909729,
349
  "step": 220
350
  },
351
  {
352
- "epoch": 0.48,
353
- "learning_rate": 9.4101455916603e-07,
354
- "logits/chosen": 0.28837597370147705,
355
- "logits/rejected": 0.35526323318481445,
356
- "logps/chosen": -363.0335998535156,
357
- "logps/rejected": -436.46612548828125,
358
- "loss": 0.5498,
359
- "rewards/accuracies": 0.7124999761581421,
360
- "rewards/chosen": -0.8101651072502136,
361
- "rewards/margins": 0.737695574760437,
362
- "rewards/rejected": -1.5478605031967163,
363
  "step": 230
364
  },
365
  {
366
- "epoch": 0.5,
367
- "learning_rate": 9.320944188084241e-07,
368
- "logits/chosen": 0.23826150596141815,
369
- "logits/rejected": 0.285171240568161,
370
- "logps/chosen": -432.6297912597656,
371
- "logps/rejected": -497.56341552734375,
372
- "loss": 0.5392,
373
- "rewards/accuracies": 0.7437499761581421,
374
- "rewards/chosen": -0.7939236164093018,
375
- "rewards/margins": 0.7111212611198425,
376
- "rewards/rejected": -1.505044937133789,
377
  "step": 240
378
  },
379
  {
380
- "epoch": 0.52,
381
- "learning_rate": 9.225950427718974e-07,
382
- "logits/chosen": 0.2762988209724426,
383
- "logits/rejected": 0.31130319833755493,
384
- "logps/chosen": -399.399169921875,
385
- "logps/rejected": -445.702880859375,
386
- "loss": 0.5465,
387
- "rewards/accuracies": 0.6875,
388
- "rewards/chosen": -0.83611661195755,
389
- "rewards/margins": 0.7073522806167603,
390
- "rewards/rejected": -1.5434690713882446,
391
  "step": 250
392
  },
393
  {
394
- "epoch": 0.54,
395
- "learning_rate": 9.125291652582547e-07,
396
- "logits/chosen": 0.1327328383922577,
397
- "logits/rejected": 0.3085227310657501,
398
- "logps/chosen": -436.08135986328125,
399
- "logps/rejected": -457.634765625,
400
- "loss": 0.5194,
401
- "rewards/accuracies": 0.699999988079071,
402
- "rewards/chosen": -0.9428914785385132,
403
- "rewards/margins": 0.6950392723083496,
404
- "rewards/rejected": -1.6379308700561523,
405
  "step": 260
406
  },
407
  {
408
- "epoch": 0.57,
409
- "learning_rate": 9.019102798817195e-07,
410
- "logits/chosen": 0.23745720088481903,
411
- "logits/rejected": 0.34172096848487854,
412
- "logps/chosen": -421.8299865722656,
413
- "logps/rejected": -466.4856872558594,
414
- "loss": 0.5496,
415
- "rewards/accuracies": 0.71875,
416
- "rewards/chosen": -0.9325121641159058,
417
- "rewards/margins": 0.7680062651634216,
418
- "rewards/rejected": -1.7005186080932617,
419
  "step": 270
420
  },
421
  {
422
- "epoch": 0.59,
423
- "learning_rate": 8.90752621580335e-07,
424
- "logits/chosen": 0.16251161694526672,
425
- "logits/rejected": 0.2581509053707123,
426
- "logps/chosen": -418.5828552246094,
427
- "logps/rejected": -507.22412109375,
428
- "loss": 0.5168,
429
- "rewards/accuracies": 0.7250000238418579,
430
- "rewards/chosen": -1.113261342048645,
431
- "rewards/margins": 0.7757157683372498,
432
- "rewards/rejected": -1.88897705078125,
433
  "step": 280
434
  },
435
  {
436
- "epoch": 0.61,
437
- "learning_rate": 8.79071147533597e-07,
438
- "logits/chosen": 0.18345972895622253,
439
- "logits/rejected": 0.24752414226531982,
440
- "logps/chosen": -374.2388916015625,
441
- "logps/rejected": -421.7548828125,
442
- "loss": 0.5452,
443
- "rewards/accuracies": 0.699999988079071,
444
- "rewards/chosen": -0.7729798555374146,
445
- "rewards/margins": 0.6754422187805176,
446
- "rewards/rejected": -1.448421835899353,
447
  "step": 290
448
  },
449
  {
450
- "epoch": 0.63,
451
- "learning_rate": 8.668815171119019e-07,
452
- "logits/chosen": 0.11917382478713989,
453
- "logits/rejected": 0.2862890362739563,
454
- "logps/chosen": -396.6165466308594,
455
- "logps/rejected": -400.9344787597656,
456
- "loss": 0.5343,
457
- "rewards/accuracies": 0.75,
458
- "rewards/chosen": -0.8496394157409668,
459
- "rewards/margins": 0.5671547055244446,
460
- "rewards/rejected": -1.4167941808700562,
461
- "step": 300
462
- },
463
- {
464
- "epoch": 0.63,
465
- "eval_logits/chosen": 0.24486932158470154,
466
- "eval_logits/rejected": 0.2851215898990631,
467
- "eval_logps/chosen": -384.0199279785156,
468
- "eval_logps/rejected": -476.3627624511719,
469
- "eval_loss": 0.5315821766853333,
470
- "eval_rewards/accuracies": 0.765625,
471
- "eval_rewards/chosen": -0.8043524026870728,
472
- "eval_rewards/margins": 0.8430246114730835,
473
- "eval_rewards/rejected": -1.6473771333694458,
474
- "eval_runtime": 64.8898,
475
- "eval_samples_per_second": 30.822,
476
- "eval_steps_per_second": 0.493,
477
  "step": 300
478
  },
479
  {
480
- "epoch": 0.65,
481
- "learning_rate": 8.54200070884685e-07,
482
- "logits/chosen": 0.21563191711902618,
483
- "logits/rejected": 0.1952591836452484,
484
- "logps/chosen": -438.91552734375,
485
- "logps/rejected": -499.61395263671875,
486
- "loss": 0.5567,
487
- "rewards/accuracies": 0.699999988079071,
488
- "rewards/chosen": -1.173762559890747,
489
- "rewards/margins": 0.5995947122573853,
490
- "rewards/rejected": -1.7733571529388428,
491
  "step": 310
492
  },
493
  {
494
- "epoch": 0.67,
495
- "learning_rate": 8.410438087153911e-07,
496
- "logits/chosen": 0.031750187277793884,
497
- "logits/rejected": 0.14312420785427094,
498
- "logps/chosen": -457.0936584472656,
499
- "logps/rejected": -459.15411376953125,
500
- "loss": 0.5366,
501
- "rewards/accuracies": 0.612500011920929,
502
- "rewards/chosen": -1.0222933292388916,
503
- "rewards/margins": 0.4668423533439636,
504
- "rewards/rejected": -1.4891356229782104,
505
  "step": 320
506
  },
507
  {
508
- "epoch": 0.69,
509
- "learning_rate": 8.274303669726426e-07,
510
- "logits/chosen": 0.07745673507452011,
511
- "logits/rejected": 0.07082104682922363,
512
- "logps/chosen": -399.32464599609375,
513
- "logps/rejected": -523.4277954101562,
514
- "loss": 0.532,
515
- "rewards/accuracies": 0.7749999761581421,
516
- "rewards/chosen": -0.9549884796142578,
517
- "rewards/margins": 0.8862001299858093,
518
- "rewards/rejected": -1.8411887884140015,
519
  "step": 330
520
  },
521
  {
522
- "epoch": 0.71,
523
- "learning_rate": 8.133779948881513e-07,
524
- "logits/chosen": 0.10294970124959946,
525
- "logits/rejected": 0.09352216869592667,
526
- "logps/chosen": -409.33770751953125,
527
- "logps/rejected": -510.10089111328125,
528
- "loss": 0.5217,
529
- "rewards/accuracies": 0.7124999761581421,
530
- "rewards/chosen": -1.1990526914596558,
531
- "rewards/margins": 0.7596696615219116,
532
- "rewards/rejected": -1.958722472190857,
533
  "step": 340
534
  },
535
  {
536
- "epoch": 0.73,
537
- "learning_rate": 7.989055300930704e-07,
538
- "logits/chosen": 0.16737070679664612,
539
- "logits/rejected": 0.21901166439056396,
540
- "logps/chosen": -437.2059020996094,
541
- "logps/rejected": -538.3087158203125,
542
- "loss": 0.5091,
543
- "rewards/accuracies": 0.7437499761581421,
544
- "rewards/chosen": -1.2976617813110352,
545
- "rewards/margins": 0.8197441101074219,
546
- "rewards/rejected": -2.117405891418457,
547
  "step": 350
548
  },
549
  {
550
- "epoch": 0.75,
551
- "learning_rate": 7.840323733655778e-07,
552
- "logits/chosen": 0.1267612874507904,
553
- "logits/rejected": 0.20460394024848938,
554
- "logps/chosen": -465.1019592285156,
555
- "logps/rejected": -518.5805053710938,
556
- "loss": 0.5185,
557
- "rewards/accuracies": 0.7562500238418579,
558
- "rewards/chosen": -1.174440860748291,
559
- "rewards/margins": 0.9074214696884155,
560
- "rewards/rejected": -2.081862211227417,
561
  "step": 360
562
  },
563
  {
564
- "epoch": 0.77,
565
- "learning_rate": 7.687784626235447e-07,
566
- "logits/chosen": 0.18740633130073547,
567
- "logits/rejected": 0.27840983867645264,
568
- "logps/chosen": -468.21807861328125,
569
- "logps/rejected": -493.06646728515625,
570
- "loss": 0.5158,
571
- "rewards/accuracies": 0.675000011920929,
572
- "rewards/chosen": -1.1525232791900635,
573
- "rewards/margins": 0.7130603790283203,
574
- "rewards/rejected": -1.8655836582183838,
575
  "step": 370
576
  },
577
  {
578
- "epoch": 0.8,
579
- "learning_rate": 7.531642461971514e-07,
580
- "logits/chosen": 0.15459200739860535,
581
- "logits/rejected": 0.23283176124095917,
582
- "logps/chosen": -453.07904052734375,
583
- "logps/rejected": -535.761474609375,
584
- "loss": 0.5392,
585
- "rewards/accuracies": 0.7562500238418579,
586
- "rewards/chosen": -1.2102587223052979,
587
- "rewards/margins": 1.0239769220352173,
588
- "rewards/rejected": -2.2342355251312256,
589
  "step": 380
590
  },
591
  {
592
- "epoch": 0.82,
593
- "learning_rate": 7.372106554172801e-07,
594
- "logits/chosen": 0.19951777160167694,
595
- "logits/rejected": 0.2895793318748474,
596
- "logps/chosen": -411.4781799316406,
597
- "logps/rejected": -466.41253662109375,
598
- "loss": 0.5356,
599
- "rewards/accuracies": 0.7124999761581421,
600
- "rewards/chosen": -0.8807674646377563,
601
- "rewards/margins": 0.7255697846412659,
602
- "rewards/rejected": -1.606337308883667,
603
  "step": 390
604
  },
605
  {
606
- "epoch": 0.84,
607
- "learning_rate": 7.209390765564318e-07,
608
- "logits/chosen": 0.1757899820804596,
609
- "logits/rejected": 0.23791635036468506,
610
- "logps/chosen": -390.8957214355469,
611
- "logps/rejected": -478.49407958984375,
612
- "loss": 0.5323,
613
- "rewards/accuracies": 0.7437499761581421,
614
- "rewards/chosen": -0.9380720257759094,
615
- "rewards/margins": 0.7229949235916138,
616
- "rewards/rejected": -1.661067008972168,
617
- "step": 400
618
- },
619
- {
620
- "epoch": 0.84,
621
- "eval_logits/chosen": 0.25136542320251465,
622
- "eval_logits/rejected": 0.28342366218566895,
623
- "eval_logps/chosen": -394.2620849609375,
624
- "eval_logps/rejected": -494.46002197265625,
625
- "eval_loss": 0.521114706993103,
626
- "eval_rewards/accuracies": 0.78125,
627
- "eval_rewards/chosen": -0.90677410364151,
628
- "eval_rewards/margins": 0.9215754270553589,
629
- "eval_rewards/rejected": -1.8283497095108032,
630
- "eval_runtime": 65.2477,
631
- "eval_samples_per_second": 30.652,
632
- "eval_steps_per_second": 0.49,
633
  "step": 400
634
  },
635
  {
636
- "epoch": 0.86,
637
- "learning_rate": 7.043713221597773e-07,
638
- "logits/chosen": 0.1401471644639969,
639
- "logits/rejected": 0.17819848656654358,
640
- "logps/chosen": -401.8942565917969,
641
- "logps/rejected": -474.2579650878906,
642
- "loss": 0.5225,
643
- "rewards/accuracies": 0.7749999761581421,
644
- "rewards/chosen": -0.8892021179199219,
645
- "rewards/margins": 0.906479001045227,
646
- "rewards/rejected": -1.7956812381744385,
647
  "step": 410
648
  },
649
  {
650
- "epoch": 0.88,
651
- "learning_rate": 6.875296018047809e-07,
652
- "logits/chosen": 0.1304786652326584,
653
- "logits/rejected": 0.15569528937339783,
654
- "logps/chosen": -400.5438537597656,
655
- "logps/rejected": -468.2953186035156,
656
- "loss": 0.5065,
657
- "rewards/accuracies": 0.8187500238418579,
658
- "rewards/chosen": -0.9208317995071411,
659
- "rewards/margins": 0.9665653109550476,
660
- "rewards/rejected": -1.887397050857544,
661
  "step": 420
662
  },
663
  {
664
- "epoch": 0.9,
665
- "learning_rate": 6.704364923285857e-07,
666
- "logits/chosen": 0.09775003790855408,
667
- "logits/rejected": 0.1501173973083496,
668
- "logps/chosen": -476.73065185546875,
669
- "logps/rejected": -533.7389526367188,
670
- "loss": 0.5285,
671
- "rewards/accuracies": 0.71875,
672
- "rewards/chosen": -1.26286780834198,
673
- "rewards/margins": 0.8789494633674622,
674
- "rewards/rejected": -2.141817569732666,
675
  "step": 430
676
  },
677
  {
678
- "epoch": 0.92,
679
- "learning_rate": 6.531149075630796e-07,
680
- "logits/chosen": 0.1202029138803482,
681
- "logits/rejected": 0.1756385862827301,
682
- "logps/chosen": -400.1629333496094,
683
- "logps/rejected": -469.0196838378906,
684
- "loss": 0.5074,
685
- "rewards/accuracies": 0.6812499761581421,
686
- "rewards/chosen": -1.0582826137542725,
687
- "rewards/margins": 0.9171028137207031,
688
- "rewards/rejected": -1.975385308265686,
689
  "step": 440
690
  },
691
  {
692
- "epoch": 0.94,
693
- "learning_rate": 6.355880676182085e-07,
694
- "logits/chosen": 0.01758761703968048,
695
- "logits/rejected": 0.15642888844013214,
696
- "logps/chosen": -443.065185546875,
697
- "logps/rejected": -484.41473388671875,
698
- "loss": 0.5295,
699
- "rewards/accuracies": 0.6875,
700
- "rewards/chosen": -1.004433035850525,
701
- "rewards/margins": 0.8090255856513977,
702
- "rewards/rejected": -1.8134586811065674,
703
  "step": 450
704
  },
705
  {
706
- "epoch": 0.96,
707
- "learning_rate": 6.178794677547137e-07,
708
- "logits/chosen": 0.012063628062605858,
709
- "logits/rejected": 0.10572747141122818,
710
- "logps/chosen": -408.4104919433594,
711
- "logps/rejected": -449.46881103515625,
712
- "loss": 0.5267,
713
- "rewards/accuracies": 0.7437499761581421,
714
- "rewards/chosen": -1.0622318983078003,
715
- "rewards/margins": 0.6900007724761963,
716
- "rewards/rejected": -1.752232551574707,
717
  "step": 460
718
  },
719
  {
720
- "epoch": 0.98,
721
- "learning_rate": 6.000128468880222e-07,
722
- "logits/chosen": -0.05889149755239487,
723
- "logits/rejected": -0.014351313933730125,
724
- "logps/chosen": -449.07061767578125,
725
- "logps/rejected": -538.423828125,
726
- "loss": 0.5042,
727
- "rewards/accuracies": 0.762499988079071,
728
- "rewards/chosen": -1.1122030019760132,
729
- "rewards/margins": 0.9293983578681946,
730
- "rewards/rejected": -2.0416014194488525,
731
  "step": 470
732
  },
733
  {
734
- "epoch": 1.0,
735
- "learning_rate": 5.820121557655108e-07,
736
- "logits/chosen": -0.045923542231321335,
737
- "logits/rejected": 0.03086056187748909,
738
- "logps/chosen": -422.5347595214844,
739
- "logps/rejected": -529.2613525390625,
740
- "loss": 0.4682,
741
- "rewards/accuracies": 0.768750011920929,
742
- "rewards/chosen": -1.275895118713379,
743
- "rewards/margins": 1.037326455116272,
744
- "rewards/rejected": -2.3132214546203613,
745
  "step": 480
746
  },
747
  {
748
- "epoch": 1.03,
749
- "learning_rate": 5.639015248598023e-07,
750
- "logits/chosen": -0.05794327333569527,
751
- "logits/rejected": -0.09440571069717407,
752
- "logps/chosen": -441.08990478515625,
753
- "logps/rejected": -601.30419921875,
754
- "loss": 0.3493,
755
- "rewards/accuracies": 0.862500011920929,
756
- "rewards/chosen": -1.3565930128097534,
757
- "rewards/margins": 1.8090267181396484,
758
- "rewards/rejected": -3.1656198501586914,
759
  "step": 490
760
  },
761
  {
762
- "epoch": 1.05,
763
- "learning_rate": 5.457052320211339e-07,
764
- "logits/chosen": -0.23803594708442688,
765
- "logits/rejected": -0.29828980565071106,
766
- "logps/chosen": -474.3367614746094,
767
- "logps/rejected": -656.8623046875,
768
- "loss": 0.352,
769
- "rewards/accuracies": 0.8187500238418579,
770
- "rewards/chosen": -1.730020523071289,
771
- "rewards/margins": 1.8847625255584717,
772
- "rewards/rejected": -3.6147830486297607,
773
- "step": 500
774
- },
775
- {
776
- "epoch": 1.05,
777
- "eval_logits/chosen": -0.06535135954618454,
778
- "eval_logits/rejected": -0.08458372950553894,
779
- "eval_logps/chosen": -498.9117431640625,
780
- "eval_logps/rejected": -653.2899169921875,
781
- "eval_loss": 0.525809109210968,
782
- "eval_rewards/accuracies": 0.796875,
783
- "eval_rewards/chosen": -1.9532711505889893,
784
- "eval_rewards/margins": 1.4633771181106567,
785
- "eval_rewards/rejected": -3.4166483879089355,
786
- "eval_runtime": 64.5447,
787
- "eval_samples_per_second": 30.986,
788
- "eval_steps_per_second": 0.496,
789
  "step": 500
790
  },
791
  {
792
- "epoch": 1.07,
793
- "learning_rate": 5.274476699321637e-07,
794
- "logits/chosen": -0.17468394339084625,
795
- "logits/rejected": -0.19271844625473022,
796
- "logps/chosen": -464.8907165527344,
797
- "logps/rejected": -632.8477172851562,
798
- "loss": 0.3423,
799
- "rewards/accuracies": 0.893750011920929,
800
- "rewards/chosen": -1.695770263671875,
801
- "rewards/margins": 1.8958046436309814,
802
- "rewards/rejected": -3.5915749073028564,
803
  "step": 510
804
  },
805
  {
806
- "epoch": 1.09,
807
- "learning_rate": 5.091533134088387e-07,
808
- "logits/chosen": -0.23788562417030334,
809
- "logits/rejected": -0.20414999127388,
810
- "logps/chosen": -485.94415283203125,
811
- "logps/rejected": -640.9654541015625,
812
- "loss": 0.3236,
813
- "rewards/accuracies": 0.8999999761581421,
814
- "rewards/chosen": -1.8264538049697876,
815
- "rewards/margins": 1.7633371353149414,
816
- "rewards/rejected": -3.5897908210754395,
817
  "step": 520
818
  },
819
  {
820
- "epoch": 1.11,
821
- "learning_rate": 4.908466865911614e-07,
822
- "logits/chosen": -0.11999205499887466,
823
- "logits/rejected": -0.09423510730266571,
824
- "logps/chosen": -487.30810546875,
825
- "logps/rejected": -646.9547119140625,
826
- "loss": 0.3424,
827
- "rewards/accuracies": 0.84375,
828
- "rewards/chosen": -1.7824156284332275,
829
- "rewards/margins": 1.7626768350601196,
830
- "rewards/rejected": -3.5450921058654785,
831
  "step": 530
832
  },
833
  {
834
- "epoch": 1.13,
835
- "learning_rate": 4.7255233006783624e-07,
836
- "logits/chosen": -0.16903451085090637,
837
- "logits/rejected": -0.06715662032365799,
838
- "logps/chosen": -530.5585327148438,
839
- "logps/rejected": -645.1633911132812,
840
- "loss": 0.3301,
841
- "rewards/accuracies": 0.862500011920929,
842
- "rewards/chosen": -1.7870140075683594,
843
- "rewards/margins": 1.8271121978759766,
844
- "rewards/rejected": -3.614126682281494,
845
  "step": 540
846
  },
847
  {
848
- "epoch": 1.15,
849
- "learning_rate": 4.5429476797886617e-07,
850
- "logits/chosen": -0.03496643900871277,
851
- "logits/rejected": -0.04560618847608566,
852
- "logps/chosen": -483.15203857421875,
853
- "logps/rejected": -634.9447631835938,
854
- "loss": 0.3501,
855
- "rewards/accuracies": 0.8187500238418579,
856
- "rewards/chosen": -1.8454551696777344,
857
- "rewards/margins": 1.5336250066757202,
858
- "rewards/rejected": -3.379080295562744,
859
  "step": 550
860
  },
861
  {
862
- "epoch": 1.17,
863
- "learning_rate": 4.3609847514019763e-07,
864
- "logits/chosen": 0.06872721016407013,
865
- "logits/rejected": 0.0859331339597702,
866
- "logps/chosen": -506.5009765625,
867
- "logps/rejected": -646.7543334960938,
868
- "loss": 0.3359,
869
- "rewards/accuracies": 0.875,
870
- "rewards/chosen": -1.7461020946502686,
871
- "rewards/margins": 1.8654359579086304,
872
- "rewards/rejected": -3.6115379333496094,
873
  "step": 560
874
  },
875
  {
876
- "epoch": 1.19,
877
- "learning_rate": 4.179878442344892e-07,
878
- "logits/chosen": -0.04321649298071861,
879
- "logits/rejected": 0.051123034209012985,
880
- "logps/chosen": -496.735595703125,
881
- "logps/rejected": -647.29150390625,
882
- "loss": 0.3464,
883
- "rewards/accuracies": 0.8062499761581421,
884
- "rewards/chosen": -2.0714824199676514,
885
- "rewards/margins": 1.762717843055725,
886
- "rewards/rejected": -3.834200620651245,
887
  "step": 570
888
  },
889
  {
890
- "epoch": 1.21,
891
- "learning_rate": 3.9998715311197783e-07,
892
- "logits/chosen": -0.004042728338390589,
893
- "logits/rejected": 0.04070080816745758,
894
- "logps/chosen": -512.9417724609375,
895
- "logps/rejected": -696.821044921875,
896
- "loss": 0.3365,
897
- "rewards/accuracies": 0.862500011920929,
898
- "rewards/chosen": -2.097195863723755,
899
- "rewards/margins": 1.831916093826294,
900
- "rewards/rejected": -3.9291114807128906,
901
  "step": 580
902
  },
903
  {
904
- "epoch": 1.23,
905
- "learning_rate": 3.821205322452863e-07,
906
- "logits/chosen": 0.09215477854013443,
907
- "logits/rejected": 0.05737446993589401,
908
- "logps/chosen": -525.1434936523438,
909
- "logps/rejected": -705.9063110351562,
910
- "loss": 0.3158,
911
- "rewards/accuracies": 0.856249988079071,
912
- "rewards/chosen": -2.0721964836120605,
913
- "rewards/margins": 1.9721500873565674,
914
- "rewards/rejected": -4.044346809387207,
915
  "step": 590
916
  },
917
  {
918
- "epoch": 1.26,
919
- "learning_rate": 3.6441193238179146e-07,
920
- "logits/chosen": 0.07227401435375214,
921
- "logits/rejected": 0.03988388180732727,
922
- "logps/chosen": -549.4615478515625,
923
- "logps/rejected": -771.7275390625,
924
- "loss": 0.3342,
925
- "rewards/accuracies": 0.856249988079071,
926
- "rewards/chosen": -2.2555530071258545,
927
- "rewards/margins": 2.014453649520874,
928
- "rewards/rejected": -4.2700066566467285,
929
- "step": 600
930
- },
931
- {
932
- "epoch": 1.26,
933
- "eval_logits/chosen": 0.1344175487756729,
934
- "eval_logits/rejected": 0.11280365288257599,
935
- "eval_logps/chosen": -534.8101196289062,
936
- "eval_logps/rejected": -684.085693359375,
937
- "eval_loss": 0.5267595648765564,
938
- "eval_rewards/accuracies": 0.79296875,
939
- "eval_rewards/chosen": -2.3122546672821045,
940
- "eval_rewards/margins": 1.412351369857788,
941
- "eval_rewards/rejected": -3.7246060371398926,
942
- "eval_runtime": 64.3013,
943
- "eval_samples_per_second": 31.104,
944
- "eval_steps_per_second": 0.498,
945
  "step": 600
946
  },
947
  {
948
- "epoch": 1.28,
949
- "learning_rate": 3.4688509243692034e-07,
950
- "logits/chosen": 0.021421348676085472,
951
- "logits/rejected": 0.08866464346647263,
952
- "logps/chosen": -535.4236450195312,
953
- "logps/rejected": -694.8360595703125,
954
- "loss": 0.344,
955
- "rewards/accuracies": 0.8687499761581421,
956
- "rewards/chosen": -2.009089946746826,
957
- "rewards/margins": 1.8043378591537476,
958
- "rewards/rejected": -3.8134284019470215,
959
  "step": 610
960
  },
961
  {
962
- "epoch": 1.3,
963
- "learning_rate": 3.295635076714144e-07,
964
- "logits/chosen": -0.06299210339784622,
965
- "logits/rejected": -0.04097691923379898,
966
- "logps/chosen": -499.0265197753906,
967
- "logps/rejected": -653.1188354492188,
968
- "loss": 0.3317,
969
  "rewards/accuracies": 0.862500011920929,
970
- "rewards/chosen": -1.9393279552459717,
971
- "rewards/margins": 1.6952606439590454,
972
- "rewards/rejected": -3.6345887184143066,
973
  "step": 620
974
  },
975
  {
976
- "epoch": 1.32,
977
- "learning_rate": 3.12470398195219e-07,
978
- "logits/chosen": -0.013870243914425373,
979
- "logits/rejected": 0.0671583041548729,
980
- "logps/chosen": -512.1448974609375,
981
- "logps/rejected": -680.2833251953125,
982
- "loss": 0.3283,
983
- "rewards/accuracies": 0.887499988079071,
984
- "rewards/chosen": -1.9993736743927002,
985
- "rewards/margins": 1.9316644668579102,
986
- "rewards/rejected": -3.9310379028320312,
987
  "step": 630
988
  },
989
  {
990
- "epoch": 1.34,
991
- "learning_rate": 2.956286778402226e-07,
992
- "logits/chosen": -0.11906696856021881,
993
- "logits/rejected": -0.17396704852581024,
994
- "logps/chosen": -495.10345458984375,
995
- "logps/rejected": -677.45556640625,
996
- "loss": 0.319,
997
- "rewards/accuracies": 0.8812500238418579,
998
- "rewards/chosen": -2.2106266021728516,
999
- "rewards/margins": 1.9618394374847412,
1000
- "rewards/rejected": -4.172466278076172,
1001
  "step": 640
1002
  },
1003
  {
1004
- "epoch": 1.36,
1005
- "learning_rate": 2.7906092344356826e-07,
1006
- "logits/chosen": -0.07961982488632202,
1007
- "logits/rejected": -0.12522803246974945,
1008
- "logps/chosen": -516.12890625,
1009
- "logps/rejected": -696.549560546875,
1010
- "loss": 0.3233,
1011
- "rewards/accuracies": 0.8374999761581421,
1012
- "rewards/chosen": -2.1828126907348633,
1013
- "rewards/margins": 1.8567641973495483,
1014
- "rewards/rejected": -4.039577007293701,
1015
  "step": 650
1016
  },
1017
  {
1018
- "epoch": 1.38,
1019
- "learning_rate": 2.6278934458271996e-07,
1020
- "logits/chosen": -0.15696656703948975,
1021
- "logits/rejected": -0.12818947434425354,
1022
- "logps/chosen": -568.789794921875,
1023
- "logps/rejected": -722.9078369140625,
1024
- "loss": 0.3273,
1025
- "rewards/accuracies": 0.862500011920929,
1026
- "rewards/chosen": -2.1225264072418213,
1027
- "rewards/margins": 1.9797760248184204,
1028
- "rewards/rejected": -4.102302551269531,
1029
  "step": 660
1030
  },
1031
  {
1032
- "epoch": 1.4,
1033
- "learning_rate": 2.468357538028487e-07,
1034
- "logits/chosen": 0.01013887207955122,
1035
- "logits/rejected": 0.029213298112154007,
1036
- "logps/chosen": -541.7221069335938,
1037
- "logps/rejected": -720.105712890625,
1038
- "loss": 0.3153,
1039
  "rewards/accuracies": 0.856249988079071,
1040
- "rewards/chosen": -2.205817461013794,
1041
- "rewards/margins": 2.0757954120635986,
1042
- "rewards/rejected": -4.281612396240234,
1043
  "step": 670
1044
  },
1045
  {
1046
- "epoch": 1.42,
1047
- "learning_rate": 2.312215373764551e-07,
1048
- "logits/chosen": -0.06487278640270233,
1049
- "logits/rejected": -0.01965305209159851,
1050
- "logps/chosen": -516.360107421875,
1051
- "logps/rejected": -694.418212890625,
1052
- "loss": 0.3262,
1053
- "rewards/accuracies": 0.856249988079071,
1054
- "rewards/chosen": -2.184447765350342,
1055
- "rewards/margins": 1.861135721206665,
1056
- "rewards/rejected": -4.045583248138428,
1057
  "step": 680
1058
  },
1059
  {
1060
- "epoch": 1.44,
1061
- "learning_rate": 2.1596762663442213e-07,
1062
- "logits/chosen": -0.16129662096500397,
1063
- "logits/rejected": -0.09581325948238373,
1064
- "logps/chosen": -546.6666259765625,
1065
- "logps/rejected": -722.95263671875,
1066
- "loss": 0.3194,
1067
- "rewards/accuracies": 0.8500000238418579,
1068
- "rewards/chosen": -2.1821353435516357,
1069
- "rewards/margins": 2.131826877593994,
1070
- "rewards/rejected": -4.313961982727051,
1071
  "step": 690
1072
  },
1073
  {
1074
- "epoch": 1.47,
1075
- "learning_rate": 2.0109446990692963e-07,
1076
- "logits/chosen": -0.11689990758895874,
1077
- "logits/rejected": -0.2061731368303299,
1078
- "logps/chosen": -537.8167724609375,
1079
- "logps/rejected": -781.1466674804688,
1080
- "loss": 0.337,
1081
- "rewards/accuracies": 0.8812500238418579,
1082
- "rewards/chosen": -2.2769298553466797,
1083
- "rewards/margins": 2.1021370887756348,
1084
- "rewards/rejected": -4.3790669441223145,
1085
- "step": 700
1086
- },
1087
- {
1088
- "epoch": 1.47,
1089
- "eval_logits/chosen": 0.041396014392375946,
1090
- "eval_logits/rejected": 0.009947247803211212,
1091
- "eval_logps/chosen": -541.1116333007812,
1092
- "eval_logps/rejected": -699.990966796875,
1093
- "eval_loss": 0.5290427207946777,
1094
- "eval_rewards/accuracies": 0.77734375,
1095
- "eval_rewards/chosen": -2.375269651412964,
1096
- "eval_rewards/margins": 1.5083887577056885,
1097
- "eval_rewards/rejected": -3.883657932281494,
1098
- "eval_runtime": 64.3756,
1099
- "eval_samples_per_second": 31.068,
1100
- "eval_steps_per_second": 0.497,
1101
  "step": 700
1102
  },
1103
  {
1104
- "epoch": 1.49,
1105
- "learning_rate": 1.8662200511184872e-07,
1106
- "logits/chosen": -0.05749096721410751,
1107
- "logits/rejected": -0.05046076700091362,
1108
- "logps/chosen": -522.4590454101562,
1109
- "logps/rejected": -701.3648071289062,
1110
- "loss": 0.3387,
1111
- "rewards/accuracies": 0.8812500238418579,
1112
- "rewards/chosen": -2.1689815521240234,
1113
- "rewards/margins": 2.023318290710449,
1114
- "rewards/rejected": -4.192299842834473,
1115
- "step": 710
1116
- },
1117
- {
1118
- "epoch": 1.51,
1119
- "learning_rate": 1.725696330273575e-07,
1120
- "logits/chosen": -0.12996384501457214,
1121
- "logits/rejected": -0.20302283763885498,
1122
- "logps/chosen": -513.8458251953125,
1123
- "logps/rejected": -704.156982421875,
1124
- "loss": 0.3363,
1125
- "rewards/accuracies": 0.887499988079071,
1126
- "rewards/chosen": -1.9350038766860962,
1127
- "rewards/margins": 2.0845541954040527,
1128
- "rewards/rejected": -4.019558429718018,
1129
- "step": 720
1130
- },
1131
- {
1132
- "epoch": 1.53,
1133
- "learning_rate": 1.589561912846089e-07,
1134
- "logits/chosen": -0.02087187021970749,
1135
- "logits/rejected": -0.01297883689403534,
1136
- "logps/chosen": -513.4478149414062,
1137
- "logps/rejected": -715.5861206054688,
1138
- "loss": 0.3311,
1139
- "rewards/accuracies": 0.8812500238418579,
1140
- "rewards/chosen": -2.01027512550354,
1141
- "rewards/margins": 1.9698785543441772,
1142
- "rewards/rejected": -3.9801535606384277,
1143
- "step": 730
1144
- },
1145
- {
1146
- "epoch": 1.55,
1147
- "learning_rate": 1.4579992911531496e-07,
1148
- "logits/chosen": -0.08378951251506805,
1149
- "logits/rejected": -0.10333013534545898,
1150
- "logps/chosen": -528.8599853515625,
1151
- "logps/rejected": -697.0358276367188,
1152
- "loss": 0.31,
1153
- "rewards/accuracies": 0.887499988079071,
1154
- "rewards/chosen": -2.1352083683013916,
1155
- "rewards/margins": 2.004941940307617,
1156
- "rewards/rejected": -4.140150547027588,
1157
- "step": 740
1158
- },
1159
- {
1160
- "epoch": 1.57,
1161
- "learning_rate": 1.3311848288809813e-07,
1162
- "logits/chosen": -0.021279722452163696,
1163
- "logits/rejected": -0.07772192358970642,
1164
- "logps/chosen": -553.6995239257812,
1165
- "logps/rejected": -696.1124267578125,
1166
- "loss": 0.3325,
1167
- "rewards/accuracies": 0.856249988079071,
1168
- "rewards/chosen": -2.2102222442626953,
1169
- "rewards/margins": 1.8731515407562256,
1170
- "rewards/rejected": -4.083374500274658,
1171
- "step": 750
1172
- },
1173
- {
1174
- "epoch": 1.59,
1175
- "learning_rate": 1.209288524664029e-07,
1176
- "logits/chosen": -0.1553444117307663,
1177
- "logits/rejected": -0.07370997965335846,
1178
- "logps/chosen": -559.6234130859375,
1179
- "logps/rejected": -718.3536376953125,
1180
- "loss": 0.3318,
1181
- "rewards/accuracies": 0.84375,
1182
- "rewards/chosen": -2.2175405025482178,
1183
- "rewards/margins": 2.087991714477539,
1184
- "rewards/rejected": -4.305531978607178,
1185
- "step": 760
1186
- },
1187
- {
1188
- "epoch": 1.61,
1189
- "learning_rate": 1.0924737841966497e-07,
1190
- "logits/chosen": 0.06289811432361603,
1191
- "logits/rejected": 0.08451451361179352,
1192
- "logps/chosen": -502.3817443847656,
1193
- "logps/rejected": -655.4021606445312,
1194
- "loss": 0.3065,
1195
- "rewards/accuracies": 0.856249988079071,
1196
- "rewards/chosen": -2.215487241744995,
1197
- "rewards/margins": 1.8723819255828857,
1198
- "rewards/rejected": -4.087869644165039,
1199
- "step": 770
1200
- },
1201
- {
1202
- "epoch": 1.63,
1203
- "learning_rate": 9.808972011828054e-08,
1204
- "logits/chosen": -0.028034457936882973,
1205
- "logits/rejected": -0.07584713399410248,
1206
- "logps/chosen": -507.1363220214844,
1207
- "logps/rejected": -684.7667236328125,
1208
- "loss": 0.321,
1209
- "rewards/accuracies": 0.831250011920929,
1210
- "rewards/chosen": -2.1353392601013184,
1211
- "rewards/margins": 1.854984998703003,
1212
- "rewards/rejected": -3.9903244972229004,
1213
- "step": 780
1214
- },
1215
- {
1216
- "epoch": 1.65,
1217
- "learning_rate": 8.747083474174527e-08,
1218
- "logits/chosen": -0.027467548847198486,
1219
- "logits/rejected": -0.11992067098617554,
1220
- "logps/chosen": -513.2483520507812,
1221
- "logps/rejected": -773.5486450195312,
1222
- "loss": 0.3106,
1223
- "rewards/accuracies": 0.9125000238418579,
1224
- "rewards/chosen": -2.409447431564331,
1225
- "rewards/margins": 2.4787163734436035,
1226
- "rewards/rejected": -4.8881635665893555,
1227
- "step": 790
1228
- },
1229
- {
1230
- "epoch": 1.67,
1231
- "learning_rate": 7.740495722810269e-08,
1232
- "logits/chosen": -0.22111931443214417,
1233
- "logits/rejected": -0.16927292943000793,
1234
- "logps/chosen": -594.322998046875,
1235
- "logps/rejected": -767.4426879882812,
1236
- "loss": 0.3398,
1237
- "rewards/accuracies": 0.84375,
1238
- "rewards/chosen": -2.4693970680236816,
1239
- "rewards/margins": 2.0875487327575684,
1240
- "rewards/rejected": -4.55694580078125,
1241
- "step": 800
1242
- },
1243
- {
1244
- "epoch": 1.67,
1245
- "eval_logits/chosen": 0.0749908834695816,
1246
- "eval_logits/rejected": 0.0380852147936821,
1247
- "eval_logps/chosen": -554.5546264648438,
1248
- "eval_logps/rejected": -712.9505615234375,
1249
- "eval_loss": 0.5297122001647949,
1250
- "eval_rewards/accuracies": 0.7734375,
1251
- "eval_rewards/chosen": -2.509699821472168,
1252
- "eval_rewards/margins": 1.5035548210144043,
1253
- "eval_rewards/rejected": -4.013254642486572,
1254
- "eval_runtime": 65.3757,
1255
- "eval_samples_per_second": 30.592,
1256
- "eval_steps_per_second": 0.489,
1257
- "step": 800
1258
- },
1259
- {
1260
- "epoch": 1.7,
1261
- "learning_rate": 6.790558119157597e-08,
1262
- "logits/chosen": -0.03027234971523285,
1263
- "logits/rejected": -0.026968132704496384,
1264
- "logps/chosen": -558.7872924804688,
1265
- "logps/rejected": -744.04736328125,
1266
- "loss": 0.3134,
1267
- "rewards/accuracies": 0.8812500238418579,
1268
- "rewards/chosen": -2.323040246963501,
1269
- "rewards/margins": 2.1539111137390137,
1270
- "rewards/rejected": -4.476951599121094,
1271
- "step": 810
1272
- },
1273
- {
1274
- "epoch": 1.72,
1275
- "learning_rate": 5.898544083397e-08,
1276
- "logits/chosen": -0.06089891865849495,
1277
- "logits/rejected": -0.11520856618881226,
1278
- "logps/chosen": -516.7492065429688,
1279
- "logps/rejected": -698.2511596679688,
1280
- "loss": 0.3131,
1281
- "rewards/accuracies": 0.862500011920929,
1282
- "rewards/chosen": -2.2953174114227295,
1283
- "rewards/margins": 2.0010485649108887,
1284
- "rewards/rejected": -4.296365737915039,
1285
- "step": 820
1286
- },
1287
- {
1288
- "epoch": 1.74,
1289
- "learning_rate": 5.065649387408705e-08,
1290
- "logits/chosen": -0.026049736887216568,
1291
- "logits/rejected": -0.11464808881282806,
1292
- "logps/chosen": -556.2135620117188,
1293
- "logps/rejected": -718.17578125,
1294
- "loss": 0.3145,
1295
- "rewards/accuracies": 0.8687499761581421,
1296
- "rewards/chosen": -2.405728816986084,
1297
- "rewards/margins": 1.908630132675171,
1298
- "rewards/rejected": -4.314358711242676,
1299
- "step": 830
1300
- },
1301
- {
1302
- "epoch": 1.76,
1303
- "learning_rate": 4.292990551804171e-08,
1304
- "logits/chosen": -0.12712730467319489,
1305
- "logits/rejected": -0.10675887763500214,
1306
- "logps/chosen": -529.3737182617188,
1307
- "logps/rejected": -723.229248046875,
1308
- "loss": 0.327,
1309
- "rewards/accuracies": 0.8500000238418579,
1310
- "rewards/chosen": -2.146822929382324,
1311
- "rewards/margins": 2.177410125732422,
1312
- "rewards/rejected": -4.324233055114746,
1313
- "step": 840
1314
- },
1315
- {
1316
- "epoch": 1.78,
1317
- "learning_rate": 3.581603349196371e-08,
1318
- "logits/chosen": 0.01494809053838253,
1319
- "logits/rejected": -0.08690011501312256,
1320
- "logps/chosen": -540.9631958007812,
1321
- "logps/rejected": -735.3316650390625,
1322
- "loss": 0.3262,
1323
- "rewards/accuracies": 0.918749988079071,
1324
- "rewards/chosen": -2.145940065383911,
1325
- "rewards/margins": 2.0986034870147705,
1326
- "rewards/rejected": -4.24454402923584,
1327
- "step": 850
1328
- },
1329
- {
1330
- "epoch": 1.8,
1331
- "learning_rate": 2.9324414157151367e-08,
1332
- "logits/chosen": -0.10003119707107544,
1333
- "logits/rejected": -0.13639459013938904,
1334
- "logps/chosen": -504.34552001953125,
1335
- "logps/rejected": -681.4578247070312,
1336
- "loss": 0.3261,
1337
- "rewards/accuracies": 0.8812500238418579,
1338
- "rewards/chosen": -2.0967519283294678,
1339
- "rewards/margins": 1.8127696514129639,
1340
- "rewards/rejected": -3.9095215797424316,
1341
- "step": 860
1342
- },
1343
- {
1344
- "epoch": 1.82,
1345
- "learning_rate": 2.3463749726290284e-08,
1346
- "logits/chosen": -0.005720620043575764,
1347
- "logits/rejected": -0.09044505655765533,
1348
- "logps/chosen": -550.0,
1349
- "logps/rejected": -727.3507690429688,
1350
- "loss": 0.3221,
1351
- "rewards/accuracies": 0.862500011920929,
1352
- "rewards/chosen": -2.1374850273132324,
1353
- "rewards/margins": 2.060269355773926,
1354
- "rewards/rejected": -4.197754859924316,
1355
- "step": 870
1356
- },
1357
- {
1358
- "epoch": 1.84,
1359
- "learning_rate": 1.824189659787284e-08,
1360
- "logits/chosen": -0.07426755130290985,
1361
- "logits/rejected": -0.09251859039068222,
1362
- "logps/chosen": -551.6723022460938,
1363
- "logps/rejected": -753.9895629882812,
1364
- "loss": 0.3162,
1365
- "rewards/accuracies": 0.8812500238418579,
1366
- "rewards/chosen": -2.1559104919433594,
1367
- "rewards/margins": 2.1075100898742676,
1368
- "rewards/rejected": -4.263420581817627,
1369
- "step": 880
1370
- },
1371
- {
1372
- "epoch": 1.86,
1373
- "learning_rate": 1.3665854824458035e-08,
1374
- "logits/chosen": -0.05439913272857666,
1375
- "logits/rejected": -0.06511974334716797,
1376
- "logps/chosen": -540.4793090820312,
1377
- "logps/rejected": -690.4094848632812,
1378
- "loss": 0.3265,
1379
- "rewards/accuracies": 0.8500000238418579,
1380
- "rewards/chosen": -2.1754844188690186,
1381
- "rewards/margins": 1.7633205652236938,
1382
- "rewards/rejected": -3.938805103302002,
1383
- "step": 890
1384
- },
1385
- {
1386
- "epoch": 1.88,
1387
- "learning_rate": 9.741758728888217e-09,
1388
- "logits/chosen": -0.06001782417297363,
1389
- "logits/rejected": -0.016363339498639107,
1390
- "logps/chosen": -546.7725830078125,
1391
- "logps/rejected": -690.0701293945312,
1392
- "loss": 0.307,
1393
- "rewards/accuracies": 0.8812500238418579,
1394
- "rewards/chosen": -2.229912281036377,
1395
- "rewards/margins": 1.9347511529922485,
1396
- "rewards/rejected": -4.164663314819336,
1397
- "step": 900
1398
- },
1399
- {
1400
- "epoch": 1.88,
1401
- "eval_logits/chosen": 0.06628188490867615,
1402
- "eval_logits/rejected": 0.02886618673801422,
1403
- "eval_logps/chosen": -549.4910278320312,
1404
- "eval_logps/rejected": -703.8400268554688,
1405
- "eval_loss": 0.5260834097862244,
1406
- "eval_rewards/accuracies": 0.77734375,
1407
- "eval_rewards/chosen": -2.4590635299682617,
1408
- "eval_rewards/margins": 1.4630858898162842,
1409
- "eval_rewards/rejected": -3.922149658203125,
1410
- "eval_runtime": 65.2404,
1411
- "eval_samples_per_second": 30.656,
1412
- "eval_steps_per_second": 0.49,
1413
- "step": 900
1414
- },
1415
- {
1416
- "epoch": 1.9,
1417
- "learning_rate": 6.474868681043577e-09,
1418
- "logits/chosen": -0.0006331875920295715,
1419
- "logits/rejected": 0.06078674644231796,
1420
- "logps/chosen": -526.890869140625,
1421
- "logps/rejected": -696.57080078125,
1422
- "loss": 0.3133,
1423
  "rewards/accuracies": 0.84375,
1424
- "rewards/chosen": -2.2198734283447266,
1425
- "rewards/margins": 1.9986340999603271,
1426
- "rewards/rejected": -4.218507289886475,
1427
- "step": 910
1428
- },
1429
- {
1430
- "epoch": 1.93,
1431
- "learning_rate": 3.869564046156459e-09,
1432
- "logits/chosen": -0.17484715580940247,
1433
- "logits/rejected": -0.2491791695356369,
1434
- "logps/chosen": -560.9608154296875,
1435
- "logps/rejected": -718.45703125,
1436
- "loss": 0.3088,
1437
- "rewards/accuracies": 0.9125000238418579,
1438
- "rewards/chosen": -2.2493414878845215,
1439
- "rewards/margins": 2.0530190467834473,
1440
- "rewards/rejected": -4.3023600578308105,
1441
- "step": 920
1442
- },
1443
- {
1444
- "epoch": 1.95,
1445
- "learning_rate": 1.929337314139412e-09,
1446
- "logits/chosen": -0.0817941427230835,
1447
- "logits/rejected": -0.2032664567232132,
1448
- "logps/chosen": -562.2503662109375,
1449
- "logps/rejected": -765.2064208984375,
1450
- "loss": 0.3264,
1451
- "rewards/accuracies": 0.8812500238418579,
1452
- "rewards/chosen": -2.2634310722351074,
1453
- "rewards/margins": 2.2729239463806152,
1454
- "rewards/rejected": -4.536355018615723,
1455
- "step": 930
1456
- },
1457
- {
1458
- "epoch": 1.97,
1459
- "learning_rate": 6.567894177967325e-10,
1460
- "logits/chosen": -0.061837755143642426,
1461
- "logits/rejected": -0.09424273669719696,
1462
- "logps/chosen": -570.0264892578125,
1463
- "logps/rejected": -782.4632568359375,
1464
- "loss": 0.3155,
1465
- "rewards/accuracies": 0.8999999761581421,
1466
- "rewards/chosen": -2.259819984436035,
1467
- "rewards/margins": 2.1893467903137207,
1468
- "rewards/rejected": -4.449166297912598,
1469
- "step": 940
1470
- },
1471
- {
1472
- "epoch": 1.99,
1473
- "learning_rate": 5.3626246194704575e-11,
1474
- "logits/chosen": 0.019718965515494347,
1475
- "logits/rejected": -0.05742845684289932,
1476
- "logps/chosen": -491.19879150390625,
1477
- "logps/rejected": -698.8577880859375,
1478
- "loss": 0.3247,
1479
- "rewards/accuracies": 0.8687499761581421,
1480
- "rewards/chosen": -2.148132801055908,
1481
- "rewards/margins": 2.056044816970825,
1482
- "rewards/rejected": -4.204176902770996,
1483
- "step": 950
1484
  },
1485
  {
1486
  "epoch": 2.0,
1487
- "step": 954,
1488
  "total_flos": 0.0,
1489
- "train_loss": 0.44779854000739333,
1490
- "train_runtime": 8782.9823,
1491
- "train_samples_per_second": 13.921,
1492
- "train_steps_per_second": 0.109
1493
  }
1494
  ],
1495
  "logging_steps": 10,
1496
- "max_steps": 954,
1497
  "num_train_epochs": 2,
1498
- "save_steps": 1000,
1499
  "total_flos": 0.0,
1500
  "trial_name": null,
1501
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9950825430277486,
5
+ "eval_steps": 10000,
6
+ "global_step": 710,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 1.4084507042253522e-07,
14
+ "logits/chosen": 0.0005512732896022499,
15
+ "logits/rejected": 0.0318959541618824,
16
+ "logps/chosen": -259.0977478027344,
17
+ "logps/rejected": -201.09397888183594,
18
+ "loss": 0.6931,
19
+ "rewards/accuracies": 0.3687500059604645,
20
+ "rewards/chosen": 0.00018751960305962712,
21
+ "rewards/margins": 0.0003801170678343624,
22
+ "rewards/rejected": -0.00019259755208622664,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.06,
27
+ "learning_rate": 2.8169014084507043e-07,
28
+ "logits/chosen": -0.07602560520172119,
29
+ "logits/rejected": -0.010130566544830799,
30
+ "logps/chosen": -326.53594970703125,
31
+ "logps/rejected": -200.25064086914062,
32
+ "loss": 0.6915,
33
+ "rewards/accuracies": 0.5249999761581421,
34
+ "rewards/chosen": 0.0007010429399088025,
35
+ "rewards/margins": 0.002572448458522558,
36
+ "rewards/rejected": -0.0018714054021984339,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.08,
41
+ "learning_rate": 4.225352112676056e-07,
42
+ "logits/chosen": -0.033220209181308746,
43
+ "logits/rejected": -0.0025589261204004288,
44
+ "logps/chosen": -334.3651123046875,
45
+ "logps/rejected": -227.4903564453125,
46
+ "loss": 0.6856,
47
+ "rewards/accuracies": 0.59375,
48
+ "rewards/chosen": 0.0007741742883808911,
49
+ "rewards/margins": 0.014154395088553429,
50
+ "rewards/rejected": -0.013380222022533417,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.11,
55
+ "learning_rate": 5.633802816901409e-07,
56
+ "logits/chosen": -0.004470109473913908,
57
+ "logits/rejected": 0.04596555978059769,
58
+ "logps/chosen": -307.7451171875,
59
+ "logps/rejected": -191.01101684570312,
60
+ "loss": 0.6758,
61
+ "rewards/accuracies": 0.550000011920929,
62
+ "rewards/chosen": 0.0008303679642267525,
63
+ "rewards/margins": 0.05119556933641434,
64
+ "rewards/rejected": -0.05036519095301628,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.14,
69
+ "learning_rate": 7.04225352112676e-07,
70
+ "logits/chosen": 0.06877349317073822,
71
+ "logits/rejected": 0.1612742692232132,
72
+ "logps/chosen": -343.5093994140625,
73
+ "logps/rejected": -217.77890014648438,
74
+ "loss": 0.663,
75
+ "rewards/accuracies": 0.574999988079071,
76
+ "rewards/chosen": -0.016251932829618454,
77
+ "rewards/margins": 0.08627420663833618,
78
+ "rewards/rejected": -0.10252614319324493,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.17,
83
+ "learning_rate": 8.450704225352112e-07,
84
+ "logits/chosen": 0.004598864819854498,
85
+ "logits/rejected": 0.026440221816301346,
86
+ "logps/chosen": -311.1943359375,
87
+ "logps/rejected": -236.7565155029297,
88
+ "loss": 0.6591,
89
+ "rewards/accuracies": 0.59375,
90
+ "rewards/chosen": -0.07662348449230194,
91
+ "rewards/margins": 0.09086361527442932,
92
+ "rewards/rejected": -0.16748711466789246,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.2,
97
+ "learning_rate": 9.859154929577465e-07,
98
+ "logits/chosen": -0.02923472784459591,
99
+ "logits/rejected": 0.041188597679138184,
100
+ "logps/chosen": -317.4526672363281,
101
+ "logps/rejected": -232.837646484375,
102
+ "loss": 0.6632,
103
+ "rewards/accuracies": 0.574999988079071,
104
+ "rewards/chosen": -0.04823857173323631,
105
+ "rewards/margins": 0.1199621707201004,
106
+ "rewards/rejected": -0.16820073127746582,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.22,
111
+ "learning_rate": 9.995106132599868e-07,
112
+ "logits/chosen": -0.010874450206756592,
113
+ "logits/rejected": 0.04368402436375618,
114
+ "logps/chosen": -339.08502197265625,
115
+ "logps/rejected": -266.75567626953125,
116
+ "loss": 0.654,
117
+ "rewards/accuracies": 0.6000000238418579,
118
+ "rewards/chosen": -0.007924405857920647,
119
+ "rewards/margins": 0.12536732852458954,
120
+ "rewards/rejected": -0.13329175114631653,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.25,
125
+ "learning_rate": 9.978201358980644e-07,
126
+ "logits/chosen": 0.05211018770933151,
127
+ "logits/rejected": 0.09429686516523361,
128
+ "logps/chosen": -314.8374938964844,
129
+ "logps/rejected": -219.6160888671875,
130
+ "loss": 0.6486,
131
+ "rewards/accuracies": 0.5874999761581421,
132
+ "rewards/chosen": 0.05235903337597847,
133
+ "rewards/margins": 0.1537412703037262,
134
+ "rewards/rejected": -0.10138221830129623,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.28,
139
+ "learning_rate": 9.949266103908894e-07,
140
+ "logits/chosen": -0.01588534563779831,
141
+ "logits/rejected": 0.06498686969280243,
142
+ "logps/chosen": -297.93145751953125,
143
+ "logps/rejected": -223.25234985351562,
144
+ "loss": 0.6594,
145
+ "rewards/accuracies": 0.5562499761581421,
146
+ "rewards/chosen": -0.013728643767535686,
147
+ "rewards/margins": 0.11913089454174042,
148
+ "rewards/rejected": -0.13285955786705017,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.31,
153
+ "learning_rate": 9.908370293252287e-07,
154
+ "logits/chosen": -0.09032365679740906,
155
+ "logits/rejected": -0.06336920708417892,
156
+ "logps/chosen": -279.67047119140625,
157
+ "logps/rejected": -207.52157592773438,
158
+ "loss": 0.6463,
159
+ "rewards/accuracies": 0.6312500238418579,
160
+ "rewards/chosen": 0.00655151903629303,
161
+ "rewards/margins": 0.1867600381374359,
162
+ "rewards/rejected": -0.1802085041999817,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.34,
167
+ "learning_rate": 9.855612757141654e-07,
168
+ "logits/chosen": -0.06192486360669136,
169
+ "logits/rejected": 0.024322770535945892,
170
+ "logps/chosen": -364.93701171875,
171
+ "logps/rejected": -231.4254608154297,
172
+ "loss": 0.6401,
173
+ "rewards/accuracies": 0.637499988079071,
174
+ "rewards/chosen": 0.04051176831126213,
175
+ "rewards/margins": 0.2519943118095398,
176
+ "rewards/rejected": -0.21148256957530975,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.37,
181
+ "learning_rate": 9.791120991134902e-07,
182
+ "logits/chosen": -0.11360426992177963,
183
+ "logits/rejected": -0.06546725332736969,
184
+ "logps/chosen": -313.6669006347656,
185
+ "logps/rejected": -237.9274139404297,
186
+ "loss": 0.6368,
187
+ "rewards/accuracies": 0.6000000238418579,
188
+ "rewards/chosen": -0.0725397914648056,
189
+ "rewards/margins": 0.16836170852184296,
190
+ "rewards/rejected": -0.24090149998664856,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.39,
195
+ "learning_rate": 9.715050848107168e-07,
196
+ "logits/chosen": -0.04263811185956001,
197
+ "logits/rejected": 0.01193575281649828,
198
+ "logps/chosen": -330.8939514160156,
199
+ "logps/rejected": -221.0767822265625,
200
+ "loss": 0.6248,
201
+ "rewards/accuracies": 0.6812499761581421,
202
+ "rewards/chosen": -0.009152719751000404,
203
+ "rewards/margins": 0.2825842797756195,
204
+ "rewards/rejected": -0.29173702001571655,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.42,
209
+ "learning_rate": 9.627586161611731e-07,
210
+ "logits/chosen": -0.13610024750232697,
211
+ "logits/rejected": -0.060576580464839935,
212
+ "logps/chosen": -310.90887451171875,
213
+ "logps/rejected": -219.16915893554688,
214
+ "loss": 0.6424,
215
+ "rewards/accuracies": 0.65625,
216
+ "rewards/chosen": -0.08018601685762405,
217
+ "rewards/margins": 0.2522713243961334,
218
+ "rewards/rejected": -0.3324573338031769,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.45,
223
+ "learning_rate": 9.528938301621955e-07,
224
+ "logits/chosen": -0.17108501493930817,
225
+ "logits/rejected": -0.06673868000507355,
226
+ "logps/chosen": -350.9739685058594,
227
+ "logps/rejected": -277.810791015625,
228
+ "loss": 0.6291,
229
+ "rewards/accuracies": 0.606249988079071,
230
+ "rewards/chosen": -0.09369239956140518,
231
+ "rewards/margins": 0.18727482855319977,
232
+ "rewards/rejected": -0.28096717596054077,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.48,
237
+ "learning_rate": 9.419345663727804e-07,
238
+ "logits/chosen": -0.0947573184967041,
239
+ "logits/rejected": -0.03230197727680206,
240
+ "logps/chosen": -347.60308837890625,
241
+ "logps/rejected": -229.42044067382812,
242
+ "loss": 0.6348,
243
+ "rewards/accuracies": 0.668749988079071,
244
+ "rewards/chosen": -0.09307502955198288,
245
+ "rewards/margins": 0.28278011083602905,
246
+ "rewards/rejected": -0.37585514783859253,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.51,
251
+ "learning_rate": 9.299073093021404e-07,
252
+ "logits/chosen": -0.08774133771657944,
253
+ "logits/rejected": -0.022734731435775757,
254
+ "logps/chosen": -342.45574951171875,
255
+ "logps/rejected": -227.8816375732422,
256
+ "loss": 0.6219,
257
+ "rewards/accuracies": 0.6000000238418579,
258
+ "rewards/chosen": -0.10549932718276978,
259
+ "rewards/margins": 0.2989550232887268,
260
+ "rewards/rejected": -0.40445438027381897,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.53,
265
+ "learning_rate": 9.168411244063861e-07,
266
+ "logits/chosen": -0.1278824657201767,
267
+ "logits/rejected": -0.05893976613879204,
268
+ "logps/chosen": -350.15997314453125,
269
+ "logps/rejected": -241.92715454101562,
270
+ "loss": 0.6348,
271
+ "rewards/accuracies": 0.6937500238418579,
272
+ "rewards/chosen": -0.10131518542766571,
273
+ "rewards/margins": 0.3444809019565582,
274
+ "rewards/rejected": -0.44579607248306274,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.56,
279
+ "learning_rate": 9.02767587848013e-07,
280
+ "logits/chosen": -0.09527161717414856,
281
+ "logits/rejected": -0.06935124844312668,
282
+ "logps/chosen": -298.73040771484375,
283
+ "logps/rejected": -257.57305908203125,
284
+ "loss": 0.6272,
285
+ "rewards/accuracies": 0.5562499761581421,
286
+ "rewards/chosen": -0.16879644989967346,
287
+ "rewards/margins": 0.16612185537815094,
288
+ "rewards/rejected": -0.3349182903766632,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.59,
293
+ "learning_rate": 8.877207101879301e-07,
294
+ "logits/chosen": -0.13824552297592163,
295
+ "logits/rejected": -0.0595315583050251,
296
+ "logps/chosen": -340.5955810546875,
297
+ "logps/rejected": -261.76763916015625,
298
+ "loss": 0.6222,
299
+ "rewards/accuracies": 0.6625000238418579,
300
+ "rewards/chosen": -0.16845914721488953,
301
+ "rewards/margins": 0.2435651272535324,
302
+ "rewards/rejected": -0.4120243191719055,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.62,
307
+ "learning_rate": 8.717368541944452e-07,
308
+ "logits/chosen": -0.07102419435977936,
309
+ "logits/rejected": 0.02669895812869072,
310
+ "logps/chosen": -359.16259765625,
311
+ "logps/rejected": -266.5383605957031,
312
+ "loss": 0.6314,
313
+ "rewards/accuracies": 0.7124999761581421,
314
+ "rewards/chosen": -0.24132736027240753,
315
+ "rewards/margins": 0.3515443503856659,
316
+ "rewards/rejected": -0.5928717255592346,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.65,
321
+ "learning_rate": 8.54854646967831e-07,
322
+ "logits/chosen": -0.1354692280292511,
323
+ "logits/rejected": -0.0510697066783905,
324
+ "logps/chosen": -381.85028076171875,
325
+ "logps/rejected": -264.0498046875,
326
+ "loss": 0.6208,
327
+ "rewards/accuracies": 0.643750011920929,
328
+ "rewards/chosen": -0.22062142193317413,
329
+ "rewards/margins": 0.3123762011528015,
330
+ "rewards/rejected": -0.5329976081848145,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.67,
335
+ "learning_rate": 8.371148865928318e-07,
336
+ "logits/chosen": -0.10147638618946075,
337
+ "logits/rejected": -0.049172915518283844,
338
+ "logps/chosen": -355.6955261230469,
339
+ "logps/rejected": -262.17279052734375,
340
+ "loss": 0.6157,
341
+ "rewards/accuracies": 0.637499988079071,
342
+ "rewards/chosen": -0.24571116268634796,
343
+ "rewards/margins": 0.259532630443573,
344
+ "rewards/rejected": -0.5052437782287598,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.7,
349
+ "learning_rate": 8.185604435447001e-07,
350
+ "logits/chosen": -0.08465194702148438,
351
+ "logits/rejected": -0.07315615564584732,
352
+ "logps/chosen": -322.062744140625,
353
+ "logps/rejected": -276.74908447265625,
354
+ "loss": 0.6184,
355
+ "rewards/accuracies": 0.6499999761581421,
356
+ "rewards/chosen": -0.3060557246208191,
357
+ "rewards/margins": 0.24059104919433594,
358
+ "rewards/rejected": -0.546646773815155,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.73,
363
+ "learning_rate": 7.992361570870287e-07,
364
+ "logits/chosen": -0.09758251905441284,
365
+ "logits/rejected": -0.011859369464218616,
366
+ "logps/chosen": -369.69940185546875,
367
+ "logps/rejected": -246.3345489501953,
368
+ "loss": 0.62,
369
+ "rewards/accuracies": 0.6875,
370
+ "rewards/chosen": -0.248373344540596,
371
+ "rewards/margins": 0.29594987630844116,
372
+ "rewards/rejected": -0.5443232655525208,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.76,
377
+ "learning_rate": 7.791887269117441e-07,
378
+ "logits/chosen": -0.10910804569721222,
379
+ "logits/rejected": -0.061185322701931,
380
+ "logps/chosen": -332.3966064453125,
381
+ "logps/rejected": -267.03057861328125,
382
+ "loss": 0.6309,
383
+ "rewards/accuracies": 0.625,
384
+ "rewards/chosen": -0.3056594729423523,
385
+ "rewards/margins": 0.2236270159482956,
386
+ "rewards/rejected": -0.5292865037918091,
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 0.79,
391
+ "learning_rate": 7.584666002831294e-07,
392
+ "logits/chosen": -0.15460695326328278,
393
+ "logits/rejected": -0.11919967085123062,
394
+ "logps/chosen": -383.691650390625,
395
+ "logps/rejected": -281.88201904296875,
396
+ "loss": 0.6118,
397
+ "rewards/accuracies": 0.643750011920929,
398
+ "rewards/chosen": -0.25431981682777405,
399
+ "rewards/margins": 0.27084994316101074,
400
+ "rewards/rejected": -0.5251697301864624,
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 0.81,
405
+ "learning_rate": 7.37119854958609e-07,
406
+ "logits/chosen": -0.09878663718700409,
407
+ "logits/rejected": -0.054516829550266266,
408
+ "logps/chosen": -367.6859130859375,
409
+ "logps/rejected": -266.76129150390625,
410
+ "loss": 0.6145,
411
+ "rewards/accuracies": 0.625,
412
+ "rewards/chosen": -0.2599807381629944,
413
+ "rewards/margins": 0.22561149299144745,
414
+ "rewards/rejected": -0.4855922758579254,
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 0.84,
419
+ "learning_rate": 7.152000781692285e-07,
420
+ "logits/chosen": -0.06776697933673859,
421
+ "logits/rejected": -0.08343350887298584,
422
+ "logps/chosen": -330.73406982421875,
423
+ "logps/rejected": -282.1576232910156,
424
+ "loss": 0.6194,
425
+ "rewards/accuracies": 0.6312500238418579,
426
+ "rewards/chosen": -0.2815566062927246,
427
+ "rewards/margins": 0.25665608048439026,
428
+ "rewards/rejected": -0.5382126569747925,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 0.87,
433
+ "learning_rate": 6.927602419522946e-07,
434
+ "logits/chosen": -0.19854819774627686,
435
+ "logits/rejected": -0.15916487574577332,
436
+ "logps/chosen": -394.5406799316406,
437
+ "logps/rejected": -283.3205871582031,
438
+ "loss": 0.6056,
439
+ "rewards/accuracies": 0.6875,
440
+ "rewards/chosen": -0.18460381031036377,
441
+ "rewards/margins": 0.36649608612060547,
442
+ "rewards/rejected": -0.5510998964309692,
443
  "step": 310
444
  },
445
  {
446
+ "epoch": 0.9,
447
+ "learning_rate": 6.698545751374463e-07,
448
+ "logits/chosen": -0.1863429993391037,
449
+ "logits/rejected": -0.1495630294084549,
450
+ "logps/chosen": -378.22869873046875,
451
+ "logps/rejected": -304.0544128417969,
452
+ "loss": 0.6041,
453
+ "rewards/accuracies": 0.6187499761581421,
454
+ "rewards/chosen": -0.3425668478012085,
455
+ "rewards/margins": 0.2656816840171814,
456
+ "rewards/rejected": -0.6082485318183899,
457
  "step": 320
458
  },
459
  {
460
+ "epoch": 0.93,
461
+ "learning_rate": 6.465384322955224e-07,
462
+ "logits/chosen": -0.23867850005626678,
463
+ "logits/rejected": -0.15165486931800842,
464
+ "logps/chosen": -390.47216796875,
465
+ "logps/rejected": -301.1231994628906,
466
+ "loss": 0.6279,
467
+ "rewards/accuracies": 0.6499999761581421,
468
+ "rewards/chosen": -0.2970598340034485,
469
+ "rewards/margins": 0.28874671459198,
470
+ "rewards/rejected": -0.5858064889907837,
471
  "step": 330
472
  },
473
  {
474
+ "epoch": 0.96,
475
+ "learning_rate": 6.228681599669248e-07,
476
+ "logits/chosen": -0.25492700934410095,
477
+ "logits/rejected": -0.17601162195205688,
478
+ "logps/chosen": -340.286376953125,
479
+ "logps/rejected": -277.90606689453125,
480
+ "loss": 0.6209,
481
+ "rewards/accuracies": 0.6812499761581421,
482
+ "rewards/chosen": -0.287325382232666,
483
+ "rewards/margins": 0.2902410924434662,
484
+ "rewards/rejected": -0.5775664448738098,
485
  "step": 340
486
  },
487
  {
488
+ "epoch": 0.98,
489
+ "learning_rate": 5.989009604927586e-07,
490
+ "logits/chosen": -0.18320497870445251,
491
+ "logits/rejected": -0.09960085898637772,
492
+ "logps/chosen": -340.33306884765625,
493
+ "logps/rejected": -269.10101318359375,
494
+ "loss": 0.6106,
495
+ "rewards/accuracies": 0.6000000238418579,
496
+ "rewards/chosen": -0.3515200912952423,
497
+ "rewards/margins": 0.23425371944904327,
498
+ "rewards/rejected": -0.5857738256454468,
499
  "step": 350
500
  },
501
  {
502
+ "epoch": 1.01,
503
+ "learning_rate": 5.74694753777815e-07,
504
+ "logits/chosen": -0.17563189566135406,
505
+ "logits/rejected": -0.1343529373407364,
506
+ "logps/chosen": -359.41845703125,
507
+ "logps/rejected": -293.6738586425781,
508
+ "loss": 0.5647,
509
+ "rewards/accuracies": 0.793749988079071,
510
+ "rewards/chosen": -0.2671016454696655,
511
+ "rewards/margins": 0.4457646310329437,
512
+ "rewards/rejected": -0.7128663063049316,
513
  "step": 360
514
  },
515
  {
516
+ "epoch": 1.04,
517
+ "learning_rate": 5.503080373194666e-07,
518
+ "logits/chosen": -0.19107575714588165,
519
+ "logits/rejected": -0.1297108232975006,
520
+ "logps/chosen": -330.1180725097656,
521
+ "logps/rejected": -284.3822326660156,
522
+ "loss": 0.4933,
523
+ "rewards/accuracies": 0.856249988079071,
524
+ "rewards/chosen": -0.307260125875473,
525
+ "rewards/margins": 0.6193431615829468,
526
+ "rewards/rejected": -0.9266033172607422,
527
  "step": 370
528
  },
529
  {
530
+ "epoch": 1.07,
531
+ "learning_rate": 5.257997448407366e-07,
532
+ "logits/chosen": -0.25203296542167664,
533
+ "logits/rejected": -0.15098513662815094,
534
+ "logps/chosen": -356.20220947265625,
535
+ "logps/rejected": -304.3904113769531,
536
+ "loss": 0.4944,
537
+ "rewards/accuracies": 0.8125,
538
+ "rewards/chosen": -0.44451460242271423,
539
+ "rewards/margins": 0.5491349101066589,
540
+ "rewards/rejected": -0.9936494827270508,
541
  "step": 380
542
  },
543
  {
544
+ "epoch": 1.1,
545
+ "learning_rate": 5.012291038691665e-07,
546
+ "logits/chosen": -0.11390683799982071,
547
+ "logits/rejected": 0.04292234033346176,
548
+ "logps/chosen": -406.4115295410156,
549
+ "logps/rejected": -312.5035705566406,
550
+ "loss": 0.4786,
551
+ "rewards/accuracies": 0.875,
552
+ "rewards/chosen": -0.5159555673599243,
553
+ "rewards/margins": 0.7496730089187622,
554
+ "rewards/rejected": -1.2656285762786865,
555
  "step": 390
556
  },
557
  {
558
+ "epoch": 1.12,
559
+ "learning_rate": 4.7665549260567063e-07,
560
+ "logits/chosen": -0.1984197050333023,
561
+ "logits/rejected": -0.1335490345954895,
562
+ "logps/chosen": -358.4623107910156,
563
+ "logps/rejected": -365.38812255859375,
564
+ "loss": 0.4764,
565
+ "rewards/accuracies": 0.824999988079071,
566
+ "rewards/chosen": -0.8547590970993042,
567
+ "rewards/margins": 0.6313014626502991,
568
+ "rewards/rejected": -1.4860605001449585,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  "step": 400
570
  },
571
  {
572
+ "epoch": 1.15,
573
+ "learning_rate": 4.521382964292663e-07,
574
+ "logits/chosen": -0.10411302745342255,
575
+ "logits/rejected": -0.04661521688103676,
576
+ "logps/chosen": -423.57183837890625,
577
+ "logps/rejected": -370.1197204589844,
578
+ "loss": 0.4652,
579
+ "rewards/accuracies": 0.8374999761581421,
580
+ "rewards/chosen": -0.7738268971443176,
581
+ "rewards/margins": 0.8890780210494995,
582
+ "rewards/rejected": -1.662904977798462,
583
  "step": 410
584
  },
585
  {
586
+ "epoch": 1.18,
587
+ "learning_rate": 4.277367643844574e-07,
588
+ "logits/chosen": -0.15602782368659973,
589
+ "logits/rejected": -0.07274512201547623,
590
+ "logps/chosen": -402.03546142578125,
591
+ "logps/rejected": -334.91290283203125,
592
+ "loss": 0.4601,
593
+ "rewards/accuracies": 0.84375,
594
+ "rewards/chosen": -0.7279518842697144,
595
+ "rewards/margins": 0.8058152198791504,
596
+ "rewards/rejected": -1.5337669849395752,
597
  "step": 420
598
  },
599
  {
600
+ "epoch": 1.21,
601
+ "learning_rate": 4.035098659980891e-07,
602
+ "logits/chosen": -0.06685085594654083,
603
+ "logits/rejected": 0.018254879862070084,
604
+ "logps/chosen": -423.72540283203125,
605
+ "logps/rejected": -415.8779296875,
606
+ "loss": 0.465,
607
+ "rewards/accuracies": 0.8062499761581421,
608
+ "rewards/chosen": -1.001319169998169,
609
+ "rewards/margins": 0.7759243845939636,
610
+ "rewards/rejected": -1.7772436141967773,
611
  "step": 430
612
  },
613
  {
614
+ "epoch": 1.24,
615
+ "learning_rate": 3.795161487716928e-07,
616
+ "logits/chosen": -0.09074730426073074,
617
+ "logits/rejected": 0.07650933414697647,
618
+ "logps/chosen": -479.47576904296875,
619
+ "logps/rejected": -421.45941162109375,
620
+ "loss": 0.4532,
621
+ "rewards/accuracies": 0.8812500238418579,
622
+ "rewards/chosen": -1.1542574167251587,
623
+ "rewards/margins": 0.8317985534667969,
624
+ "rewards/rejected": -1.9860557317733765,
625
  "step": 440
626
  },
627
  {
628
+ "epoch": 1.26,
629
+ "learning_rate": 3.5581359669371223e-07,
630
+ "logits/chosen": -0.022224614396691322,
631
+ "logits/rejected": 0.0994531437754631,
632
+ "logps/chosen": -337.89447021484375,
633
+ "logps/rejected": -357.2190856933594,
634
+ "loss": 0.4517,
635
+ "rewards/accuracies": 0.8500000238418579,
636
+ "rewards/chosen": -1.0043950080871582,
637
+ "rewards/margins": 0.7450950741767883,
638
+ "rewards/rejected": -1.7494901418685913,
639
  "step": 450
640
  },
641
  {
642
+ "epoch": 1.29,
643
+ "learning_rate": 3.324594901135326e-07,
644
+ "logits/chosen": -0.004194366279989481,
645
+ "logits/rejected": 0.06137076020240784,
646
+ "logps/chosen": -390.3214111328125,
647
+ "logps/rejected": -367.9646911621094,
648
+ "loss": 0.4545,
649
+ "rewards/accuracies": 0.862500011920929,
650
+ "rewards/chosen": -1.0364924669265747,
651
+ "rewards/margins": 0.8088264465332031,
652
+ "rewards/rejected": -1.8453190326690674,
653
  "step": 460
654
  },
655
  {
656
+ "epoch": 1.32,
657
+ "learning_rate": 3.095102673159463e-07,
658
+ "logits/chosen": -0.037959642708301544,
659
+ "logits/rejected": 0.04749811813235283,
660
+ "logps/chosen": -429.95220947265625,
661
+ "logps/rejected": -400.9679260253906,
662
+ "loss": 0.4324,
663
+ "rewards/accuracies": 0.8500000238418579,
664
+ "rewards/chosen": -1.0624555349349976,
665
+ "rewards/margins": 0.8692095875740051,
666
+ "rewards/rejected": -1.931665062904358,
667
  "step": 470
668
  },
669
  {
670
+ "epoch": 1.35,
671
+ "learning_rate": 2.870213881305802e-07,
672
+ "logits/chosen": -0.035066571086645126,
673
+ "logits/rejected": 0.040720295161008835,
674
+ "logps/chosen": -474.01177978515625,
675
+ "logps/rejected": -409.0857849121094,
676
+ "loss": 0.4415,
677
+ "rewards/accuracies": 0.84375,
678
+ "rewards/chosen": -1.0656111240386963,
679
+ "rewards/margins": 0.9570503234863281,
680
+ "rewards/rejected": -2.0226616859436035,
681
  "step": 480
682
  },
683
  {
684
+ "epoch": 1.38,
685
+ "learning_rate": 2.6504719990588745e-07,
686
+ "logits/chosen": 0.06609617173671722,
687
+ "logits/rejected": 0.09753261506557465,
688
+ "logps/chosen": -489.17449951171875,
689
+ "logps/rejected": -452.04052734375,
690
+ "loss": 0.4317,
691
+ "rewards/accuracies": 0.856249988079071,
692
+ "rewards/chosen": -1.2529394626617432,
693
+ "rewards/margins": 1.057414174079895,
694
+ "rewards/rejected": -2.3103537559509277,
695
  "step": 490
696
  },
697
  {
698
+ "epoch": 1.4,
699
+ "learning_rate": 2.436408061715988e-07,
700
+ "logits/chosen": -0.0025597973726689816,
701
+ "logits/rejected": 0.0930376946926117,
702
+ "logps/chosen": -509.1451110839844,
703
+ "logps/rejected": -503.16009521484375,
704
+ "loss": 0.432,
705
+ "rewards/accuracies": 0.78125,
706
+ "rewards/chosen": -1.5313117504119873,
707
+ "rewards/margins": 0.9933918118476868,
708
+ "rewards/rejected": -2.5247039794921875,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  "step": 500
710
  },
711
  {
712
+ "epoch": 1.43,
713
+ "learning_rate": 2.22853938307025e-07,
714
+ "logits/chosen": 0.20588858425617218,
715
+ "logits/rejected": 0.3083997964859009,
716
+ "logps/chosen": -441.025146484375,
717
+ "logps/rejected": -412.0359802246094,
718
+ "loss": 0.4362,
719
+ "rewards/accuracies": 0.8125,
720
+ "rewards/chosen": -1.1732820272445679,
721
+ "rewards/margins": 0.8777497410774231,
722
+ "rewards/rejected": -2.0510315895080566,
723
  "step": 510
724
  },
725
  {
726
+ "epoch": 1.46,
727
+ "learning_rate": 2.0273683052534173e-07,
728
+ "logits/chosen": 0.13039740920066833,
729
+ "logits/rejected": 0.21461403369903564,
730
+ "logps/chosen": -450.8875427246094,
731
+ "logps/rejected": -447.06219482421875,
732
+ "loss": 0.4382,
733
+ "rewards/accuracies": 0.84375,
734
+ "rewards/chosen": -1.076249599456787,
735
+ "rewards/margins": 0.9202485084533691,
736
+ "rewards/rejected": -1.9964981079101562,
737
  "step": 520
738
  },
739
  {
740
+ "epoch": 1.49,
741
+ "learning_rate": 1.833380984759764e-07,
742
+ "logits/chosen": 0.06673362106084824,
743
+ "logits/rejected": 0.14447852969169617,
744
+ "logps/chosen": -474.4076232910156,
745
+ "logps/rejected": -404.3707275390625,
746
+ "loss": 0.4378,
747
+ "rewards/accuracies": 0.8500000238418579,
748
+ "rewards/chosen": -1.1933484077453613,
749
+ "rewards/margins": 0.8106036186218262,
750
+ "rewards/rejected": -2.0039520263671875,
751
  "step": 530
752
  },
753
  {
754
+ "epoch": 1.52,
755
+ "learning_rate": 1.6470462175846606e-07,
756
+ "logits/chosen": 0.07205445319414139,
757
+ "logits/rejected": 0.2100316733121872,
758
+ "logps/chosen": -496.0130920410156,
759
+ "logps/rejected": -440.20806884765625,
760
+ "loss": 0.4301,
761
+ "rewards/accuracies": 0.84375,
762
+ "rewards/chosen": -1.321128010749817,
763
+ "rewards/margins": 0.9277971982955933,
764
+ "rewards/rejected": -2.24892520904541,
765
  "step": 540
766
  },
767
  {
768
+ "epoch": 1.55,
769
+ "learning_rate": 1.468814306317092e-07,
770
+ "logits/chosen": 0.14164285361766815,
771
+ "logits/rejected": 0.1971181184053421,
772
+ "logps/chosen": -481.12646484375,
773
+ "logps/rejected": -430.11212158203125,
774
+ "loss": 0.4254,
775
+ "rewards/accuracies": 0.856249988079071,
776
+ "rewards/chosen": -1.3925974369049072,
777
+ "rewards/margins": 0.8229917287826538,
778
+ "rewards/rejected": -2.2155890464782715,
779
  "step": 550
780
  },
781
  {
782
+ "epoch": 1.57,
783
+ "learning_rate": 1.299115971923958e-07,
784
+ "logits/chosen": 0.2140401303768158,
785
+ "logits/rejected": 0.3077262341976166,
786
+ "logps/chosen": -458.56732177734375,
787
+ "logps/rejected": -426.39404296875,
788
+ "loss": 0.4265,
789
+ "rewards/accuracies": 0.793749988079071,
790
+ "rewards/chosen": -1.2237931489944458,
791
+ "rewards/margins": 0.8743373155593872,
792
+ "rewards/rejected": -2.098130464553833,
793
  "step": 560
794
  },
795
  {
796
+ "epoch": 1.6,
797
+ "learning_rate": 1.1383613128559305e-07,
798
+ "logits/chosen": 0.15547600388526917,
799
+ "logits/rejected": 0.22594401240348816,
800
+ "logps/chosen": -452.096435546875,
801
+ "logps/rejected": -461.7115783691406,
802
+ "loss": 0.4158,
803
+ "rewards/accuracies": 0.78125,
804
+ "rewards/chosen": -1.3260557651519775,
805
+ "rewards/margins": 0.8624703288078308,
806
+ "rewards/rejected": -2.188526153564453,
807
  "step": 570
808
  },
809
  {
810
+ "epoch": 1.63,
811
+ "learning_rate": 9.869388139903495e-08,
812
+ "logits/chosen": 0.16985264420509338,
813
+ "logits/rejected": 0.24287500977516174,
814
+ "logps/chosen": -453.4273986816406,
815
+ "logps/rejected": -435.1573791503906,
816
+ "loss": 0.4389,
817
+ "rewards/accuracies": 0.831250011920929,
818
+ "rewards/chosen": -1.3252050876617432,
819
+ "rewards/margins": 0.9360635876655579,
820
+ "rewards/rejected": -2.2612688541412354,
821
  "step": 580
822
  },
823
  {
824
+ "epoch": 1.66,
825
+ "learning_rate": 8.452144078061818e-08,
826
+ "logits/chosen": 0.2516225576400757,
827
+ "logits/rejected": 0.35612887144088745,
828
+ "logps/chosen": -433.05828857421875,
829
+ "logps/rejected": -372.24273681640625,
830
+ "loss": 0.4428,
831
+ "rewards/accuracies": 0.824999988079071,
832
+ "rewards/chosen": -1.2665939331054688,
833
+ "rewards/margins": 0.7784386873245239,
834
+ "rewards/rejected": -2.0450327396392822,
835
  "step": 590
836
  },
837
  {
838
+ "epoch": 1.69,
839
+ "learning_rate": 7.135305900598321e-08,
840
+ "logits/chosen": 0.2013220340013504,
841
+ "logits/rejected": 0.30089515447616577,
842
+ "logps/chosen": -451.4398498535156,
843
+ "logps/rejected": -415.129638671875,
844
+ "loss": 0.4265,
845
+ "rewards/accuracies": 0.824999988079071,
846
+ "rewards/chosen": -1.3577957153320312,
847
+ "rewards/margins": 0.7237957119941711,
848
+ "rewards/rejected": -2.0815916061401367,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849
  "step": 600
850
  },
851
  {
852
+ "epoch": 1.71,
853
+ "learning_rate": 5.9220559209888166e-08,
854
+ "logits/chosen": 0.14896592497825623,
855
+ "logits/rejected": 0.29967430233955383,
856
+ "logps/chosen": -466.57012939453125,
857
+ "logps/rejected": -392.4150390625,
858
+ "loss": 0.4393,
859
+ "rewards/accuracies": 0.824999988079071,
860
+ "rewards/chosen": -1.3139363527297974,
861
+ "rewards/margins": 0.8993911743164062,
862
+ "rewards/rejected": -2.213327646255493,
863
  "step": 610
864
  },
865
  {
866
+ "epoch": 1.74,
867
+ "learning_rate": 4.815326118139812e-08,
868
+ "logits/chosen": 0.002818810986354947,
869
+ "logits/rejected": 0.14390364289283752,
870
+ "logps/chosen": -485.42095947265625,
871
+ "logps/rejected": -445.1665954589844,
872
+ "loss": 0.4256,
873
  "rewards/accuracies": 0.862500011920929,
874
+ "rewards/chosen": -1.2553160190582275,
875
+ "rewards/margins": 1.0687906742095947,
876
+ "rewards/rejected": -2.3241066932678223,
877
  "step": 620
878
  },
879
  {
880
+ "epoch": 1.77,
881
+ "learning_rate": 3.81779105087407e-08,
882
+ "logits/chosen": 0.18004778027534485,
883
+ "logits/rejected": 0.2915050983428955,
884
+ "logps/chosen": -433.8690490722656,
885
+ "logps/rejected": -449.0361328125,
886
+ "loss": 0.4151,
887
+ "rewards/accuracies": 0.8125,
888
+ "rewards/chosen": -1.3270412683486938,
889
+ "rewards/margins": 0.9789830446243286,
890
+ "rewards/rejected": -2.3060243129730225,
891
  "step": 630
892
  },
893
  {
894
+ "epoch": 1.8,
895
+ "learning_rate": 2.9318613945057637e-08,
896
+ "logits/chosen": 0.19966045022010803,
897
+ "logits/rejected": 0.26041653752326965,
898
+ "logps/chosen": -475.998291015625,
899
+ "logps/rejected": -438.5953063964844,
900
+ "loss": 0.4212,
901
+ "rewards/accuracies": 0.824999988079071,
902
+ "rewards/chosen": -1.2703568935394287,
903
+ "rewards/margins": 0.9554103016853333,
904
+ "rewards/rejected": -2.225767135620117,
905
  "step": 640
906
  },
907
  {
908
+ "epoch": 1.83,
909
+ "learning_rate": 2.1596781151249523e-08,
910
+ "logits/chosen": 0.2097601592540741,
911
+ "logits/rejected": 0.332507848739624,
912
+ "logps/chosen": -467.12835693359375,
913
+ "logps/rejected": -460.8416442871094,
914
+ "loss": 0.4172,
915
+ "rewards/accuracies": 0.8500000238418579,
916
+ "rewards/chosen": -1.3552840948104858,
917
+ "rewards/margins": 1.1028287410736084,
918
+ "rewards/rejected": -2.4581127166748047,
919
  "step": 650
920
  },
921
  {
922
+ "epoch": 1.85,
923
+ "learning_rate": 1.5031072956701695e-08,
924
+ "logits/chosen": 0.16343393921852112,
925
+ "logits/rejected": 0.20303264260292053,
926
+ "logps/chosen": -433.28558349609375,
927
+ "logps/rejected": -458.15350341796875,
928
+ "loss": 0.4253,
929
+ "rewards/accuracies": 0.8062499761581421,
930
+ "rewards/chosen": -1.513602614402771,
931
+ "rewards/margins": 0.8022111058235168,
932
+ "rewards/rejected": -2.3158137798309326,
933
  "step": 660
934
  },
935
  {
936
+ "epoch": 1.88,
937
+ "learning_rate": 9.637356262923723e-09,
938
+ "logits/chosen": 0.08328579366207123,
939
+ "logits/rejected": 0.25923627614974976,
940
+ "logps/chosen": -468.628173828125,
941
+ "logps/rejected": -414.67608642578125,
942
+ "loss": 0.4355,
943
  "rewards/accuracies": 0.856249988079071,
944
+ "rewards/chosen": -1.3132652044296265,
945
+ "rewards/margins": 0.9038375616073608,
946
+ "rewards/rejected": -2.2171027660369873,
947
  "step": 670
948
  },
949
  {
950
+ "epoch": 1.91,
951
+ "learning_rate": 5.428665699084789e-09,
952
+ "logits/chosen": 0.18607434630393982,
953
+ "logits/rejected": 0.22493357956409454,
954
+ "logps/chosen": -407.1839294433594,
955
+ "logps/rejected": -398.12158203125,
956
+ "loss": 0.4253,
957
+ "rewards/accuracies": 0.7875000238418579,
958
+ "rewards/chosen": -1.400312900543213,
959
+ "rewards/margins": 0.7541720271110535,
960
+ "rewards/rejected": -2.154484987258911,
961
  "step": 680
962
  },
963
  {
964
+ "epoch": 1.94,
965
+ "learning_rate": 2.415172122110343e-09,
966
+ "logits/chosen": 0.029862558469176292,
967
+ "logits/rejected": 0.22492480278015137,
968
+ "logps/chosen": -504.3904724121094,
969
+ "logps/rejected": -438.556640625,
970
+ "loss": 0.4297,
971
+ "rewards/accuracies": 0.84375,
972
+ "rewards/chosen": -1.229479193687439,
973
+ "rewards/margins": 0.9579814076423645,
974
+ "rewards/rejected": -2.187460422515869,
975
  "step": 690
976
  },
977
  {
978
+ "epoch": 1.97,
979
+ "learning_rate": 6.041580374618327e-10,
980
+ "logits/chosen": 0.20579871535301208,
981
+ "logits/rejected": 0.33758652210235596,
982
+ "logps/chosen": -440.26605224609375,
983
+ "logps/rejected": -435.342041015625,
984
+ "loss": 0.4363,
985
+ "rewards/accuracies": 0.824999988079071,
986
+ "rewards/chosen": -1.2293314933776855,
987
+ "rewards/margins": 0.9770258069038391,
988
+ "rewards/rejected": -2.20635724067688,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
989
  "step": 700
990
  },
991
  {
992
+ "epoch": 2.0,
993
+ "learning_rate": 0.0,
994
+ "logits/chosen": 0.20219314098358154,
995
+ "logits/rejected": 0.39273345470428467,
996
+ "logps/chosen": -488.3095703125,
997
+ "logps/rejected": -427.59088134765625,
998
+ "loss": 0.425,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  "rewards/accuracies": 0.84375,
1000
+ "rewards/chosen": -1.406359076499939,
1001
+ "rewards/margins": 0.9064434170722961,
1002
+ "rewards/rejected": -2.312802314758301,
1003
+ "step": 710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1004
  },
1005
  {
1006
  "epoch": 2.0,
1007
+ "step": 710,
1008
  "total_flos": 0.0,
1009
+ "train_loss": 0.5396277803770253,
1010
+ "train_runtime": 10358.1577,
1011
+ "train_samples_per_second": 8.795,
1012
+ "train_steps_per_second": 0.069
1013
  }
1014
  ],
1015
  "logging_steps": 10,
1016
+ "max_steps": 710,
1017
  "num_train_epochs": 2,
1018
+ "save_steps": 10000,
1019
  "total_flos": 0.0,
1020
  "trial_name": null,
1021
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5afd5730214e6bd724e3cab0f3dcc26a9879ab9f6aff92cdb3a2b93fd0a49305
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2390c62cafd275700a8cab032e404ced08bab84cec2b9e7aad9c5b00a3adfe4b
3
  size 6648