RikkiXu commited on
Commit
155df57
1 Parent(s): 63dba8c

Model save

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.2511
19
- - Rewards/chosen: 14.1512
20
- - Rewards/rejected: -27.0299
21
- - Rewards/accuracies: 0.9297
22
- - Rewards/margins: 41.1811
23
- - Logps/rejected: -120.2706
24
- - Logps/chosen: -123.6211
25
- - Logits/rejected: -1.8742
26
- - Logits/chosen: -1.8698
27
 
28
  ## Model description
29
 
@@ -60,10 +60,10 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.1812 | 0.21 | 100 | 0.1474 | 12.7707 | -20.3277 | 0.9180 | 33.0984 | -113.5685 | -125.0015 | -1.7088 | -1.7301 |
64
- | 0.2958 | 0.42 | 200 | 0.2224 | 15.4746 | -23.1680 | 0.9258 | 38.6426 | -116.4087 | -122.2977 | -1.8350 | -1.8384 |
65
- | 0.3034 | 0.63 | 300 | 0.2672 | 14.1732 | -27.0300 | 0.9258 | 41.2032 | -120.2707 | -123.5991 | -1.8525 | -1.8496 |
66
- | 0.3576 | 0.84 | 400 | 0.2511 | 14.1512 | -27.0299 | 0.9297 | 41.1811 | -120.2706 | -123.6211 | -1.8742 | -1.8698 |
67
 
68
 
69
  ### Framework versions
 
15
 
16
  This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.1420
19
+ - Rewards/chosen: 5.3389
20
+ - Rewards/rejected: -11.5415
21
+ - Rewards/accuracies: 0.9258
22
+ - Rewards/margins: 16.8803
23
+ - Logps/rejected: -131.7123
24
+ - Logps/chosen: -119.9761
25
+ - Logits/rejected: -1.8130
26
+ - Logits/chosen: -1.8134
27
 
28
  ## Model description
29
 
 
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.1496 | 0.21 | 100 | 0.1356 | 4.1320 | -11.2810 | 0.9414 | 15.4129 | -130.8439 | -123.9990 | -1.7910 | -1.8010 |
64
+ | 0.1795 | 0.42 | 200 | 0.1364 | 5.2675 | -11.0420 | 0.9336 | 16.3095 | -130.0476 | -120.2140 | -1.8607 | -1.8614 |
65
+ | 0.1585 | 0.63 | 300 | 0.1425 | 5.1387 | -11.7029 | 0.9258 | 16.8416 | -132.2504 | -120.6432 | -1.7960 | -1.7980 |
66
+ | 0.2005 | 0.84 | 400 | 0.1420 | 5.3389 | -11.5415 | 0.9258 | 16.8803 | -131.7123 | -119.9761 | -1.8130 | -1.8134 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.2831452648509995,
4
- "train_runtime": 7636.09,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 8.006,
7
  "train_steps_per_second": 0.063
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17749474911510196,
4
+ "train_runtime": 7645.2484,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 7.996,
7
  "train_steps_per_second": 0.063
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d9e87c778dd07bc5353b48ab0b1622d62c7f80b637a92d6e104e12a4c096d03
3
  size 4943178720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5505459ad98890a3203705be3e3b4413c1fdb1bed021f564a6052d2b7286ba53
3
  size 4943178720
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0e47ee8bdc7394a3c4fff74798dc65703da5a17a7fc91f399e0b692b0c5d987
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9cd4bbebcca95695bc07db64291f4364758bb91a1c6301d838900b964109691
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7077d5704ebaba1c41a0334618ab1a9bcc430ac4adbc6d5b16cb3a6fdfddc60
3
  size 4540532728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6603cc6955b683f2147a8107dd2423db37551bb18757806d3089f527c647290d
3
  size 4540532728
runs/May13_09-51-11_n136-129-074/events.out.tfevents.1715565205.n136-129-074.1804527.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b23f097896970362172141dbd46840a3bc9126f969a3f20bf6441135e008f6da
3
- size 35913
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e33ac5f6209d31be001170e36d34bbece7451aa9ab83dd828bb1e3deff048718
3
+ size 41083
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.2831452648509995,
4
- "train_runtime": 7636.09,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 8.006,
7
  "train_steps_per_second": 0.063
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.17749474911510196,
4
+ "train_runtime": 7645.2484,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 7.996,
7
  "train_steps_per_second": 0.063
8
  }
trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 4387.344432836715,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -1.689455509185791,
16
  "logits/rejected": -1.4794573783874512,
@@ -25,780 +25,780 @@
25
  },
26
  {
27
  "epoch": 0.02,
28
- "grad_norm": 2997.186248490448,
29
  "learning_rate": 1.0416666666666667e-07,
30
- "logits/chosen": -1.7082680463790894,
31
- "logits/rejected": -1.610369324684143,
32
- "logps/chosen": -139.5821990966797,
33
- "logps/rejected": -91.33868408203125,
34
- "loss": 0.7371,
35
- "rewards/accuracies": 0.5138888955116272,
36
- "rewards/chosen": 0.14069372415542603,
37
- "rewards/margins": 0.1832776665687561,
38
- "rewards/rejected": -0.04258394241333008,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
- "grad_norm": 978.7690038539965,
44
  "learning_rate": 2.0833333333333333e-07,
45
- "logits/chosen": -1.640048623085022,
46
- "logits/rejected": -1.6500003337860107,
47
- "logps/chosen": -130.82679748535156,
48
- "logps/rejected": -93.84379577636719,
49
- "loss": 0.3586,
50
- "rewards/accuracies": 0.8125,
51
- "rewards/chosen": 1.8478235006332397,
52
- "rewards/margins": 2.4079792499542236,
53
- "rewards/rejected": -0.5601558685302734,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
- "grad_norm": 882.5432957594307,
59
  "learning_rate": 3.1249999999999997e-07,
60
- "logits/chosen": -1.7016046047210693,
61
- "logits/rejected": -1.6314153671264648,
62
- "logps/chosen": -132.51332092285156,
63
- "logps/rejected": -104.1015625,
64
- "loss": 0.2002,
65
- "rewards/accuracies": 0.9312499761581421,
66
- "rewards/chosen": 5.684301853179932,
67
- "rewards/margins": 8.036184310913086,
68
- "rewards/rejected": -2.3518824577331543,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
- "grad_norm": 948.7428618668862,
74
  "learning_rate": 4.1666666666666667e-07,
75
- "logits/chosen": -1.6547832489013672,
76
- "logits/rejected": -1.5681570768356323,
77
- "logps/chosen": -143.4683074951172,
78
- "logps/rejected": -105.14913177490234,
79
- "loss": 0.1727,
80
  "rewards/accuracies": 0.925000011920929,
81
- "rewards/chosen": 8.252656936645508,
82
- "rewards/margins": 14.83061695098877,
83
- "rewards/rejected": -6.577960968017578,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
- "grad_norm": 1493.2315961148001,
89
  "learning_rate": 4.999733114418725e-07,
90
- "logits/chosen": -1.6189439296722412,
91
- "logits/rejected": -1.639786958694458,
92
- "logps/chosen": -126.5509262084961,
93
- "logps/rejected": -110.4835433959961,
94
- "loss": 0.1918,
95
- "rewards/accuracies": 0.862500011920929,
96
- "rewards/chosen": 8.523491859436035,
97
- "rewards/margins": 19.890926361083984,
98
- "rewards/rejected": -11.367437362670898,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
- "grad_norm": 697.7628654861039,
104
  "learning_rate": 4.990398100856366e-07,
105
- "logits/chosen": -1.726836919784546,
106
- "logits/rejected": -1.664571762084961,
107
- "logps/chosen": -140.87637329101562,
108
- "logps/rejected": -116.43977355957031,
109
- "loss": 0.1811,
110
- "rewards/accuracies": 0.9312499761581421,
111
- "rewards/chosen": 10.12385082244873,
112
- "rewards/margins": 24.805103302001953,
113
- "rewards/rejected": -14.681253433227539,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
- "grad_norm": 851.3584358950797,
119
  "learning_rate": 4.967775735898179e-07,
120
- "logits/chosen": -1.670240044593811,
121
- "logits/rejected": -1.6913667917251587,
122
- "logps/chosen": -134.24986267089844,
123
- "logps/rejected": -115.22517395019531,
124
- "loss": 0.2071,
125
- "rewards/accuracies": 0.918749988079071,
126
- "rewards/chosen": 10.848904609680176,
127
- "rewards/margins": 28.108707427978516,
128
- "rewards/rejected": -17.25979995727539,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
- "grad_norm": 521.7442029758571,
134
  "learning_rate": 4.931986719649298e-07,
135
- "logits/chosen": -1.6372220516204834,
136
- "logits/rejected": -1.618297815322876,
137
- "logps/chosen": -128.79197692871094,
138
- "logps/rejected": -109.78861999511719,
139
- "loss": 0.2008,
140
  "rewards/accuracies": 0.918749988079071,
141
- "rewards/chosen": 11.66787052154541,
142
- "rewards/margins": 27.729543685913086,
143
- "rewards/rejected": -16.061674118041992,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
- "grad_norm": 1335.0772057896347,
149
  "learning_rate": 4.883222001996351e-07,
150
- "logits/chosen": -1.6502501964569092,
151
- "logits/rejected": -1.6660646200180054,
152
- "logps/chosen": -136.35589599609375,
153
- "logps/rejected": -121.8341064453125,
154
- "loss": 0.1828,
155
- "rewards/accuracies": 0.9312499761581421,
156
- "rewards/chosen": 12.854347229003906,
157
- "rewards/margins": 32.79566955566406,
158
- "rewards/rejected": -19.941320419311523,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
- "grad_norm": 597.1667877282997,
164
  "learning_rate": 4.821741763807186e-07,
165
- "logits/chosen": -1.6900157928466797,
166
- "logits/rejected": -1.6581432819366455,
167
- "logps/chosen": -116.9823989868164,
168
- "logps/rejected": -109.75309753417969,
169
- "loss": 0.1812,
170
- "rewards/accuracies": 0.9312499761581421,
171
- "rewards/chosen": 10.995382308959961,
172
- "rewards/margins": 30.093975067138672,
173
- "rewards/rejected": -19.098596572875977,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
- "eval_logits/chosen": -1.730061650276184,
179
- "eval_logits/rejected": -1.7088191509246826,
180
- "eval_logps/chosen": -125.00153350830078,
181
- "eval_logps/rejected": -113.56846618652344,
182
- "eval_loss": 0.14742514491081238,
183
- "eval_rewards/accuracies": 0.91796875,
184
- "eval_rewards/chosen": 12.770716667175293,
185
- "eval_rewards/margins": 33.09844970703125,
186
- "eval_rewards/rejected": -20.32773208618164,
187
- "eval_runtime": 97.6127,
188
- "eval_samples_per_second": 20.489,
189
  "eval_steps_per_second": 0.328,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
- "grad_norm": 748.8293985115087,
195
  "learning_rate": 4.747874028753375e-07,
196
- "logits/chosen": -1.6079628467559814,
197
- "logits/rejected": -1.6635444164276123,
198
- "logps/chosen": -121.47686767578125,
199
- "logps/rejected": -118.15767669677734,
200
- "loss": 0.1932,
201
- "rewards/accuracies": 0.925000011920929,
202
- "rewards/chosen": 10.569832801818848,
203
- "rewards/margins": 26.553226470947266,
204
- "rewards/rejected": -15.983392715454102,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
- "grad_norm": 685.6845900004148,
210
  "learning_rate": 4.662012913161997e-07,
211
- "logits/chosen": -1.67098069190979,
212
- "logits/rejected": -1.6562950611114502,
213
- "logps/chosen": -120.14444732666016,
214
- "logps/rejected": -119.04862976074219,
215
- "loss": 0.2,
216
- "rewards/accuracies": 0.856249988079071,
217
- "rewards/chosen": 12.69383716583252,
218
- "rewards/margins": 30.399723052978516,
219
- "rewards/rejected": -17.705890655517578,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
- "grad_norm": 365.3303217685943,
225
  "learning_rate": 4.5646165232345103e-07,
226
- "logits/chosen": -1.6567928791046143,
227
- "logits/rejected": -1.6678619384765625,
228
- "logps/chosen": -126.27610778808594,
229
- "logps/rejected": -113.00162506103516,
230
- "loss": 0.2132,
231
- "rewards/accuracies": 0.9125000238418579,
232
- "rewards/chosen": 14.438420295715332,
233
- "rewards/margins": 34.15327835083008,
234
- "rewards/rejected": -19.714855194091797,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
- "grad_norm": 927.4815100211999,
240
  "learning_rate": 4.456204510851956e-07,
241
- "logits/chosen": -1.5695436000823975,
242
- "logits/rejected": -1.5368653535842896,
243
- "logps/chosen": -123.9181900024414,
244
- "logps/rejected": -111.4970474243164,
245
- "loss": 0.2485,
246
- "rewards/accuracies": 0.918749988079071,
247
- "rewards/chosen": 13.650964736938477,
248
- "rewards/margins": 32.08143997192383,
249
- "rewards/rejected": -18.43047523498535,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
- "grad_norm": 884.1656325190633,
255
  "learning_rate": 4.337355301007335e-07,
256
- "logits/chosen": -1.6969578266143799,
257
- "logits/rejected": -1.722955346107483,
258
- "logps/chosen": -121.66493225097656,
259
- "logps/rejected": -109.62093353271484,
260
- "loss": 0.1799,
261
- "rewards/accuracies": 0.925000011920929,
262
- "rewards/chosen": 14.053888320922852,
263
- "rewards/margins": 33.25988006591797,
264
- "rewards/rejected": -19.20599365234375,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
- "grad_norm": 1383.2317309989403,
270
  "learning_rate": 4.2087030056579986e-07,
271
- "logits/chosen": -1.5813496112823486,
272
- "logits/rejected": -1.5495407581329346,
273
- "logps/chosen": -129.09225463867188,
274
- "logps/rejected": -117.093994140625,
275
- "loss": 0.2686,
276
- "rewards/accuracies": 0.9125000238418579,
277
- "rewards/chosen": 12.873576164245605,
278
- "rewards/margins": 31.02213478088379,
279
- "rewards/rejected": -18.148557662963867,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
- "grad_norm": 1121.824339354007,
285
  "learning_rate": 4.070934040463998e-07,
286
- "logits/chosen": -1.7786369323730469,
287
- "logits/rejected": -1.7519384622573853,
288
- "logps/chosen": -124.59269714355469,
289
- "logps/rejected": -116.8897933959961,
290
- "loss": 0.2735,
291
- "rewards/accuracies": 0.918749988079071,
292
- "rewards/chosen": 14.548820495605469,
293
- "rewards/margins": 35.767005920410156,
294
- "rewards/rejected": -21.218185424804688,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
- "grad_norm": 1198.59830465361,
300
  "learning_rate": 3.9247834624635404e-07,
301
- "logits/chosen": -1.7289392948150635,
302
- "logits/rejected": -1.7189258337020874,
303
- "logps/chosen": -127.54146575927734,
304
- "logps/rejected": -112.2616195678711,
305
- "loss": 0.2876,
306
- "rewards/accuracies": 0.925000011920929,
307
- "rewards/chosen": 15.61170482635498,
308
- "rewards/margins": 37.00361251831055,
309
- "rewards/rejected": -21.391910552978516,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
- "grad_norm": 926.4816546126996,
315
  "learning_rate": 3.7710310482256523e-07,
316
- "logits/chosen": -1.7749055624008179,
317
- "logits/rejected": -1.7469419240951538,
318
- "logps/chosen": -116.5710678100586,
319
- "logps/rejected": -125.67762756347656,
320
- "loss": 0.2836,
321
- "rewards/accuracies": 0.8999999761581421,
322
- "rewards/chosen": 15.228490829467773,
323
- "rewards/margins": 38.81087112426758,
324
- "rewards/rejected": -23.582382202148438,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
- "grad_norm": 963.2845441111758,
330
  "learning_rate": 3.610497133404795e-07,
331
- "logits/chosen": -1.7404663562774658,
332
- "logits/rejected": -1.7393602132797241,
333
- "logps/chosen": -122.38069152832031,
334
- "logps/rejected": -109.40885925292969,
335
- "loss": 0.2958,
336
  "rewards/accuracies": 0.9375,
337
- "rewards/chosen": 16.75569725036621,
338
- "rewards/margins": 38.156455993652344,
339
- "rewards/rejected": -21.400760650634766,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
- "eval_logits/chosen": -1.8384383916854858,
345
- "eval_logits/rejected": -1.8349756002426147,
346
- "eval_logps/chosen": -122.29769134521484,
347
- "eval_logps/rejected": -116.40873718261719,
348
- "eval_loss": 0.22235894203186035,
349
- "eval_rewards/accuracies": 0.92578125,
350
- "eval_rewards/chosen": 15.474552154541016,
351
- "eval_rewards/margins": 38.64255142211914,
352
- "eval_rewards/rejected": -23.167999267578125,
353
- "eval_runtime": 97.6373,
354
- "eval_samples_per_second": 20.484,
355
  "eval_steps_per_second": 0.328,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
- "grad_norm": 707.2538341296229,
361
  "learning_rate": 3.4440382358952115e-07,
362
- "logits/chosen": -1.6704992055892944,
363
- "logits/rejected": -1.605607271194458,
364
- "logps/chosen": -120.807373046875,
365
- "logps/rejected": -107.77888488769531,
366
- "loss": 0.2821,
367
- "rewards/accuracies": 0.8687499761581421,
368
- "rewards/chosen": 11.783378601074219,
369
- "rewards/margins": 29.960418701171875,
370
- "rewards/rejected": -18.17704200744629,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
- "grad_norm": 601.6467819271398,
376
  "learning_rate": 3.272542485937368e-07,
377
- "logits/chosen": -1.8005359172821045,
378
- "logits/rejected": -1.845910668373108,
379
- "logps/chosen": -122.75736236572266,
380
- "logps/rejected": -108.84068298339844,
381
- "loss": 0.2678,
382
- "rewards/accuracies": 0.9125000238418579,
383
- "rewards/chosen": 13.717930793762207,
384
- "rewards/margins": 35.02998733520508,
385
- "rewards/rejected": -21.31205177307129,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
- "grad_norm": 474.6819088542097,
391
  "learning_rate": 3.096924887558854e-07,
392
- "logits/chosen": -1.7543551921844482,
393
- "logits/rejected": -1.703619360923767,
394
- "logps/chosen": -134.04598999023438,
395
- "logps/rejected": -127.0660400390625,
396
- "loss": 0.3505,
397
- "rewards/accuracies": 0.925000011920929,
398
- "rewards/chosen": 13.662841796875,
399
- "rewards/margins": 40.915740966796875,
400
- "rewards/rejected": -27.252899169921875,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
- "grad_norm": 1030.0648597846362,
406
  "learning_rate": 2.9181224366319943e-07,
407
- "logits/chosen": -1.7851312160491943,
408
- "logits/rejected": -1.8008880615234375,
409
- "logps/chosen": -120.84779357910156,
410
- "logps/rejected": -118.9937515258789,
411
- "loss": 0.343,
412
- "rewards/accuracies": 0.893750011920929,
413
- "rewards/chosen": 11.07734203338623,
414
- "rewards/margins": 32.441627502441406,
415
- "rewards/rejected": -21.364286422729492,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
- "grad_norm": 984.6682547225865,
421
  "learning_rate": 2.7370891215954565e-07,
422
- "logits/chosen": -1.7966454029083252,
423
- "logits/rejected": -1.8319499492645264,
424
- "logps/chosen": -124.015869140625,
425
- "logps/rejected": -120.4018783569336,
426
- "loss": 0.3481,
427
- "rewards/accuracies": 0.9437500238418579,
428
- "rewards/chosen": 13.399293899536133,
429
- "rewards/margins": 36.81779098510742,
430
- "rewards/rejected": -23.41849708557129,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
- "grad_norm": 466.5498060764853,
436
  "learning_rate": 2.55479083351317e-07,
437
- "logits/chosen": -1.8401811122894287,
438
- "logits/rejected": -1.8560435771942139,
439
- "logps/chosen": -128.3160858154297,
440
- "logps/rejected": -107.73912048339844,
441
- "loss": 0.2405,
442
- "rewards/accuracies": 0.887499988079071,
443
- "rewards/chosen": 15.51091480255127,
444
- "rewards/margins": 38.373741149902344,
445
- "rewards/rejected": -22.862829208374023,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
- "grad_norm": 878.144034078672,
451
  "learning_rate": 2.3722002126275822e-07,
452
- "logits/chosen": -1.8353513479232788,
453
- "logits/rejected": -1.7788803577423096,
454
- "logps/chosen": -118.90830993652344,
455
- "logps/rejected": -113.4903335571289,
456
- "loss": 0.3013,
457
- "rewards/accuracies": 0.893750011920929,
458
- "rewards/chosen": 13.150169372558594,
459
- "rewards/margins": 35.406681060791016,
460
- "rewards/rejected": -22.256511688232422,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
- "grad_norm": 1244.2545962296354,
466
  "learning_rate": 2.19029145890313e-07,
467
- "logits/chosen": -1.6921329498291016,
468
- "logits/rejected": -1.778611183166504,
469
- "logps/chosen": -124.7625732421875,
470
- "logps/rejected": -126.3423080444336,
471
- "loss": 0.3747,
472
- "rewards/accuracies": 0.8999999761581421,
473
- "rewards/chosen": 12.294075012207031,
474
- "rewards/margins": 37.28684997558594,
475
- "rewards/rejected": -24.992773056030273,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
- "grad_norm": 970.0623194716495,
481
  "learning_rate": 2.0100351342479216e-07,
482
- "logits/chosen": -1.788028359413147,
483
- "logits/rejected": -1.7638485431671143,
484
- "logps/chosen": -115.53651428222656,
485
- "logps/rejected": -114.30744934082031,
486
- "loss": 0.3137,
487
  "rewards/accuracies": 0.887499988079071,
488
- "rewards/chosen": 12.6263427734375,
489
- "rewards/margins": 37.81734848022461,
490
- "rewards/rejected": -25.19100570678711,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
- "grad_norm": 817.9238785214287,
496
  "learning_rate": 1.8323929841460178e-07,
497
- "logits/chosen": -1.7443387508392334,
498
- "logits/rejected": -1.6937278509140015,
499
- "logps/chosen": -133.70358276367188,
500
- "logps/rejected": -131.15541076660156,
501
- "loss": 0.3034,
502
- "rewards/accuracies": 0.8999999761581421,
503
- "rewards/chosen": 12.736581802368164,
504
- "rewards/margins": 37.65189743041992,
505
- "rewards/rejected": -24.91531753540039,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
- "eval_logits/chosen": -1.8495898246765137,
511
- "eval_logits/rejected": -1.8524861335754395,
512
- "eval_logps/chosen": -123.59905242919922,
513
- "eval_logps/rejected": -120.27072143554688,
514
- "eval_loss": 0.2672339975833893,
515
  "eval_rewards/accuracies": 0.92578125,
516
- "eval_rewards/chosen": 14.173208236694336,
517
- "eval_rewards/margins": 41.203189849853516,
518
- "eval_rewards/rejected": -27.02998161315918,
519
- "eval_runtime": 97.7326,
520
- "eval_samples_per_second": 20.464,
521
- "eval_steps_per_second": 0.327,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
- "grad_norm": 1032.5800107949206,
527
  "learning_rate": 1.6583128063291573e-07,
528
- "logits/chosen": -1.7456867694854736,
529
- "logits/rejected": -1.781978964805603,
530
- "logps/chosen": -124.9278335571289,
531
- "logps/rejected": -121.29362487792969,
532
- "loss": 0.3116,
533
  "rewards/accuracies": 0.9375,
534
- "rewards/chosen": 14.409858703613281,
535
- "rewards/margins": 37.164405822753906,
536
- "rewards/rejected": -22.75455093383789,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
- "grad_norm": 852.7418560203273,
542
  "learning_rate": 1.488723393865766e-07,
543
- "logits/chosen": -1.773827314376831,
544
- "logits/rejected": -1.742255449295044,
545
- "logps/chosen": -114.78021240234375,
546
- "logps/rejected": -120.72232818603516,
547
- "loss": 0.307,
548
- "rewards/accuracies": 0.893750011920929,
549
- "rewards/chosen": 16.016681671142578,
550
- "rewards/margins": 40.483619689941406,
551
- "rewards/rejected": -24.466938018798828,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
- "grad_norm": 530.8053526489199,
557
  "learning_rate": 1.3245295796480788e-07,
558
- "logits/chosen": -1.7004032135009766,
559
- "logits/rejected": -1.7918256521224976,
560
- "logps/chosen": -125.2526626586914,
561
- "logps/rejected": -116.03642272949219,
562
- "loss": 0.3323,
563
- "rewards/accuracies": 0.9125000238418579,
564
- "rewards/chosen": 12.930267333984375,
565
- "rewards/margins": 35.218624114990234,
566
- "rewards/rejected": -22.28835678100586,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
- "grad_norm": 574.055283608638,
572
  "learning_rate": 1.1666074087171627e-07,
573
- "logits/chosen": -1.6694965362548828,
574
- "logits/rejected": -1.7466586828231812,
575
- "logps/chosen": -119.57568359375,
576
- "logps/rejected": -108.32354736328125,
577
- "loss": 0.4687,
578
- "rewards/accuracies": 0.875,
579
- "rewards/chosen": 10.914609909057617,
580
- "rewards/margins": 31.60161781311035,
581
- "rewards/rejected": -20.687007904052734,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
- "grad_norm": 1566.6302259536992,
587
  "learning_rate": 1.0157994641835734e-07,
588
- "logits/chosen": -1.7077308893203735,
589
- "logits/rejected": -1.7222753763198853,
590
- "logps/chosen": -119.43708801269531,
591
- "logps/rejected": -127.2352523803711,
592
- "loss": 0.2776,
593
- "rewards/accuracies": 0.918749988079071,
594
- "rewards/chosen": 10.647879600524902,
595
- "rewards/margins": 36.324337005615234,
596
- "rewards/rejected": -25.67645835876465,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
- "grad_norm": 891.7532408538094,
602
  "learning_rate": 8.729103716819111e-08,
603
- "logits/chosen": -1.6924266815185547,
604
- "logits/rejected": -1.7151873111724854,
605
- "logps/chosen": -125.8857421875,
606
- "logps/rejected": -123.62667083740234,
607
- "loss": 0.2483,
608
- "rewards/accuracies": 0.862500011920929,
609
- "rewards/chosen": 12.26460075378418,
610
- "rewards/margins": 36.57375717163086,
611
- "rewards/rejected": -24.309158325195312,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
- "grad_norm": 1484.9700115288094,
617
  "learning_rate": 7.387025063449081e-08,
618
- "logits/chosen": -1.7990468740463257,
619
- "logits/rejected": -1.7503010034561157,
620
- "logps/chosen": -115.22508239746094,
621
- "logps/rejected": -112.91387939453125,
622
- "loss": 0.3162,
623
- "rewards/accuracies": 0.893750011920929,
624
- "rewards/chosen": 11.164658546447754,
625
- "rewards/margins": 31.5644588470459,
626
- "rewards/rejected": -20.399805068969727,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
- "grad_norm": 567.3331476500654,
632
  "learning_rate": 6.138919252022435e-08,
633
- "logits/chosen": -1.8380801677703857,
634
- "logits/rejected": -1.860713005065918,
635
- "logps/chosen": -123.00642395019531,
636
- "logps/rejected": -125.6397933959961,
637
- "loss": 0.2616,
638
- "rewards/accuracies": 0.9375,
639
- "rewards/chosen": 13.647308349609375,
640
- "rewards/margins": 40.07811737060547,
641
- "rewards/rejected": -26.430805206298828,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
- "grad_norm": 1013.5375269262001,
647
  "learning_rate": 4.991445467064689e-08,
648
- "logits/chosen": -1.717441201210022,
649
- "logits/rejected": -1.7240034341812134,
650
- "logps/chosen": -118.6935806274414,
651
- "logps/rejected": -114.69548034667969,
652
- "loss": 0.2438,
653
- "rewards/accuracies": 0.956250011920929,
654
- "rewards/chosen": 13.80224609375,
655
- "rewards/margins": 38.19357681274414,
656
- "rewards/rejected": -24.391324996948242,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
- "grad_norm": 1086.7904609993896,
662
  "learning_rate": 3.9507259776993954e-08,
663
- "logits/chosen": -1.6992905139923096,
664
- "logits/rejected": -1.7782999277114868,
665
- "logps/chosen": -121.951171875,
666
- "logps/rejected": -115.44358825683594,
667
- "loss": 0.3576,
668
  "rewards/accuracies": 0.9125000238418579,
669
- "rewards/chosen": 10.810310363769531,
670
- "rewards/margins": 34.673805236816406,
671
- "rewards/rejected": -23.86349868774414,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
- "eval_logits/chosen": -1.869842529296875,
677
- "eval_logits/rejected": -1.8741588592529297,
678
- "eval_logps/chosen": -123.62108612060547,
679
- "eval_logps/rejected": -120.2706298828125,
680
- "eval_loss": 0.2510662376880646,
681
- "eval_rewards/accuracies": 0.9296875,
682
- "eval_rewards/chosen": 14.151167869567871,
683
- "eval_rewards/margins": 41.181060791015625,
684
- "eval_rewards/rejected": -27.029890060424805,
685
- "eval_runtime": 97.6822,
686
- "eval_samples_per_second": 20.475,
687
  "eval_steps_per_second": 0.328,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
- "grad_norm": 953.3240690024764,
693
  "learning_rate": 3.022313472693447e-08,
694
- "logits/chosen": -1.792865514755249,
695
- "logits/rejected": -1.8273773193359375,
696
- "logps/chosen": -132.37612915039062,
697
- "logps/rejected": -118.2215805053711,
698
- "loss": 0.3428,
699
  "rewards/accuracies": 0.925000011920929,
700
- "rewards/chosen": 14.9389009475708,
701
- "rewards/margins": 41.05461502075195,
702
- "rewards/rejected": -26.115713119506836,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
- "grad_norm": 497.3107054185315,
708
  "learning_rate": 2.2111614344599684e-08,
709
- "logits/chosen": -1.7695974111557007,
710
- "logits/rejected": -1.768599271774292,
711
- "logps/chosen": -116.97047424316406,
712
- "logps/rejected": -114.58748626708984,
713
- "loss": 0.2612,
714
- "rewards/accuracies": 0.918749988079071,
715
- "rewards/chosen": 11.339197158813477,
716
- "rewards/margins": 35.26213455200195,
717
- "rewards/rejected": -23.92293357849121,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
- "grad_norm": 978.5564078023309,
723
  "learning_rate": 1.521597710086439e-08,
724
- "logits/chosen": -1.7172062397003174,
725
- "logits/rejected": -1.747591257095337,
726
- "logps/chosen": -133.02786254882812,
727
- "logps/rejected": -114.73051452636719,
728
- "loss": 0.282,
729
- "rewards/accuracies": 0.9125000238418579,
730
- "rewards/chosen": 13.081387519836426,
731
- "rewards/margins": 34.326393127441406,
732
- "rewards/rejected": -21.245006561279297,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
- "grad_norm": 837.4554645359331,
738
  "learning_rate": 9.57301420397924e-09,
739
- "logits/chosen": -1.8500797748565674,
740
- "logits/rejected": -1.8606961965560913,
741
- "logps/chosen": -124.95164489746094,
742
- "logps/rejected": -119.78662109375,
743
- "loss": 0.2435,
744
- "rewards/accuracies": 0.9312499761581421,
745
- "rewards/chosen": 14.02147388458252,
746
- "rewards/margins": 40.40496063232422,
747
- "rewards/rejected": -26.38348388671875,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
- "grad_norm": 928.2897315645127,
753
  "learning_rate": 5.212833302556258e-09,
754
- "logits/chosen": -1.8660519123077393,
755
- "logits/rejected": -1.83090078830719,
756
- "logps/chosen": -120.15888977050781,
757
- "logps/rejected": -115.3041000366211,
758
- "loss": 0.2863,
759
- "rewards/accuracies": 0.893750011920929,
760
- "rewards/chosen": 15.148035049438477,
761
- "rewards/margins": 42.080291748046875,
762
- "rewards/rejected": -26.9322566986084,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
- "grad_norm": 623.0626054843979,
768
  "learning_rate": 2.158697848236607e-09,
769
- "logits/chosen": -1.7563292980194092,
770
- "logits/rejected": -1.7755203247070312,
771
- "logps/chosen": -124.97782897949219,
772
- "logps/rejected": -119.69969177246094,
773
- "loss": 0.2375,
774
- "rewards/accuracies": 0.9375,
775
- "rewards/chosen": 14.375999450683594,
776
- "rewards/margins": 38.44708251953125,
777
- "rewards/rejected": -24.071086883544922,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
- "grad_norm": 652.9309582790959,
783
  "learning_rate": 4.269029751107489e-10,
784
- "logits/chosen": -1.7278000116348267,
785
- "logits/rejected": -1.7499659061431885,
786
- "logps/chosen": -118.06231689453125,
787
- "logps/rejected": -129.90151977539062,
788
- "loss": 0.2885,
789
- "rewards/accuracies": 0.8999999761581421,
790
- "rewards/chosen": 11.649320602416992,
791
- "rewards/margins": 37.77583312988281,
792
- "rewards/rejected": -26.126514434814453,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
- "train_loss": 0.2831452648509995,
800
- "train_runtime": 7636.09,
801
- "train_samples_per_second": 8.006,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 1316.2617480695828,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -1.689455509185791,
16
  "logits/rejected": -1.4794573783874512,
 
25
  },
26
  {
27
  "epoch": 0.02,
28
+ "grad_norm": 1084.7724692148897,
29
  "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -1.707624912261963,
31
+ "logits/rejected": -1.6101186275482178,
32
+ "logps/chosen": -139.66224670410156,
33
+ "logps/rejected": -91.32621002197266,
34
+ "loss": 0.686,
35
+ "rewards/accuracies": 0.5,
36
+ "rewards/chosen": 0.01818913221359253,
37
+ "rewards/margins": 0.027222516015172005,
38
+ "rewards/rejected": -0.009033381938934326,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04,
43
+ "grad_norm": 372.5890585979663,
44
  "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -1.6384038925170898,
46
+ "logits/rejected": -1.6487312316894531,
47
+ "logps/chosen": -130.37515258789062,
48
+ "logps/rejected": -93.99095153808594,
49
+ "loss": 0.4495,
50
+ "rewards/accuracies": 0.84375,
51
+ "rewards/chosen": 0.6898423433303833,
52
+ "rewards/margins": 0.9020320177078247,
53
+ "rewards/rejected": -0.21218962967395782,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06,
58
+ "grad_norm": 363.4016752055454,
59
  "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -1.6896642446517944,
61
+ "logits/rejected": -1.6273959875106812,
62
+ "logps/chosen": -130.80935668945312,
63
+ "logps/rejected": -106.37054443359375,
64
+ "loss": 0.2556,
65
+ "rewards/accuracies": 0.9437500238418579,
66
+ "rewards/chosen": 2.216484785079956,
67
+ "rewards/margins": 3.602745771408081,
68
+ "rewards/rejected": -1.386260986328125,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.08,
73
+ "grad_norm": 254.07955635631413,
74
  "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -1.6251739263534546,
76
+ "logits/rejected": -1.5512189865112305,
77
+ "logps/chosen": -142.7510528564453,
78
+ "logps/rejected": -113.31219482421875,
79
+ "loss": 0.1925,
80
  "rewards/accuracies": 0.925000011920929,
81
+ "rewards/chosen": 2.6909842491149902,
82
+ "rewards/margins": 7.113295078277588,
83
+ "rewards/rejected": -4.422310829162598,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.1,
88
+ "grad_norm": 450.1083647442556,
89
  "learning_rate": 4.999733114418725e-07,
90
+ "logits/chosen": -1.5827839374542236,
91
+ "logits/rejected": -1.6095731258392334,
92
+ "logps/chosen": -127.47169494628906,
93
+ "logps/rejected": -124.7472152709961,
94
+ "loss": 0.187,
95
+ "rewards/accuracies": 0.8687499761581421,
96
+ "rewards/chosen": 2.2808213233947754,
97
+ "rewards/margins": 9.9701509475708,
98
+ "rewards/rejected": -7.6893310546875,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.13,
103
+ "grad_norm": 256.12553340364644,
104
  "learning_rate": 4.990398100856366e-07,
105
+ "logits/chosen": -1.6854372024536133,
106
+ "logits/rejected": -1.6347172260284424,
107
+ "logps/chosen": -143.6377410888672,
108
+ "logps/rejected": -138.22506713867188,
109
+ "loss": 0.1745,
110
+ "rewards/accuracies": 0.9437500238418579,
111
+ "rewards/chosen": 2.208740711212158,
112
+ "rewards/margins": 13.148699760437012,
113
+ "rewards/rejected": -10.939959526062012,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.15,
118
+ "grad_norm": 266.6931182384945,
119
  "learning_rate": 4.967775735898179e-07,
120
+ "logits/chosen": -1.6533622741699219,
121
+ "logits/rejected": -1.6828343868255615,
122
+ "logps/chosen": -136.77774047851562,
123
+ "logps/rejected": -139.76748657226562,
124
+ "loss": 0.1719,
125
+ "rewards/accuracies": 0.9125000238418579,
126
+ "rewards/chosen": 2.496314525604248,
127
+ "rewards/margins": 15.036949157714844,
128
+ "rewards/rejected": -12.540634155273438,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.17,
133
+ "grad_norm": 311.4093156505733,
134
  "learning_rate": 4.931986719649298e-07,
135
+ "logits/chosen": -1.6512008905410767,
136
+ "logits/rejected": -1.642218828201294,
137
+ "logps/chosen": -129.7277374267578,
138
+ "logps/rejected": -130.1470489501953,
139
+ "loss": 0.1638,
140
  "rewards/accuracies": 0.918749988079071,
141
+ "rewards/chosen": 3.2196338176727295,
142
+ "rewards/margins": 14.145663261413574,
143
+ "rewards/rejected": -10.926031112670898,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.19,
148
+ "grad_norm": 353.08827705610673,
149
  "learning_rate": 4.883222001996351e-07,
150
+ "logits/chosen": -1.7057621479034424,
151
+ "logits/rejected": -1.7253615856170654,
152
+ "logps/chosen": -136.57843017578125,
153
+ "logps/rejected": -141.9429931640625,
154
+ "loss": 0.1501,
155
+ "rewards/accuracies": 0.918749988079071,
156
+ "rewards/chosen": 3.7895398139953613,
157
+ "rewards/margins": 15.804595947265625,
158
+ "rewards/rejected": -12.015056610107422,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.21,
163
+ "grad_norm": 173.53718581228586,
164
  "learning_rate": 4.821741763807186e-07,
165
+ "logits/chosen": -1.7514231204986572,
166
+ "logits/rejected": -1.7408148050308228,
167
+ "logps/chosen": -116.7274169921875,
168
+ "logps/rejected": -125.6600570678711,
169
+ "loss": 0.1496,
170
+ "rewards/accuracies": 0.90625,
171
+ "rewards/chosen": 3.375108242034912,
172
+ "rewards/margins": 13.876774787902832,
173
+ "rewards/rejected": -10.501666069030762,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.21,
178
+ "eval_logits/chosen": -1.8009788990020752,
179
+ "eval_logits/rejected": -1.790999174118042,
180
+ "eval_logps/chosen": -123.99901580810547,
181
+ "eval_logps/rejected": -130.8439178466797,
182
+ "eval_loss": 0.13556738197803497,
183
+ "eval_rewards/accuracies": 0.94140625,
184
+ "eval_rewards/chosen": 4.131972789764404,
185
+ "eval_rewards/margins": 15.41292953491211,
186
+ "eval_rewards/rejected": -11.280956268310547,
187
+ "eval_runtime": 97.6442,
188
+ "eval_samples_per_second": 20.483,
189
  "eval_steps_per_second": 0.328,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.23,
194
+ "grad_norm": 220.3790872760113,
195
  "learning_rate": 4.747874028753375e-07,
196
+ "logits/chosen": -1.6608690023422241,
197
+ "logits/rejected": -1.7199954986572266,
198
+ "logps/chosen": -120.6917724609375,
199
+ "logps/rejected": -133.25762939453125,
200
+ "loss": 0.1546,
201
+ "rewards/accuracies": 0.9375,
202
+ "rewards/chosen": 3.4064812660217285,
203
+ "rewards/margins": 12.731483459472656,
204
+ "rewards/rejected": -9.325002670288086,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.25,
209
+ "grad_norm": 179.12119430014053,
210
  "learning_rate": 4.662012913161997e-07,
211
+ "logits/chosen": -1.7208821773529053,
212
+ "logits/rejected": -1.7148889303207397,
213
+ "logps/chosen": -118.548583984375,
214
+ "logps/rejected": -133.46463012695312,
215
+ "loss": 0.1456,
216
+ "rewards/accuracies": 0.875,
217
+ "rewards/chosen": 4.286909580230713,
218
+ "rewards/margins": 13.92347240447998,
219
+ "rewards/rejected": -9.636563301086426,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.27,
224
+ "grad_norm": 119.5811123757762,
225
  "learning_rate": 4.5646165232345103e-07,
226
+ "logits/chosen": -1.688997507095337,
227
+ "logits/rejected": -1.7153244018554688,
228
+ "logps/chosen": -124.4625244140625,
229
+ "logps/rejected": -129.4587860107422,
230
+ "loss": 0.1477,
231
+ "rewards/accuracies": 0.918749988079071,
232
+ "rewards/chosen": 4.8756022453308105,
233
+ "rewards/margins": 15.727206230163574,
234
+ "rewards/rejected": -10.851605415344238,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.29,
239
+ "grad_norm": 293.67492501764275,
240
  "learning_rate": 4.456204510851956e-07,
241
+ "logits/chosen": -1.6008217334747314,
242
+ "logits/rejected": -1.5764399766921997,
243
+ "logps/chosen": -121.84040832519531,
244
+ "logps/rejected": -125.70499420166016,
245
+ "loss": 0.1534,
246
+ "rewards/accuracies": 0.90625,
247
+ "rewards/chosen": 4.718628406524658,
248
+ "rewards/margins": 14.510149955749512,
249
+ "rewards/rejected": -9.791521072387695,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.31,
254
+ "grad_norm": 250.47040288737202,
255
  "learning_rate": 4.337355301007335e-07,
256
+ "logits/chosen": -1.7394781112670898,
257
+ "logits/rejected": -1.7636123895645142,
258
+ "logps/chosen": -119.86863708496094,
259
+ "logps/rejected": -122.49459075927734,
260
+ "loss": 0.1403,
261
+ "rewards/accuracies": 0.8999999761581421,
262
+ "rewards/chosen": 4.755049705505371,
263
+ "rewards/margins": 14.378946304321289,
264
+ "rewards/rejected": -9.623896598815918,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.33,
269
+ "grad_norm": 394.54185792168863,
270
  "learning_rate": 4.2087030056579986e-07,
271
+ "logits/chosen": -1.6095482110977173,
272
+ "logits/rejected": -1.5893223285675049,
273
+ "logps/chosen": -127.0114517211914,
274
+ "logps/rejected": -128.70870971679688,
275
+ "loss": 0.1577,
276
+ "rewards/accuracies": 0.925000011920929,
277
+ "rewards/chosen": 4.486311435699463,
278
+ "rewards/margins": 13.415287971496582,
279
+ "rewards/rejected": -8.928976058959961,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.36,
284
+ "grad_norm": 336.28954028277553,
285
  "learning_rate": 4.070934040463998e-07,
286
+ "logits/chosen": -1.7846260070800781,
287
+ "logits/rejected": -1.7679131031036377,
288
+ "logps/chosen": -121.9030990600586,
289
+ "logps/rejected": -130.14794921875,
290
+ "loss": 0.1516,
291
+ "rewards/accuracies": 0.9312499761581421,
292
+ "rewards/chosen": 5.171528339385986,
293
+ "rewards/margins": 15.514431953430176,
294
+ "rewards/rejected": -10.342904090881348,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.38,
299
+ "grad_norm": 266.7294156199543,
300
  "learning_rate": 3.9247834624635404e-07,
301
+ "logits/chosen": -1.7311102151870728,
302
+ "logits/rejected": -1.725064992904663,
303
+ "logps/chosen": -125.84059143066406,
304
+ "logps/rejected": -125.44111633300781,
305
+ "loss": 0.1616,
306
+ "rewards/accuracies": 0.90625,
307
+ "rewards/chosen": 5.19378137588501,
308
+ "rewards/margins": 15.565200805664062,
309
+ "rewards/rejected": -10.371419906616211,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.4,
314
+ "grad_norm": 243.44642169675112,
315
  "learning_rate": 3.7710310482256523e-07,
316
+ "logits/chosen": -1.764722466468811,
317
+ "logits/rejected": -1.7476263046264648,
318
+ "logps/chosen": -114.77116394042969,
319
+ "logps/rejected": -139.33917236328125,
320
+ "loss": 0.1684,
321
+ "rewards/accuracies": 0.887499988079071,
322
+ "rewards/chosen": 5.108515739440918,
323
+ "rewards/margins": 16.28169822692871,
324
+ "rewards/rejected": -11.173181533813477,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.42,
329
+ "grad_norm": 258.83830985573707,
330
  "learning_rate": 3.610497133404795e-07,
331
+ "logits/chosen": -1.7498699426651,
332
+ "logits/rejected": -1.7545902729034424,
333
+ "logps/chosen": -120.42120361328125,
334
+ "logps/rejected": -123.0103988647461,
335
+ "loss": 0.1795,
336
  "rewards/accuracies": 0.9375,
337
+ "rewards/chosen": 5.614555835723877,
338
+ "rewards/margins": 16.115243911743164,
339
+ "rewards/rejected": -10.500688552856445,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.42,
344
+ "eval_logits/chosen": -1.86138117313385,
345
+ "eval_logits/rejected": -1.8606913089752197,
346
+ "eval_logps/chosen": -120.21395111083984,
347
+ "eval_logps/rejected": -130.0475616455078,
348
+ "eval_loss": 0.13641399145126343,
349
+ "eval_rewards/accuracies": 0.93359375,
350
+ "eval_rewards/chosen": 5.267488479614258,
351
+ "eval_rewards/margins": 16.30953025817871,
352
+ "eval_rewards/rejected": -11.042043685913086,
353
+ "eval_runtime": 97.5652,
354
+ "eval_samples_per_second": 20.499,
355
  "eval_steps_per_second": 0.328,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.44,
360
+ "grad_norm": 258.76531038458563,
361
  "learning_rate": 3.4440382358952115e-07,
362
+ "logits/chosen": -1.689432144165039,
363
+ "logits/rejected": -1.632364273071289,
364
+ "logps/chosen": -117.7408447265625,
365
+ "logps/rejected": -117.53926086425781,
366
+ "loss": 0.1494,
367
+ "rewards/accuracies": 0.887499988079071,
368
+ "rewards/chosen": 4.45497989654541,
369
+ "rewards/margins": 12.836206436157227,
370
+ "rewards/rejected": -8.381224632263184,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.46,
375
+ "grad_norm": 163.97448768931915,
376
  "learning_rate": 3.272542485937368e-07,
377
+ "logits/chosen": -1.8162240982055664,
378
+ "logits/rejected": -1.8604834079742432,
379
+ "logps/chosen": -118.64457702636719,
380
+ "logps/rejected": -118.34297180175781,
381
+ "loss": 0.1524,
382
+ "rewards/accuracies": 0.925000011920929,
383
+ "rewards/chosen": 5.34921407699585,
384
+ "rewards/margins": 14.593510627746582,
385
+ "rewards/rejected": -9.244296073913574,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.48,
390
+ "grad_norm": 122.96583033580265,
391
  "learning_rate": 3.096924887558854e-07,
392
+ "logits/chosen": -1.7572132349014282,
393
+ "logits/rejected": -1.708581566810608,
394
+ "logps/chosen": -131.16842651367188,
395
+ "logps/rejected": -137.84829711914062,
396
+ "loss": 0.1815,
397
+ "rewards/accuracies": 0.9437500238418579,
398
+ "rewards/chosen": 4.962122440338135,
399
+ "rewards/margins": 16.372663497924805,
400
+ "rewards/rejected": -11.410540580749512,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.5,
405
+ "grad_norm": 300.9054437339817,
406
  "learning_rate": 2.9181224366319943e-07,
407
+ "logits/chosen": -1.768376111984253,
408
+ "logits/rejected": -1.7853620052337646,
409
+ "logps/chosen": -118.08064270019531,
410
+ "logps/rejected": -129.52337646484375,
411
+ "loss": 0.1638,
412
+ "rewards/accuracies": 0.8999999761581421,
413
+ "rewards/chosen": 4.15334415435791,
414
+ "rewards/margins": 13.721521377563477,
415
+ "rewards/rejected": -9.568175315856934,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.52,
420
+ "grad_norm": 279.7087004166346,
421
  "learning_rate": 2.7370891215954565e-07,
422
+ "logits/chosen": -1.7569881677627563,
423
+ "logits/rejected": -1.7871322631835938,
424
+ "logps/chosen": -121.43367767333984,
425
+ "logps/rejected": -130.93099975585938,
426
+ "loss": 0.1737,
427
+ "rewards/accuracies": 0.9312499761581421,
428
+ "rewards/chosen": 4.7944416999816895,
429
+ "rewards/margins": 14.978727340698242,
430
+ "rewards/rejected": -10.184286117553711,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.54,
435
+ "grad_norm": 163.84527157544522,
436
  "learning_rate": 2.55479083351317e-07,
437
+ "logits/chosen": -1.7968614101409912,
438
+ "logits/rejected": -1.8222767114639282,
439
+ "logps/chosen": -125.540283203125,
440
+ "logps/rejected": -119.97703552246094,
441
+ "loss": 0.1484,
442
+ "rewards/accuracies": 0.90625,
443
+ "rewards/chosen": 5.486014366149902,
444
+ "rewards/margins": 16.016239166259766,
445
+ "rewards/rejected": -10.530224800109863,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.56,
450
+ "grad_norm": 150.71761898213634,
451
  "learning_rate": 2.3722002126275822e-07,
452
+ "logits/chosen": -1.787398099899292,
453
+ "logits/rejected": -1.723350167274475,
454
+ "logps/chosen": -115.22535705566406,
455
+ "logps/rejected": -123.31414794921875,
456
+ "loss": 0.139,
457
+ "rewards/accuracies": 0.90625,
458
+ "rewards/chosen": 5.049933433532715,
459
+ "rewards/margins": 14.674034118652344,
460
+ "rewards/rejected": -9.624099731445312,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.59,
465
+ "grad_norm": 267.05255152770263,
466
  "learning_rate": 2.19029145890313e-07,
467
+ "logits/chosen": -1.6403396129608154,
468
+ "logits/rejected": -1.728877067565918,
469
+ "logps/chosen": -121.6760482788086,
470
+ "logps/rejected": -137.2602996826172,
471
+ "loss": 0.1837,
472
+ "rewards/accuracies": 0.918749988079071,
473
+ "rewards/chosen": 4.614184379577637,
474
+ "rewards/margins": 15.387414932250977,
475
+ "rewards/rejected": -10.773229598999023,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.61,
480
+ "grad_norm": 355.7773679008215,
481
  "learning_rate": 2.0100351342479216e-07,
482
+ "logits/chosen": -1.7196556329727173,
483
+ "logits/rejected": -1.7061948776245117,
484
+ "logps/chosen": -112.95915222167969,
485
+ "logps/rejected": -124.29833984375,
486
+ "loss": 0.1648,
487
  "rewards/accuracies": 0.887499988079071,
488
+ "rewards/chosen": 4.5611090660095215,
489
+ "rewards/margins": 15.115681648254395,
490
+ "rewards/rejected": -10.554571151733398,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.63,
495
+ "grad_norm": 387.9028451533635,
496
  "learning_rate": 1.8323929841460178e-07,
497
+ "logits/chosen": -1.6907150745391846,
498
+ "logits/rejected": -1.6335220336914062,
499
+ "logps/chosen": -130.05313110351562,
500
+ "logps/rejected": -141.36476135253906,
501
+ "loss": 0.1585,
502
+ "rewards/accuracies": 0.9312499761581421,
503
+ "rewards/chosen": 4.916111946105957,
504
+ "rewards/margins": 15.453518867492676,
505
+ "rewards/rejected": -10.537405967712402,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.63,
510
+ "eval_logits/chosen": -1.7980220317840576,
511
+ "eval_logits/rejected": -1.7959610223770142,
512
+ "eval_logps/chosen": -120.6431655883789,
513
+ "eval_logps/rejected": -132.25042724609375,
514
+ "eval_loss": 0.14247241616249084,
515
  "eval_rewards/accuracies": 0.92578125,
516
+ "eval_rewards/chosen": 5.138728141784668,
517
+ "eval_rewards/margins": 16.841632843017578,
518
+ "eval_rewards/rejected": -11.702906608581543,
519
+ "eval_runtime": 97.7011,
520
+ "eval_samples_per_second": 20.471,
521
+ "eval_steps_per_second": 0.328,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.65,
526
+ "grad_norm": 308.7683125090126,
527
  "learning_rate": 1.6583128063291573e-07,
528
+ "logits/chosen": -1.6941922903060913,
529
+ "logits/rejected": -1.7201515436172485,
530
+ "logps/chosen": -121.898193359375,
531
+ "logps/rejected": -132.00962829589844,
532
+ "loss": 0.1619,
533
  "rewards/accuracies": 0.9375,
534
+ "rewards/chosen": 5.231846809387207,
535
+ "rewards/margins": 15.2730131149292,
536
+ "rewards/rejected": -10.041168212890625,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.67,
541
+ "grad_norm": 208.88097429038612,
542
  "learning_rate": 1.488723393865766e-07,
543
+ "logits/chosen": -1.7199640274047852,
544
+ "logits/rejected": -1.6923316717147827,
545
+ "logps/chosen": -111.40040588378906,
546
+ "logps/rejected": -131.6649627685547,
547
+ "loss": 0.154,
548
+ "rewards/accuracies": 0.8812500238418579,
549
+ "rewards/chosen": 5.818942070007324,
550
+ "rewards/margins": 16.441822052001953,
551
+ "rewards/rejected": -10.62287712097168,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.69,
556
+ "grad_norm": 167.39144709959334,
557
  "learning_rate": 1.3245295796480788e-07,
558
+ "logits/chosen": -1.65103018283844,
559
+ "logits/rejected": -1.7277615070343018,
560
+ "logps/chosen": -121.6876449584961,
561
+ "logps/rejected": -127.0235366821289,
562
+ "loss": 0.1507,
563
+ "rewards/accuracies": 0.90625,
564
+ "rewards/chosen": 4.948590278625488,
565
+ "rewards/margins": 14.931228637695312,
566
+ "rewards/rejected": -9.982640266418457,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.71,
571
+ "grad_norm": 156.90849982717606,
572
  "learning_rate": 1.1666074087171627e-07,
573
+ "logits/chosen": -1.602086067199707,
574
+ "logits/rejected": -1.6703577041625977,
575
+ "logps/chosen": -115.71211242675781,
576
+ "logps/rejected": -116.56599426269531,
577
+ "loss": 0.1908,
578
+ "rewards/accuracies": 0.8812500238418579,
579
+ "rewards/chosen": 4.433460235595703,
580
+ "rewards/margins": 13.112295150756836,
581
+ "rewards/rejected": -8.678834915161133,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.73,
586
+ "grad_norm": 295.768622683131,
587
  "learning_rate": 1.0157994641835734e-07,
588
+ "logits/chosen": -1.6467043161392212,
589
+ "logits/rejected": -1.659854531288147,
590
+ "logps/chosen": -115.77754974365234,
591
+ "logps/rejected": -137.26531982421875,
592
+ "loss": 0.1326,
593
+ "rewards/accuracies": 0.9375,
594
+ "rewards/chosen": 4.292226314544678,
595
+ "rewards/margins": 15.004185676574707,
596
+ "rewards/rejected": -10.711957931518555,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.75,
601
+ "grad_norm": 201.50715466732544,
602
  "learning_rate": 8.729103716819111e-08,
603
+ "logits/chosen": -1.6420570611953735,
604
+ "logits/rejected": -1.663745641708374,
605
+ "logps/chosen": -122.47066497802734,
606
+ "logps/rejected": -131.9073486328125,
607
+ "loss": 0.1543,
608
+ "rewards/accuracies": 0.8687499761581421,
609
+ "rewards/chosen": 4.703906059265137,
610
+ "rewards/margins": 14.480855941772461,
611
+ "rewards/rejected": -9.776951789855957,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.77,
616
+ "grad_norm": 398.4959635020424,
617
  "learning_rate": 7.387025063449081e-08,
618
+ "logits/chosen": -1.7505977153778076,
619
+ "logits/rejected": -1.6930338144302368,
620
+ "logps/chosen": -112.33251953125,
621
+ "logps/rejected": -121.57230377197266,
622
+ "loss": 0.1443,
623
+ "rewards/accuracies": 0.9437500238418579,
624
+ "rewards/chosen": 4.217165470123291,
625
+ "rewards/margins": 12.93463134765625,
626
+ "rewards/rejected": -8.7174654006958,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.79,
631
+ "grad_norm": 239.63863129657855,
632
  "learning_rate": 6.138919252022435e-08,
633
+ "logits/chosen": -1.7603282928466797,
634
+ "logits/rejected": -1.7833961248397827,
635
+ "logps/chosen": -119.14958190917969,
636
+ "logps/rejected": -136.85977172851562,
637
+ "loss": 0.1576,
638
+ "rewards/accuracies": 0.949999988079071,
639
+ "rewards/chosen": 5.251246452331543,
640
+ "rewards/margins": 16.546478271484375,
641
+ "rewards/rejected": -11.295232772827148,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.82,
646
+ "grad_norm": 197.58750581294214,
647
  "learning_rate": 4.991445467064689e-08,
648
+ "logits/chosen": -1.656961441040039,
649
+ "logits/rejected": -1.6608669757843018,
650
+ "logps/chosen": -115.59519958496094,
651
+ "logps/rejected": -124.886474609375,
652
+ "loss": 0.1242,
653
+ "rewards/accuracies": 0.9624999761581421,
654
+ "rewards/chosen": 5.070186614990234,
655
+ "rewards/margins": 15.444877624511719,
656
+ "rewards/rejected": -10.3746919631958,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.84,
661
+ "grad_norm": 266.2162184394133,
662
  "learning_rate": 3.9507259776993954e-08,
663
+ "logits/chosen": -1.636885643005371,
664
+ "logits/rejected": -1.713200330734253,
665
+ "logps/chosen": -117.88565826416016,
666
+ "logps/rejected": -124.86863708496094,
667
+ "loss": 0.2005,
668
  "rewards/accuracies": 0.9125000238418579,
669
+ "rewards/chosen": 4.46274471282959,
670
+ "rewards/margins": 14.449310302734375,
671
+ "rewards/rejected": -9.986566543579102,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.84,
676
+ "eval_logits/chosen": -1.8133598566055298,
677
+ "eval_logits/rejected": -1.8130455017089844,
678
+ "eval_logps/chosen": -119.97606658935547,
679
+ "eval_logps/rejected": -131.71229553222656,
680
+ "eval_loss": 0.14202240109443665,
681
+ "eval_rewards/accuracies": 0.92578125,
682
+ "eval_rewards/chosen": 5.338851451873779,
683
+ "eval_rewards/margins": 16.880319595336914,
684
+ "eval_rewards/rejected": -11.541468620300293,
685
+ "eval_runtime": 97.6019,
686
+ "eval_samples_per_second": 20.491,
687
  "eval_steps_per_second": 0.328,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.86,
692
+ "grad_norm": 300.6588760835356,
693
  "learning_rate": 3.022313472693447e-08,
694
+ "logits/chosen": -1.735870599746704,
695
+ "logits/rejected": -1.7638452053070068,
696
+ "logps/chosen": -128.670166015625,
697
+ "logps/rejected": -129.5255126953125,
698
+ "loss": 0.1508,
699
  "rewards/accuracies": 0.925000011920929,
700
+ "rewards/chosen": 5.593460559844971,
701
+ "rewards/margins": 16.819358825683594,
702
+ "rewards/rejected": -11.225897789001465,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.88,
707
+ "grad_norm": 184.26802651594184,
708
  "learning_rate": 2.2111614344599684e-08,
709
+ "logits/chosen": -1.7033697366714478,
710
+ "logits/rejected": -1.704993486404419,
711
+ "logps/chosen": -113.72819519042969,
712
+ "logps/rejected": -124.34715270996094,
713
+ "loss": 0.1419,
714
+ "rewards/accuracies": 0.9125000238418579,
715
+ "rewards/chosen": 4.3744401931762695,
716
+ "rewards/margins": 14.479223251342773,
717
+ "rewards/rejected": -10.104782104492188,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.9,
722
+ "grad_norm": 269.33105520831003,
723
  "learning_rate": 1.521597710086439e-08,
724
+ "logits/chosen": -1.6633046865463257,
725
+ "logits/rejected": -1.6779390573501587,
726
+ "logps/chosen": -129.4523162841797,
727
+ "logps/rejected": -124.72953796386719,
728
+ "loss": 0.154,
729
+ "rewards/accuracies": 0.918749988079071,
730
+ "rewards/chosen": 4.997079372406006,
731
+ "rewards/margins": 14.370283126831055,
732
+ "rewards/rejected": -9.373201370239258,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.92,
737
+ "grad_norm": 181.5022378731684,
738
  "learning_rate": 9.57301420397924e-09,
739
+ "logits/chosen": -1.7851531505584717,
740
+ "logits/rejected": -1.790157675743103,
741
+ "logps/chosen": -120.8554458618164,
742
+ "logps/rejected": -131.34410095214844,
743
+ "loss": 0.144,
744
+ "rewards/accuracies": 0.96875,
745
+ "rewards/chosen": 5.435299873352051,
746
+ "rewards/margins": 16.81759262084961,
747
+ "rewards/rejected": -11.382290840148926,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.94,
752
+ "grad_norm": 227.09412853715097,
753
  "learning_rate": 5.212833302556258e-09,
754
+ "logits/chosen": -1.7998348474502563,
755
+ "logits/rejected": -1.7618439197540283,
756
+ "logps/chosen": -116.46590423583984,
757
+ "logps/rejected": -127.42796325683594,
758
+ "loss": 0.159,
759
+ "rewards/accuracies": 0.8999999761581421,
760
+ "rewards/chosen": 5.652300834655762,
761
+ "rewards/margins": 17.369140625,
762
+ "rewards/rejected": -11.716839790344238,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.96,
767
+ "grad_norm": 225.69247261851913,
768
  "learning_rate": 2.158697848236607e-09,
769
+ "logits/chosen": -1.6827681064605713,
770
+ "logits/rejected": -1.7046699523925781,
771
+ "logps/chosen": -120.79095458984375,
772
+ "logps/rejected": -130.98043823242188,
773
+ "loss": 0.1335,
774
+ "rewards/accuracies": 0.925000011920929,
775
+ "rewards/chosen": 5.568859100341797,
776
+ "rewards/margins": 16.174407958984375,
777
+ "rewards/rejected": -10.605547904968262,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.98,
782
+ "grad_norm": 199.48903785359678,
783
  "learning_rate": 4.269029751107489e-10,
784
+ "logits/chosen": -1.6656444072723389,
785
+ "logits/rejected": -1.6796722412109375,
786
+ "logps/chosen": -114.7574462890625,
787
+ "logps/rejected": -139.92117309570312,
788
+ "loss": 0.1514,
789
+ "rewards/accuracies": 0.90625,
790
+ "rewards/chosen": 4.486257553100586,
791
+ "rewards/margins": 15.330111503601074,
792
+ "rewards/rejected": -10.843853950500488,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 1.0,
797
  "step": 478,
798
  "total_flos": 0.0,
799
+ "train_loss": 0.17749474911510196,
800
+ "train_runtime": 7645.2484,
801
+ "train_samples_per_second": 7.996,
802
  "train_steps_per_second": 0.063
803
  }
804
  ],