RikkiXu commited on
Commit
a36d8e7
1 Parent(s): 4ddafe0

Model save

Browse files
README.md CHANGED
@@ -32,7 +32,7 @@ More information needed
32
  ### Training hyperparameters
33
 
34
  The following hyperparameters were used during training:
35
- - learning_rate: 5e-09
36
  - train_batch_size: 4
37
  - eval_batch_size: 4
38
  - seed: 42
@@ -52,7 +52,7 @@ The following hyperparameters were used during training:
52
 
53
  ### Framework versions
54
 
55
- - Transformers 4.41.1
56
  - Pytorch 2.1.2+cu118
57
  - Datasets 2.16.1
58
- - Tokenizers 0.19.1
 
32
  ### Training hyperparameters
33
 
34
  The following hyperparameters were used during training:
35
+ - learning_rate: 1e-09
36
  - train_batch_size: 4
37
  - eval_batch_size: 4
38
  - seed: 42
 
52
 
53
  ### Framework versions
54
 
55
+ - Transformers 4.39.3
56
  - Pytorch 2.1.2+cu118
57
  - Datasets 2.16.1
58
+ - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,9 +1,8 @@
1
  {
2
- "epoch": 0.9990783410138249,
3
- "total_flos": 0.0,
4
- "train_loss": 0.7834695007968213,
5
- "train_runtime": 9239.3637,
6
- "train_samples": 69410,
7
- "train_samples_per_second": 7.512,
8
- "train_steps_per_second": 0.059
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.7340851113084674,
4
+ "train_runtime": 8012.0764,
5
+ "train_samples": 66084,
6
+ "train_samples_per_second": 8.248,
7
+ "train_steps_per_second": 0.064
 
8
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.41.1"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.39.3"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c9c59d81358b922cc7dc4a0f1212ed3989092cc463daa04e06aade722a12a55
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98b359f2ed439c25068388b47040cdb10b77f5f5fa53649105ea9d05368882f0
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ad6c951a013cca13cc4d3888f5fd9cebd9dbd9d0db94933753fda8a549ad3f8
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3fff5c90a9f6a4b054b84b812f4f3aef8e56cc79b45cf52084d4bd2d0be53fc
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:432431dead4b9b1064f55e868128f357e49949a9557abbb160b8a3320007e2bd
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93c1614085b19e11689c215dea89f11cb83c8457a6e6cc90cb2d26115dc084f
3
  size 4540516344
runs/Jul07_23-56-32_n136-100-194/events.out.tfevents.1720368281.n136-100-194.1414729.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:441817b12b3830eb99b45ae5ebc5129b9e7a0311347eafc843945004a5dffb69
3
- size 39904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d166b3928c1bc2acc55a3c2e2a9d6496526373bf273f5d85b4f36b96b6565889
3
+ size 40946
train_results.json CHANGED
@@ -1,9 +1,8 @@
1
  {
2
- "epoch": 0.9990783410138249,
3
- "total_flos": 0.0,
4
- "train_loss": 0.7834695007968213,
5
- "train_runtime": 9239.3637,
6
- "train_samples": 69410,
7
- "train_samples_per_second": 7.512,
8
- "train_steps_per_second": 0.059
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.7340851113084674,
4
+ "train_runtime": 8012.0764,
5
+ "train_samples": 66084,
6
+ "train_samples_per_second": 8.248,
7
+ "train_steps_per_second": 0.064
 
8
  }
trainer_state.json CHANGED
@@ -1,22 +1,22 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9990783410138249,
5
  "eval_steps": 10000000,
6
- "global_step": 542,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0018433179723502304,
13
- "grad_norm": 3898.482747931532,
14
- "learning_rate": 9.090909090909091e-11,
15
- "logits/chosen": -1.6609081029891968,
16
- "logits/rejected": -1.6088519096374512,
17
- "logps/chosen": -0.9401239156723022,
18
- "logps/rejected": -0.9000049829483032,
19
- "loss": 0.913,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
@@ -24,842 +24,785 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.018433179723502304,
28
- "grad_norm": 4090.8547186973697,
29
- "learning_rate": 9.090909090909091e-10,
30
- "logits/chosen": -1.655083417892456,
31
- "logits/rejected": -1.5546139478683472,
32
- "logps/chosen": -0.9712722301483154,
33
- "logps/rejected": -0.9472112655639648,
34
- "loss": 0.9926,
35
- "rewards/accuracies": 0.3472222089767456,
36
- "rewards/chosen": 0.004639791324734688,
37
- "rewards/margins": -0.15443584322929382,
38
- "rewards/rejected": 0.15907563269138336,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.03686635944700461,
43
- "grad_norm": 5249.87937646366,
44
- "learning_rate": 1.8181818181818182e-09,
45
- "logits/chosen": -1.6007907390594482,
46
- "logits/rejected": -1.5696344375610352,
47
- "logps/chosen": -1.0197725296020508,
48
- "logps/rejected": -0.950878918170929,
49
- "loss": 0.9901,
50
- "rewards/accuracies": 0.581250011920929,
51
- "rewards/chosen": 0.06652222573757172,
52
- "rewards/margins": 0.1879386305809021,
53
- "rewards/rejected": -0.12141638994216919,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.055299539170506916,
58
- "grad_norm": 3473.4229634351905,
59
- "learning_rate": 2.727272727272727e-09,
60
- "logits/chosen": -1.6087005138397217,
61
- "logits/rejected": -1.5368092060089111,
62
- "logps/chosen": -0.9995329976081848,
63
- "logps/rejected": -0.999841570854187,
64
- "loss": 1.0333,
65
- "rewards/accuracies": 0.6000000238418579,
66
- "rewards/chosen": 0.08179456740617752,
67
- "rewards/margins": -0.014446260407567024,
68
- "rewards/rejected": 0.09624083340167999,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.07373271889400922,
73
- "grad_norm": 4366.816381959647,
74
- "learning_rate": 3.6363636363636364e-09,
75
- "logits/chosen": -1.6297199726104736,
76
- "logits/rejected": -1.527199625968933,
77
- "logps/chosen": -0.9844824075698853,
78
- "logps/rejected": -0.9811803102493286,
79
- "loss": 1.0512,
80
- "rewards/accuracies": 0.45625001192092896,
81
- "rewards/chosen": 0.014942830428481102,
82
- "rewards/margins": -0.04575050622224808,
83
- "rewards/rejected": 0.06069333478808403,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.09216589861751152,
88
- "grad_norm": 4310.555577886656,
89
- "learning_rate": 4.545454545454545e-09,
90
- "logits/chosen": -1.5640833377838135,
91
- "logits/rejected": -1.491424322128296,
92
- "logps/chosen": -0.979416012763977,
93
- "logps/rejected": -0.8847635984420776,
94
- "loss": 1.0104,
95
- "rewards/accuracies": 0.46875,
96
- "rewards/chosen": -0.07140497118234634,
97
- "rewards/margins": -0.12020325660705566,
98
- "rewards/rejected": 0.04879828169941902,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.11059907834101383,
103
- "grad_norm": 3816.3756252973885,
104
- "learning_rate": 4.99869966817273e-09,
105
- "logits/chosen": -1.5969488620758057,
106
- "logits/rejected": -1.5034626722335815,
107
- "logps/chosen": -1.027340054512024,
108
- "logps/rejected": -1.0131927728652954,
109
- "loss": 1.0351,
110
- "rewards/accuracies": 0.4437499940395355,
111
- "rewards/chosen": -0.028199095278978348,
112
- "rewards/margins": -0.04337681084871292,
113
- "rewards/rejected": 0.015177717432379723,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.12903225806451613,
118
- "grad_norm": 4072.2596400359776,
119
- "learning_rate": 4.98830512828915e-09,
120
- "logits/chosen": -1.7021442651748657,
121
- "logits/rejected": -1.6100256443023682,
122
- "logps/chosen": -0.9533787965774536,
123
- "logps/rejected": -0.9375246167182922,
124
- "loss": 1.0037,
125
- "rewards/accuracies": 0.5562499761581421,
126
- "rewards/chosen": 0.08136853575706482,
127
- "rewards/margins": -0.00438536424189806,
128
- "rewards/rejected": 0.08575389534235,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.14746543778801843,
133
- "grad_norm": 4118.420754219571,
134
- "learning_rate": 4.967559289596846e-09,
135
- "logits/chosen": -1.5828940868377686,
136
- "logits/rejected": -1.5408188104629517,
137
- "logps/chosen": -0.991844654083252,
138
- "logps/rejected": -0.9241277575492859,
139
- "loss": 1.006,
140
- "rewards/accuracies": 0.512499988079071,
141
- "rewards/chosen": 0.0019341229926794767,
142
- "rewards/margins": -0.02749333344399929,
143
- "rewards/rejected": 0.029427463188767433,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.16589861751152074,
148
- "grad_norm": 3629.5409761234123,
149
- "learning_rate": 4.936548454363532e-09,
150
- "logits/chosen": -1.6866261959075928,
151
- "logits/rejected": -1.618090271949768,
152
- "logps/chosen": -1.0860463380813599,
153
- "logps/rejected": -0.9448333978652954,
154
- "loss": 0.9926,
155
- "rewards/accuracies": 0.550000011920929,
156
- "rewards/chosen": 0.08469289541244507,
157
- "rewards/margins": 0.13724537193775177,
158
- "rewards/rejected": -0.0525524728000164,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.18433179723502305,
163
- "grad_norm": 3898.733244088321,
164
- "learning_rate": 4.895401627034106e-09,
165
- "logits/chosen": -1.5633373260498047,
166
- "logits/rejected": -1.4719525575637817,
167
- "logps/chosen": -1.0307183265686035,
168
- "logps/rejected": -0.9637019038200378,
169
- "loss": 0.9422,
170
- "rewards/accuracies": 0.518750011920929,
171
- "rewards/chosen": 0.1219116598367691,
172
- "rewards/margins": 0.014806958846747875,
173
- "rewards/rejected": 0.10710470378398895,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.20276497695852536,
178
- "grad_norm": 3696.4743498583503,
179
- "learning_rate": 4.844289977574774e-09,
180
- "logits/chosen": -1.5632708072662354,
181
- "logits/rejected": -1.496795892715454,
182
- "logps/chosen": -1.0272352695465088,
183
- "logps/rejected": -0.9775659441947937,
184
- "loss": 0.9388,
185
- "rewards/accuracies": 0.5874999761581421,
186
- "rewards/chosen": 0.17640800774097443,
187
- "rewards/margins": 0.18908366560935974,
188
- "rewards/rejected": -0.012675672769546509,
189
  "step": 110
190
  },
191
  {
192
- "epoch": 0.22119815668202766,
193
- "grad_norm": 3181.2117454771997,
194
- "learning_rate": 4.783426129409464e-09,
195
- "logits/chosen": -1.6399650573730469,
196
- "logits/rejected": -1.528159737586975,
197
- "logps/chosen": -0.9425110816955566,
198
- "logps/rejected": -0.9258049130439758,
199
- "loss": 0.9311,
200
- "rewards/accuracies": 0.5625,
201
- "rewards/chosen": 0.13392865657806396,
202
- "rewards/margins": 0.16803190112113953,
203
- "rewards/rejected": -0.03410324081778526,
204
  "step": 120
205
  },
206
  {
207
- "epoch": 0.23963133640552994,
208
- "grad_norm": 2982.5336500421463,
209
- "learning_rate": 4.713063274910708e-09,
210
- "logits/chosen": -1.5521310567855835,
211
- "logits/rejected": -1.4859836101531982,
212
- "logps/chosen": -1.0397251844406128,
213
- "logps/rejected": -0.9695302844047546,
214
- "loss": 0.9569,
215
- "rewards/accuracies": 0.59375,
216
- "rewards/chosen": 0.1583443433046341,
217
- "rewards/margins": 0.1197737455368042,
218
- "rewards/rejected": 0.03857060521841049,
219
  "step": 130
220
  },
221
  {
222
- "epoch": 0.25806451612903225,
223
- "grad_norm": 3620.919118773496,
224
- "learning_rate": 4.633494122124504e-09,
225
- "logits/chosen": -1.5602772235870361,
226
- "logits/rejected": -1.5094670057296753,
227
- "logps/chosen": -1.0015778541564941,
228
- "logps/rejected": -0.9640843272209167,
229
- "loss": 0.8817,
230
- "rewards/accuracies": 0.6000000238418579,
231
- "rewards/chosen": 0.1787305474281311,
232
- "rewards/margins": 0.17238874733448029,
233
- "rewards/rejected": 0.006341800093650818,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.2764976958525346,
238
- "grad_norm": 4013.4092182868308,
239
- "learning_rate": 4.545049677110793e-09,
240
- "logits/chosen": -1.5850383043289185,
241
- "logits/rejected": -1.5053658485412598,
242
- "logps/chosen": -0.9671676754951477,
243
- "logps/rejected": -0.9508814811706543,
244
- "loss": 0.8971,
245
- "rewards/accuracies": 0.6312500238418579,
246
- "rewards/chosen": 0.3438531458377838,
247
- "rewards/margins": 0.29903319478034973,
248
- "rewards/rejected": 0.044819992035627365,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.29493087557603687,
253
- "grad_norm": 3947.6099295976574,
254
- "learning_rate": 4.4480978669649716e-09,
255
- "logits/chosen": -1.5827395915985107,
256
- "logits/rejected": -1.4827316999435425,
257
- "logps/chosen": -0.9799680709838867,
258
- "logps/rejected": -0.9488567113876343,
259
- "loss": 0.8639,
260
- "rewards/accuracies": 0.6875,
261
- "rewards/chosen": 0.31666576862335205,
262
- "rewards/margins": 0.38411837816238403,
263
- "rewards/rejected": -0.06745252758264542,
264
  "step": 160
265
  },
266
  {
267
- "epoch": 0.31336405529953915,
268
- "grad_norm": 3208.2709652021513,
269
- "learning_rate": 4.343042009248641e-09,
270
- "logits/chosen": -1.6824086904525757,
271
- "logits/rejected": -1.6013917922973633,
272
- "logps/chosen": -1.0053058862686157,
273
- "logps/rejected": -0.9279009699821472,
274
- "loss": 0.8277,
275
- "rewards/accuracies": 0.6937500238418579,
276
- "rewards/chosen": 0.48275047540664673,
277
- "rewards/margins": 0.4515896737575531,
278
- "rewards/rejected": 0.03116079606115818,
279
  "step": 170
280
  },
281
  {
282
- "epoch": 0.3317972350230415,
283
- "grad_norm": 3131.9144039177236,
284
- "learning_rate": 4.230319134196747e-09,
285
- "logits/chosen": -1.5756456851959229,
286
- "logits/rejected": -1.4971188306808472,
287
- "logps/chosen": -1.0033220052719116,
288
- "logps/rejected": -0.9541338086128235,
289
- "loss": 0.8293,
290
- "rewards/accuracies": 0.699999988079071,
291
- "rewards/chosen": 0.5112671852111816,
292
- "rewards/margins": 0.4981989860534668,
293
- "rewards/rejected": 0.013068236410617828,
294
  "step": 180
295
  },
296
  {
297
- "epoch": 0.35023041474654376,
298
- "grad_norm": 2715.064519767043,
299
- "learning_rate": 4.110398166680671e-09,
300
- "logits/chosen": -1.5660308599472046,
301
- "logits/rejected": -1.4755548238754272,
302
- "logps/chosen": -0.9532132148742676,
303
- "logps/rejected": -0.8907047510147095,
304
- "loss": 0.7906,
305
- "rewards/accuracies": 0.6812499761581421,
306
- "rewards/chosen": 0.6197713017463684,
307
- "rewards/margins": 0.528595507144928,
308
- "rewards/rejected": 0.09117577970027924,
309
  "step": 190
310
  },
311
  {
312
- "epoch": 0.3686635944700461,
313
- "grad_norm": 3468.2349150799664,
314
- "learning_rate": 3.9837779754902876e-09,
315
- "logits/chosen": -1.7327858209609985,
316
- "logits/rejected": -1.6148407459259033,
317
- "logps/chosen": -1.0091302394866943,
318
- "logps/rejected": -0.9430079460144043,
319
- "loss": 0.7922,
320
- "rewards/accuracies": 0.7124999761581421,
321
- "rewards/chosen": 0.5713268518447876,
322
- "rewards/margins": 0.5340843200683594,
323
- "rewards/rejected": 0.03724261373281479,
324
  "step": 200
325
  },
326
  {
327
- "epoch": 0.3870967741935484,
328
- "grad_norm": 5541.889609678496,
329
- "learning_rate": 3.8509852980499295e-09,
330
- "logits/chosen": -1.5810222625732422,
331
- "logits/rejected": -1.5188120603561401,
332
- "logps/chosen": -0.9732600450515747,
333
- "logps/rejected": -0.8989869356155396,
334
- "loss": 0.8017,
335
- "rewards/accuracies": 0.737500011920929,
336
- "rewards/chosen": 0.5253499746322632,
337
- "rewards/margins": 0.5527364015579224,
338
- "rewards/rejected": -0.027386415749788284,
339
  "step": 210
340
  },
341
  {
342
- "epoch": 0.4055299539170507,
343
- "grad_norm": 3226.71468419498,
344
- "learning_rate": 3.7125725492013986e-09,
345
- "logits/chosen": -1.6166194677352905,
346
- "logits/rejected": -1.5297718048095703,
347
- "logps/chosen": -0.9580337405204773,
348
- "logps/rejected": -0.9028044939041138,
349
- "loss": 0.7911,
350
- "rewards/accuracies": 0.731249988079071,
351
- "rewards/chosen": 0.6303132772445679,
352
- "rewards/margins": 0.5151628255844116,
353
- "rewards/rejected": 0.11515048891305923,
354
  "step": 220
355
  },
356
  {
357
- "epoch": 0.423963133640553,
358
- "grad_norm": 3730.157392610105,
359
- "learning_rate": 3.5691155231694373e-09,
360
- "logits/chosen": -1.6063117980957031,
361
- "logits/rejected": -1.5548478364944458,
362
- "logps/chosen": -0.9622556567192078,
363
- "logps/rejected": -0.9150797128677368,
364
- "loss": 0.7521,
365
- "rewards/accuracies": 0.737500011920929,
366
- "rewards/chosen": 0.6007626056671143,
367
- "rewards/margins": 0.6586055755615234,
368
- "rewards/rejected": -0.057842958718538284,
369
  "step": 230
370
  },
371
  {
372
- "epoch": 0.4423963133640553,
373
- "grad_norm": 3478.733032746658,
374
- "learning_rate": 3.421210998269447e-09,
375
- "logits/chosen": -1.6624641418457031,
376
- "logits/rejected": -1.6196787357330322,
377
- "logps/chosen": -1.015082597732544,
378
- "logps/rejected": -0.9574974775314331,
379
- "loss": 0.7876,
380
- "rewards/accuracies": 0.65625,
381
- "rewards/chosen": 0.6061643362045288,
382
- "rewards/margins": 0.4766843914985657,
383
- "rewards/rejected": 0.1294800341129303,
384
  "step": 240
385
  },
386
  {
387
- "epoch": 0.4608294930875576,
388
- "grad_norm": 4199.026342735118,
389
- "learning_rate": 3.269474254321818e-09,
390
- "logits/chosen": -1.6640870571136475,
391
- "logits/rejected": -1.611088514328003,
392
- "logps/chosen": -0.9812959432601929,
393
- "logps/rejected": -0.9682718515396118,
394
- "loss": 0.7719,
395
- "rewards/accuracies": 0.6812499761581421,
396
- "rewards/chosen": 0.734277606010437,
397
- "rewards/margins": 0.5840839147567749,
398
- "rewards/rejected": 0.15019364655017853,
399
  "step": 250
400
  },
401
  {
402
- "epoch": 0.4792626728110599,
403
- "grad_norm": 2843.441875870463,
404
- "learning_rate": 3.1145365131003605e-09,
405
- "logits/chosen": -1.6078466176986694,
406
- "logits/rejected": -1.5514041185379028,
407
- "logps/chosen": -0.9493719935417175,
408
- "logps/rejected": -0.8929805755615234,
409
- "loss": 0.7262,
410
- "rewards/accuracies": 0.7437499761581421,
411
- "rewards/chosen": 0.7151438593864441,
412
- "rewards/margins": 0.7053098082542419,
413
- "rewards/rejected": 0.00983402505517006,
414
  "step": 260
415
  },
416
  {
417
- "epoch": 0.4976958525345622,
418
- "grad_norm": 3021.910017633022,
419
- "learning_rate": 2.95704231246255e-09,
420
- "logits/chosen": -1.641345739364624,
421
- "logits/rejected": -1.5682355165481567,
422
- "logps/chosen": -0.9710962176322937,
423
- "logps/rejected": -0.9229364395141602,
424
- "loss": 0.7197,
425
- "rewards/accuracies": 0.7875000238418579,
426
- "rewards/chosen": 0.8893934488296509,
427
- "rewards/margins": 0.7715497016906738,
428
- "rewards/rejected": 0.11784378439188004,
429
  "step": 270
430
  },
431
  {
432
- "epoch": 0.5161290322580645,
433
- "grad_norm": 2769.0118344627704,
434
- "learning_rate": 2.797646825085125e-09,
435
- "logits/chosen": -1.6461395025253296,
436
- "logits/rejected": -1.614833116531372,
437
- "logps/chosen": -0.9965534210205078,
438
- "logps/rejected": -0.9103838801383972,
439
- "loss": 0.7102,
440
- "rewards/accuracies": 0.7437499761581421,
441
- "rewards/chosen": 0.900854229927063,
442
- "rewards/margins": 0.7569249868392944,
443
- "rewards/rejected": 0.14392916858196259,
444
  "step": 280
445
  },
446
  {
447
- "epoch": 0.5345622119815668,
448
- "grad_norm": 3201.218696167003,
449
- "learning_rate": 2.6370131329590557e-09,
450
- "logits/chosen": -1.6856491565704346,
451
- "logits/rejected": -1.5777591466903687,
452
- "logps/chosen": -0.9708482623100281,
453
- "logps/rejected": -0.9296213984489441,
454
- "loss": 0.6843,
455
- "rewards/accuracies": 0.71875,
456
- "rewards/chosen": 0.9540289044380188,
457
- "rewards/margins": 0.750830352306366,
458
- "rewards/rejected": 0.2031985968351364,
459
  "step": 290
460
  },
461
  {
462
- "epoch": 0.5529953917050692,
463
- "grad_norm": 2497.168902383888,
464
- "learning_rate": 2.4758094689819246e-09,
465
- "logits/chosen": -1.6694984436035156,
466
- "logits/rejected": -1.5708268880844116,
467
- "logps/chosen": -0.9928863644599915,
468
- "logps/rejected": -0.8953903913497925,
469
- "loss": 0.6994,
470
- "rewards/accuracies": 0.731249988079071,
471
- "rewards/chosen": 0.7281567454338074,
472
- "rewards/margins": 0.802379310131073,
473
- "rewards/rejected": -0.07422257214784622,
474
  "step": 300
475
  },
476
  {
477
- "epoch": 0.5714285714285714,
478
- "grad_norm": 4042.9572154642865,
479
- "learning_rate": 2.3147064371226394e-09,
480
- "logits/chosen": -1.6897979974746704,
481
- "logits/rejected": -1.6063530445098877,
482
- "logps/chosen": -0.9562484622001648,
483
- "logps/rejected": -0.8966572880744934,
484
- "loss": 0.7118,
485
- "rewards/accuracies": 0.75,
486
- "rewards/chosen": 0.9196362495422363,
487
- "rewards/margins": 0.9230009317398071,
488
- "rewards/rejected": -0.0033645853400230408,
489
  "step": 310
490
  },
491
  {
492
- "epoch": 0.5898617511520737,
493
- "grad_norm": 2587.202113246667,
494
- "learning_rate": 2.154374222722545e-09,
495
- "logits/chosen": -1.5904594659805298,
496
- "logits/rejected": -1.5287564992904663,
497
- "logps/chosen": -1.0147100687026978,
498
- "logps/rejected": -0.9374237060546875,
499
- "loss": 0.7142,
500
- "rewards/accuracies": 0.7562500238418579,
501
- "rewards/chosen": 0.939732551574707,
502
- "rewards/margins": 0.9331587553024292,
503
- "rewards/rejected": 0.006573830731213093,
504
  "step": 320
505
  },
506
  {
507
- "epoch": 0.6082949308755761,
508
- "grad_norm": 3045.5495188898726,
509
- "learning_rate": 1.995479804538004e-09,
510
- "logits/chosen": -1.6013281345367432,
511
- "logits/rejected": -1.518786072731018,
512
- "logps/chosen": -1.043778419494629,
513
- "logps/rejected": -0.9321629405021667,
514
- "loss": 0.693,
515
- "rewards/accuracies": 0.768750011920929,
516
- "rewards/chosen": 0.9917459487915039,
517
- "rewards/margins": 0.8832038640975952,
518
- "rewards/rejected": 0.10854210704565048,
519
  "step": 330
520
  },
521
  {
522
- "epoch": 0.6267281105990783,
523
- "grad_norm": 3007.069780597671,
524
- "learning_rate": 1.8386841801223184e-09,
525
- "logits/chosen": -1.6415650844573975,
526
- "logits/rejected": -1.5566316843032837,
527
- "logps/chosen": -0.9854591488838196,
528
- "logps/rejected": -0.9954888224601746,
529
- "loss": 0.6912,
530
- "rewards/accuracies": 0.762499988079071,
531
- "rewards/chosen": 0.7283134460449219,
532
- "rewards/margins": 0.7907330989837646,
533
- "rewards/rejected": -0.06241961196064949,
534
  "step": 340
535
  },
536
  {
537
- "epoch": 0.6451612903225806,
538
- "grad_norm": 4406.908866729794,
539
- "learning_rate": 1.6846396160893263e-09,
540
- "logits/chosen": -1.6521275043487549,
541
- "logits/rejected": -1.563493013381958,
542
- "logps/chosen": -1.016737699508667,
543
- "logps/rejected": -0.9376832246780396,
544
- "loss": 0.7053,
545
- "rewards/accuracies": 0.731249988079071,
546
- "rewards/chosen": 1.0002192258834839,
547
- "rewards/margins": 0.828022837638855,
548
- "rewards/rejected": 0.17219629883766174,
549
  "step": 350
550
  },
551
  {
552
- "epoch": 0.663594470046083,
553
- "grad_norm": 2676.172542864631,
554
- "learning_rate": 1.5339869346975361e-09,
555
- "logits/chosen": -1.6332248449325562,
556
- "logits/rejected": -1.5798089504241943,
557
- "logps/chosen": -1.0168434381484985,
558
- "logps/rejected": -0.923861026763916,
559
- "loss": 0.6317,
560
- "rewards/accuracies": 0.824999988079071,
561
- "rewards/chosen": 1.0311250686645508,
562
- "rewards/margins": 1.0256620645523071,
563
- "rewards/rejected": 0.005463090725243092,
564
  "step": 360
565
  },
566
  {
567
- "epoch": 0.6820276497695853,
568
- "grad_norm": 3538.899443074702,
569
- "learning_rate": 1.3873528480425386e-09,
570
- "logits/chosen": -1.6682748794555664,
571
- "logits/rejected": -1.5604602098464966,
572
- "logps/chosen": -0.9834707379341125,
573
- "logps/rejected": -0.918461799621582,
574
- "loss": 0.6548,
575
- "rewards/accuracies": 0.8187500238418579,
576
- "rewards/chosen": 1.1218345165252686,
577
- "rewards/margins": 0.9969808459281921,
578
- "rewards/rejected": 0.1248536929488182,
579
  "step": 370
580
  },
581
  {
582
- "epoch": 0.7004608294930875,
583
- "grad_norm": 2976.475346671877,
584
- "learning_rate": 1.2453473509474186e-09,
585
- "logits/chosen": -1.660499930381775,
586
- "logits/rejected": -1.5915935039520264,
587
- "logps/chosen": -0.9880379438400269,
588
- "logps/rejected": -0.9460450410842896,
589
- "loss": 0.7138,
590
- "rewards/accuracies": 0.762499988079071,
591
- "rewards/chosen": 1.1316003799438477,
592
- "rewards/margins": 0.9571952819824219,
593
- "rewards/rejected": 0.17440509796142578,
594
  "step": 380
595
  },
596
  {
597
- "epoch": 0.7188940092165899,
598
- "grad_norm": 2873.8297776136696,
599
- "learning_rate": 1.1085611833966747e-09,
600
- "logits/chosen": -1.594157338142395,
601
- "logits/rejected": -1.5532026290893555,
602
- "logps/chosen": -1.0556713342666626,
603
- "logps/rejected": -0.9804666638374329,
604
- "loss": 0.7064,
605
- "rewards/accuracies": 0.7562500238418579,
606
- "rewards/chosen": 1.1315768957138062,
607
- "rewards/margins": 1.1887867450714111,
608
- "rewards/rejected": -0.057209838181734085,
609
  "step": 390
610
  },
611
  {
612
- "epoch": 0.7373271889400922,
613
- "grad_norm": 3810.435964314268,
614
- "learning_rate": 9.77563373069879e-10,
615
- "logits/chosen": -1.6148662567138672,
616
- "logits/rejected": -1.4739339351654053,
617
- "logps/chosen": -0.9978437423706055,
618
- "logps/rejected": -0.9537578821182251,
619
- "loss": 0.655,
620
- "rewards/accuracies": 0.7562500238418579,
621
- "rewards/chosen": 1.0335350036621094,
622
- "rewards/margins": 0.9007118940353394,
623
- "rewards/rejected": 0.13282322883605957,
624
  "step": 400
625
  },
626
  {
627
- "epoch": 0.7557603686635944,
628
- "grad_norm": 3353.3656429864536,
629
- "learning_rate": 8.528988681980848e-10,
630
- "logits/chosen": -1.525498390197754,
631
- "logits/rejected": -1.5023047924041748,
632
- "logps/chosen": -1.0074002742767334,
633
- "logps/rejected": -0.9325093030929565,
634
- "loss": 0.6624,
635
- "rewards/accuracies": 0.793749988079071,
636
- "rewards/chosen": 0.9953389167785645,
637
- "rewards/margins": 1.0517241954803467,
638
- "rewards/rejected": -0.05638519674539566,
639
  "step": 410
640
  },
641
  {
642
- "epoch": 0.7741935483870968,
643
- "grad_norm": 3108.3553021985713,
644
- "learning_rate": 7.350862705902492e-10,
645
- "logits/chosen": -1.612502098083496,
646
- "logits/rejected": -1.5601556301116943,
647
- "logps/chosen": -0.9830726385116577,
648
- "logps/rejected": -0.9087923169136047,
649
- "loss": 0.668,
650
- "rewards/accuracies": 0.8062499761581421,
651
- "rewards/chosen": 1.2218666076660156,
652
- "rewards/margins": 1.0628268718719482,
653
- "rewards/rejected": 0.15903989970684052,
654
  "step": 420
655
  },
656
  {
657
- "epoch": 0.7926267281105991,
658
- "grad_norm": 2848.4192218675507,
659
- "learning_rate": 6.246156782602395e-10,
660
- "logits/chosen": -1.6182113885879517,
661
- "logits/rejected": -1.5692577362060547,
662
- "logps/chosen": -0.9712175130844116,
663
- "logps/rejected": -0.9052525758743286,
664
- "loss": 0.6509,
665
- "rewards/accuracies": 0.7749999761581421,
666
- "rewards/chosen": 0.8137975931167603,
667
- "rewards/margins": 0.871743381023407,
668
- "rewards/rejected": -0.057945869863033295,
669
  "step": 430
670
  },
671
  {
672
- "epoch": 0.8110599078341014,
673
- "grad_norm": 3760.6065599338026,
674
- "learning_rate": 5.219466466290479e-10,
675
- "logits/chosen": -1.582889199256897,
676
- "logits/rejected": -1.4811861515045166,
677
- "logps/chosen": -0.9767929911613464,
678
- "logps/rejected": -0.9605540037155151,
679
- "loss": 0.6505,
680
- "rewards/accuracies": 0.793749988079071,
681
- "rewards/chosen": 0.7346361875534058,
682
- "rewards/margins": 1.012524962425232,
683
- "rewards/rejected": -0.27788880467414856,
684
  "step": 440
685
  },
686
  {
687
- "epoch": 0.8294930875576036,
688
- "grad_norm": 4053.0728361684164,
689
- "learning_rate": 4.2750627678356803e-10,
690
- "logits/chosen": -1.5421695709228516,
691
- "logits/rejected": -1.4773906469345093,
692
- "logps/chosen": -0.9987564086914062,
693
- "logps/rejected": -0.9237726330757141,
694
- "loss": 0.6655,
695
- "rewards/accuracies": 0.762499988079071,
696
- "rewards/chosen": 1.0704389810562134,
697
- "rewards/margins": 1.020028829574585,
698
- "rewards/rejected": 0.05040997266769409,
699
  "step": 450
700
  },
701
  {
702
- "epoch": 0.847926267281106,
703
- "grad_norm": 3444.670240085631,
704
- "learning_rate": 3.4168743874474826e-10,
705
- "logits/chosen": -1.6144073009490967,
706
- "logits/rejected": -1.5566074848175049,
707
- "logps/chosen": -1.0152393579483032,
708
- "logps/rejected": -0.929918646812439,
709
- "loss": 0.6458,
710
- "rewards/accuracies": 0.7562500238418579,
711
- "rewards/chosen": 0.7092639207839966,
712
- "rewards/margins": 0.7927274107933044,
713
- "rewards/rejected": -0.08346347510814667,
714
  "step": 460
715
  },
716
  {
717
- "epoch": 0.8663594470046083,
718
- "grad_norm": 3106.2819052615973,
719
- "learning_rate": 2.6484713713628097e-10,
720
- "logits/chosen": -1.6743053197860718,
721
- "logits/rejected": -1.5681962966918945,
722
- "logps/chosen": -0.9611037969589233,
723
- "logps/rejected": -0.9157723188400269,
724
- "loss": 0.6663,
725
- "rewards/accuracies": 0.793749988079071,
726
- "rewards/chosen": 0.9063161611557007,
727
- "rewards/margins": 0.7436623573303223,
728
- "rewards/rejected": 0.16265380382537842,
729
  "step": 470
730
  },
731
  {
732
- "epoch": 0.8847926267281107,
733
- "grad_norm": 3028.8363800911566,
734
- "learning_rate": 1.9730502605261797e-10,
735
- "logits/chosen": -1.5557960271835327,
736
- "logits/rejected": -1.5075175762176514,
737
- "logps/chosen": -0.9503531455993652,
738
- "logps/rejected": -0.919326663017273,
739
- "loss": 0.6366,
740
- "rewards/accuracies": 0.8187500238418579,
741
- "rewards/chosen": 0.9357951283454895,
742
- "rewards/margins": 0.953781247138977,
743
- "rewards/rejected": -0.01798621378839016,
744
  "step": 480
745
  },
746
  {
747
- "epoch": 0.9032258064516129,
748
- "grad_norm": 3953.758778776682,
749
- "learning_rate": 1.39342079304427e-10,
750
- "logits/chosen": -1.6463558673858643,
751
- "logits/rejected": -1.5752487182617188,
752
- "logps/chosen": -1.0051090717315674,
753
- "logps/rejected": -0.980335533618927,
754
- "loss": 0.6936,
755
- "rewards/accuracies": 0.8125,
756
- "rewards/chosen": 1.2566089630126953,
757
- "rewards/margins": 1.2254388332366943,
758
- "rewards/rejected": 0.031170058995485306,
759
  "step": 490
760
  },
761
  {
762
- "epoch": 0.9216589861751152,
763
- "grad_norm": 2794.294715742837,
764
- "learning_rate": 9.119942157324367e-11,
765
- "logits/chosen": -1.6388028860092163,
766
- "logits/rejected": -1.5481789112091064,
767
- "logps/chosen": -1.0447478294372559,
768
- "logps/rejected": -1.0064018964767456,
769
- "loss": 0.6587,
770
- "rewards/accuracies": 0.84375,
771
- "rewards/chosen": 1.1869680881500244,
772
- "rewards/margins": 1.1323801279067993,
773
- "rewards/rejected": 0.054588038474321365,
774
  "step": 500
775
  },
776
  {
777
- "epoch": 0.9400921658986175,
778
- "grad_norm": 2259.2971695939314,
779
- "learning_rate": 5.3077325337695935e-11,
780
- "logits/chosen": -1.6551536321640015,
781
- "logits/rejected": -1.5781865119934082,
782
- "logps/chosen": -0.9668482542037964,
783
- "logps/rejected": -0.9251588582992554,
784
- "loss": 0.6534,
785
- "rewards/accuracies": 0.8125,
786
- "rewards/chosen": 1.0340436697006226,
787
- "rewards/margins": 0.9162915349006653,
788
- "rewards/rejected": 0.1177520900964737,
789
  "step": 510
790
  },
791
  {
792
- "epoch": 0.9585253456221198,
793
- "grad_norm": 2756.1908243931707,
794
- "learning_rate": 2.5134377744054636e-11,
795
- "logits/chosen": -1.6491291522979736,
796
- "logits/rejected": -1.5655778646469116,
797
- "logps/chosen": -1.0213550329208374,
798
- "logps/rejected": -0.9846656918525696,
799
- "loss": 0.664,
800
- "rewards/accuracies": 0.7749999761581421,
801
- "rewards/chosen": 1.088184118270874,
802
- "rewards/margins": 0.7975610494613647,
803
- "rewards/rejected": 0.29062318801879883,
804
- "step": 520
805
- },
806
- {
807
- "epoch": 0.9769585253456221,
808
- "grad_norm": 3373.335584334466,
809
- "learning_rate": 7.486820886929147e-12,
810
- "logits/chosen": -1.6995586156845093,
811
- "logits/rejected": -1.606285810470581,
812
- "logps/chosen": -0.9370242953300476,
813
- "logps/rejected": -0.935525119304657,
814
- "loss": 0.6841,
815
- "rewards/accuracies": 0.7437499761581421,
816
- "rewards/chosen": 0.8417491912841797,
817
- "rewards/margins": 0.7081137895584106,
818
- "rewards/rejected": 0.13363537192344666,
819
- "step": 530
820
- },
821
- {
822
- "epoch": 0.9953917050691244,
823
- "grad_norm": 3339.7952039913303,
824
- "learning_rate": 2.080682445118609e-13,
825
- "logits/chosen": -1.6441770792007446,
826
- "logits/rejected": -1.5704596042633057,
827
- "logps/chosen": -0.9470025897026062,
828
- "logps/rejected": -0.9030858278274536,
829
- "loss": 0.659,
830
- "rewards/accuracies": 0.7875000238418579,
831
- "rewards/chosen": 0.9892306327819824,
832
- "rewards/margins": 0.7913962602615356,
833
- "rewards/rejected": 0.19783440232276917,
834
- "step": 540
835
- },
836
- {
837
- "epoch": 0.9990783410138249,
838
- "step": 542,
839
  "total_flos": 0.0,
840
- "train_loss": 0.7834695007968213,
841
- "train_runtime": 9239.3637,
842
- "train_samples_per_second": 7.512,
843
- "train_steps_per_second": 0.059
844
  }
845
  ],
846
  "logging_steps": 10,
847
- "max_steps": 542,
848
  "num_input_tokens_seen": 0,
849
  "num_train_epochs": 1,
850
  "save_steps": 100,
851
- "stateful_callbacks": {
852
- "TrainerControl": {
853
- "args": {
854
- "should_epoch_stop": false,
855
- "should_evaluate": false,
856
- "should_log": false,
857
- "should_save": true,
858
- "should_training_stop": false
859
- },
860
- "attributes": {}
861
- }
862
- },
863
  "total_flos": 0.0,
864
  "train_batch_size": 4,
865
  "trial_name": null,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9990319457889641,
5
  "eval_steps": 10000000,
6
+ "global_step": 516,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "grad_norm": 1841.589330811575,
14
+ "learning_rate": 1.9230769230769234e-11,
15
+ "logits/chosen": -1.8683955669403076,
16
+ "logits/rejected": -1.7658718824386597,
17
+ "logps/chosen": -1.0707917213439941,
18
+ "logps/rejected": -1.2424218654632568,
19
+ "loss": 0.6843,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.02,
28
+ "grad_norm": 1996.1274022232458,
29
+ "learning_rate": 1.9230769230769234e-10,
30
+ "logits/chosen": -1.661571979522705,
31
+ "logits/rejected": -1.6195077896118164,
32
+ "logps/chosen": -0.9485270380973816,
33
+ "logps/rejected": -0.9299606680870056,
34
+ "loss": 0.7139,
35
+ "rewards/accuracies": 0.3402777910232544,
36
+ "rewards/chosen": -0.010606925934553146,
37
+ "rewards/margins": -0.04614371806383133,
38
+ "rewards/rejected": 0.035536784678697586,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.04,
43
+ "grad_norm": 1476.5922039435188,
44
+ "learning_rate": 3.8461538461538467e-10,
45
+ "logits/chosen": -1.5835134983062744,
46
+ "logits/rejected": -1.5358213186264038,
47
+ "logps/chosen": -1.0245001316070557,
48
+ "logps/rejected": -0.9702553749084473,
49
+ "loss": 0.7328,
50
+ "rewards/accuracies": 0.5249999761581421,
51
+ "rewards/chosen": 0.002407646970823407,
52
+ "rewards/margins": 0.012017359957098961,
53
+ "rewards/rejected": -0.009609714150428772,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.06,
58
+ "grad_norm": 2327.1082042354296,
59
+ "learning_rate": 5.769230769230769e-10,
60
+ "logits/chosen": -1.5297813415527344,
61
+ "logits/rejected": -1.470786690711975,
62
+ "logps/chosen": -1.003636360168457,
63
+ "logps/rejected": -0.9809616208076477,
64
+ "loss": 0.7501,
65
+ "rewards/accuracies": 0.5249999761581421,
66
+ "rewards/chosen": -0.0802309662103653,
67
+ "rewards/margins": -0.03612281754612923,
68
+ "rewards/rejected": -0.04410814121365547,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.08,
73
+ "grad_norm": 1754.5725922762788,
74
+ "learning_rate": 7.692307692307693e-10,
75
+ "logits/chosen": -1.5550693273544312,
76
+ "logits/rejected": -1.4969508647918701,
77
+ "logps/chosen": -0.9998669624328613,
78
+ "logps/rejected": -0.9377104640007019,
79
+ "loss": 0.7259,
80
+ "rewards/accuracies": 0.518750011920929,
81
+ "rewards/chosen": 0.02974640764296055,
82
+ "rewards/margins": 0.025205513462424278,
83
+ "rewards/rejected": 0.004540898837149143,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.1,
88
+ "grad_norm": 1834.9151562164598,
89
+ "learning_rate": 9.615384615384616e-10,
90
+ "logits/chosen": -1.6489346027374268,
91
+ "logits/rejected": -1.603512167930603,
92
+ "logps/chosen": -0.9903966784477234,
93
+ "logps/rejected": -0.968778133392334,
94
+ "loss": 0.7275,
95
+ "rewards/accuracies": 0.5625,
96
+ "rewards/chosen": -0.0027869821060448885,
97
+ "rewards/margins": 0.03262440487742424,
98
+ "rewards/rejected": -0.03541138768196106,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.12,
103
+ "grad_norm": 1858.8148428923591,
104
+ "learning_rate": 9.99266706925562e-10,
105
+ "logits/chosen": -1.6008754968643188,
106
+ "logits/rejected": -1.5484288930892944,
107
+ "logps/chosen": -0.9955031275749207,
108
+ "logps/rejected": -0.931098461151123,
109
+ "loss": 0.7562,
110
+ "rewards/accuracies": 0.44999998807907104,
111
+ "rewards/chosen": -0.00754826795309782,
112
+ "rewards/margins": -0.04739421606063843,
113
+ "rewards/rejected": 0.039845943450927734,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.14,
118
+ "grad_norm": 2425.8586612790064,
119
+ "learning_rate": 9.96291389741603e-10,
120
+ "logits/chosen": -1.5926530361175537,
121
+ "logits/rejected": -1.5061492919921875,
122
+ "logps/chosen": -0.9957473874092102,
123
+ "logps/rejected": -0.9462421536445618,
124
+ "loss": 0.7475,
125
+ "rewards/accuracies": 0.550000011920929,
126
+ "rewards/chosen": 0.044678620994091034,
127
+ "rewards/margins": 0.04425480216741562,
128
+ "rewards/rejected": 0.0004238195833750069,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.15,
133
+ "grad_norm": 1974.3624260956421,
134
+ "learning_rate": 9.91041841371078e-10,
135
+ "logits/chosen": -1.5462015867233276,
136
+ "logits/rejected": -1.52878737449646,
137
+ "logps/chosen": -1.0357959270477295,
138
+ "logps/rejected": -0.9859043955802917,
139
+ "loss": 0.737,
140
+ "rewards/accuracies": 0.574999988079071,
141
+ "rewards/chosen": 0.019811829552054405,
142
+ "rewards/margins": 0.09474115073680878,
143
+ "rewards/rejected": -0.07492931932210922,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.17,
148
+ "grad_norm": 1805.8384539876447,
149
+ "learning_rate": 9.835421176144035e-10,
150
+ "logits/chosen": -1.6784532070159912,
151
+ "logits/rejected": -1.6200278997421265,
152
+ "logps/chosen": -1.0107990503311157,
153
+ "logps/rejected": -0.924695611000061,
154
+ "loss": 0.7294,
155
+ "rewards/accuracies": 0.512499988079071,
156
+ "rewards/chosen": 0.06031709909439087,
157
+ "rewards/margins": 0.016556955873966217,
158
+ "rewards/rejected": 0.04376014322042465,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.19,
163
+ "grad_norm": 1972.7826610383331,
164
+ "learning_rate": 9.738265855914014e-10,
165
+ "logits/chosen": -1.632147192955017,
166
+ "logits/rejected": -1.56899094581604,
167
+ "logps/chosen": -0.9780662655830383,
168
+ "logps/rejected": -0.9384719133377075,
169
+ "loss": 0.7363,
170
+ "rewards/accuracies": 0.46875,
171
+ "rewards/chosen": 0.010622555390000343,
172
+ "rewards/margins": -0.028813939541578293,
173
+ "rewards/rejected": 0.03943649306893349,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.21,
178
+ "grad_norm": 2332.6350905446534,
179
+ "learning_rate": 9.619397662556434e-10,
180
+ "logits/chosen": -1.6633880138397217,
181
+ "logits/rejected": -1.5872291326522827,
182
+ "logps/chosen": -0.8982691764831543,
183
+ "logps/rejected": -0.8728898167610168,
184
+ "loss": 0.7458,
185
+ "rewards/accuracies": 0.5062500238418579,
186
+ "rewards/chosen": 0.03617207705974579,
187
+ "rewards/margins": 0.01802017167210579,
188
+ "rewards/rejected": 0.01815190538764,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.23,
193
+ "grad_norm": 1941.4125890247126,
194
+ "learning_rate": 9.47936130379344e-10,
195
+ "logits/chosen": -1.54219651222229,
196
+ "logits/rejected": -1.522878885269165,
197
+ "logps/chosen": -0.9705562591552734,
198
+ "logps/rejected": -0.9400444030761719,
199
+ "loss": 0.7336,
200
+ "rewards/accuracies": 0.5249999761581421,
201
+ "rewards/chosen": 0.059115856885910034,
202
+ "rewards/margins": 0.04324622079730034,
203
+ "rewards/rejected": 0.01586962677538395,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.25,
208
+ "grad_norm": 1715.1411693424232,
209
+ "learning_rate": 9.318798489436919e-10,
210
+ "logits/chosen": -1.57927405834198,
211
+ "logits/rejected": -1.489496111869812,
212
+ "logps/chosen": -0.9656535983085632,
213
+ "logps/rejected": -0.9289010167121887,
214
+ "loss": 0.7313,
215
+ "rewards/accuracies": 0.543749988079071,
216
+ "rewards/chosen": 0.053683798760175705,
217
+ "rewards/margins": 0.05398359149694443,
218
+ "rewards/rejected": -0.00029979198006913066,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.27,
223
+ "grad_norm": 1655.2138808718737,
224
+ "learning_rate": 9.138444990784454e-10,
225
+ "logits/chosen": -1.575226068496704,
226
+ "logits/rejected": -1.5274800062179565,
227
+ "logps/chosen": -0.9927815198898315,
228
+ "logps/rejected": -0.9941291809082031,
229
+ "loss": 0.7376,
230
+ "rewards/accuracies": 0.512499988079071,
231
+ "rewards/chosen": 0.016064394265413284,
232
+ "rewards/margins": 0.029078301042318344,
233
+ "rewards/rejected": -0.013013908639550209,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.29,
238
+ "grad_norm": 1948.0751706535148,
239
+ "learning_rate": 8.939127268983109e-10,
240
+ "logits/chosen": -1.560948133468628,
241
+ "logits/rejected": -1.5372627973556519,
242
+ "logps/chosen": -1.0732929706573486,
243
+ "logps/rejected": -0.9957958459854126,
244
+ "loss": 0.7216,
245
+ "rewards/accuracies": 0.48750001192092896,
246
+ "rewards/chosen": 0.07632608711719513,
247
+ "rewards/margins": 0.030022624880075455,
248
+ "rewards/rejected": 0.046303462237119675,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.31,
253
+ "grad_norm": 1823.5612035138797,
254
+ "learning_rate": 8.721758687811352e-10,
255
+ "logits/chosen": -1.6760711669921875,
256
+ "logits/rejected": -1.6009712219238281,
257
+ "logps/chosen": -0.9717607498168945,
258
+ "logps/rejected": -0.9428688883781433,
259
+ "loss": 0.7558,
260
+ "rewards/accuracies": 0.48124998807907104,
261
+ "rewards/chosen": 0.010309430770576,
262
+ "rewards/margins": -0.04144478961825371,
263
+ "rewards/rejected": 0.05175423622131348,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 0.33,
268
+ "grad_norm": 1740.6546374262923,
269
+ "learning_rate": 8.487335328233912e-10,
270
+ "logits/chosen": -1.5352542400360107,
271
+ "logits/rejected": -1.4410475492477417,
272
+ "logps/chosen": -0.9954330325126648,
273
+ "logps/rejected": -0.9735302925109863,
274
+ "loss": 0.7379,
275
+ "rewards/accuracies": 0.512499988079071,
276
+ "rewards/chosen": 0.04769414663314819,
277
+ "rewards/margins": 0.01896754838526249,
278
+ "rewards/rejected": 0.028726596385240555,
279
  "step": 170
280
  },
281
  {
282
+ "epoch": 0.35,
283
+ "grad_norm": 1979.6537790499867,
284
+ "learning_rate": 8.236931423909139e-10,
285
+ "logits/chosen": -1.672133207321167,
286
+ "logits/rejected": -1.5740010738372803,
287
+ "logps/chosen": -0.9754056930541992,
288
+ "logps/rejected": -0.951374351978302,
289
+ "loss": 0.724,
290
+ "rewards/accuracies": 0.4625000059604645,
291
+ "rewards/chosen": 0.05998270958662033,
292
+ "rewards/margins": 0.04238981008529663,
293
+ "rewards/rejected": 0.017592918127775192,
294
  "step": 180
295
  },
296
  {
297
+ "epoch": 0.37,
298
+ "grad_norm": 1975.0077002715911,
299
+ "learning_rate": 7.971694438565449e-10,
300
+ "logits/chosen": -1.6265771389007568,
301
+ "logits/rejected": -1.5644137859344482,
302
+ "logps/chosen": -0.9732062220573425,
303
+ "logps/rejected": -0.9743107557296753,
304
+ "loss": 0.7296,
305
+ "rewards/accuracies": 0.48124998807907104,
306
+ "rewards/chosen": -0.016829270869493484,
307
+ "rewards/margins": -0.01913767121732235,
308
+ "rewards/rejected": 0.0023084029089659452,
309
  "step": 190
310
  },
311
  {
312
+ "epoch": 0.39,
313
+ "grad_norm": 1823.8748366422865,
314
+ "learning_rate": 7.692839807804521e-10,
315
+ "logits/chosen": -1.6312087774276733,
316
+ "logits/rejected": -1.5969831943511963,
317
+ "logps/chosen": -0.9732693433761597,
318
+ "logps/rejected": -0.9268602132797241,
319
+ "loss": 0.7454,
320
+ "rewards/accuracies": 0.44999998807907104,
321
+ "rewards/chosen": -0.04321768134832382,
322
+ "rewards/margins": -0.05310916155576706,
323
+ "rewards/rejected": 0.009891483001410961,
324
  "step": 200
325
  },
326
  {
327
+ "epoch": 0.41,
328
+ "grad_norm": 1665.7874215210654,
329
+ "learning_rate": 7.401645369426697e-10,
330
+ "logits/chosen": -1.606499433517456,
331
+ "logits/rejected": -1.5401887893676758,
332
+ "logps/chosen": -0.9636400938034058,
333
+ "logps/rejected": -0.9143346548080444,
334
+ "loss": 0.7287,
335
+ "rewards/accuracies": 0.4937500059604645,
336
+ "rewards/chosen": 0.021665044128894806,
337
+ "rewards/margins": -0.03331884369254112,
338
+ "rewards/rejected": 0.05498389154672623,
339
  "step": 210
340
  },
341
  {
342
+ "epoch": 0.43,
343
+ "grad_norm": 1814.5837493015931,
344
+ "learning_rate": 7.099445507801324e-10,
345
+ "logits/chosen": -1.6374841928482056,
346
+ "logits/rejected": -1.590040922164917,
347
+ "logps/chosen": -0.9993332624435425,
348
+ "logps/rejected": -0.9410373568534851,
349
+ "loss": 0.7117,
350
+ "rewards/accuracies": 0.543749988079071,
351
+ "rewards/chosen": 0.03604020178318024,
352
+ "rewards/margins": 0.030328240245580673,
353
+ "rewards/rejected": 0.005711960140615702,
354
  "step": 220
355
  },
356
  {
357
+ "epoch": 0.45,
358
+ "grad_norm": 2021.643245419743,
359
+ "learning_rate": 6.7876250391152e-10,
360
+ "logits/chosen": -1.5899055004119873,
361
+ "logits/rejected": -1.5487146377563477,
362
+ "logps/chosen": -0.9511580467224121,
363
+ "logps/rejected": -0.9713302850723267,
364
+ "loss": 0.7297,
365
+ "rewards/accuracies": 0.518750011920929,
366
+ "rewards/chosen": 0.03084571287035942,
367
+ "rewards/margins": 0.06426262110471725,
368
+ "rewards/rejected": -0.03341691941022873,
369
  "step": 230
370
  },
371
  {
372
+ "epoch": 0.46,
373
+ "grad_norm": 2420.5535415129702,
374
+ "learning_rate": 6.467612865519674e-10,
375
+ "logits/chosen": -1.6261504888534546,
376
+ "logits/rejected": -1.6002610921859741,
377
+ "logps/chosen": -0.9878660440444946,
378
+ "logps/rejected": -0.9101985692977905,
379
+ "loss": 0.7436,
380
+ "rewards/accuracies": 0.48124998807907104,
381
+ "rewards/chosen": -0.028089094907045364,
382
+ "rewards/margins": -0.04090605676174164,
383
+ "rewards/rejected": 0.012816962786018848,
384
  "step": 240
385
  },
386
  {
387
+ "epoch": 0.48,
388
+ "grad_norm": 1826.8127431178382,
389
+ "learning_rate": 6.14087542725593e-10,
390
+ "logits/chosen": -1.6483690738677979,
391
+ "logits/rejected": -1.6149110794067383,
392
+ "logps/chosen": -1.0198554992675781,
393
+ "logps/rejected": -0.9351280331611633,
394
+ "loss": 0.7446,
395
+ "rewards/accuracies": 0.48750001192092896,
396
+ "rewards/chosen": 0.016190512105822563,
397
+ "rewards/margins": -7.110387377906591e-05,
398
+ "rewards/rejected": 0.016261618584394455,
399
  "step": 250
400
  },
401
  {
402
+ "epoch": 0.5,
403
+ "grad_norm": 1733.386808938765,
404
+ "learning_rate": 5.808909982763825e-10,
405
+ "logits/chosen": -1.6453545093536377,
406
+ "logits/rejected": -1.554999828338623,
407
+ "logps/chosen": -0.9937711954116821,
408
+ "logps/rejected": -0.9480551481246948,
409
+ "loss": 0.737,
410
+ "rewards/accuracies": 0.4437499940395355,
411
+ "rewards/chosen": 0.027714818716049194,
412
+ "rewards/margins": -0.043810728937387466,
413
+ "rewards/rejected": 0.07152555137872696,
414
  "step": 260
415
  },
416
  {
417
+ "epoch": 0.52,
418
+ "grad_norm": 1823.0310181293855,
419
+ "learning_rate": 5.473237747567806e-10,
420
+ "logits/chosen": -1.6353635787963867,
421
+ "logits/rejected": -1.5713129043579102,
422
+ "logps/chosen": -0.9697279930114746,
423
+ "logps/rejected": -0.957770824432373,
424
+ "loss": 0.7381,
425
+ "rewards/accuracies": 0.5375000238418579,
426
+ "rewards/chosen": -0.038542490452528,
427
+ "rewards/margins": -0.031249618157744408,
428
+ "rewards/rejected": -0.007292867638170719,
429
  "step": 270
430
  },
431
  {
432
+ "epoch": 0.54,
433
+ "grad_norm": 1896.714047804176,
434
+ "learning_rate": 5.135396923380673e-10,
435
+ "logits/chosen": -1.5689613819122314,
436
+ "logits/rejected": -1.4959182739257812,
437
+ "logps/chosen": -0.9839603304862976,
438
+ "logps/rejected": -0.9465781450271606,
439
+ "loss": 0.7359,
440
+ "rewards/accuracies": 0.518750011920929,
441
+ "rewards/chosen": 0.06361061334609985,
442
+ "rewards/margins": 0.005773247219622135,
443
+ "rewards/rejected": 0.05783736705780029,
444
  "step": 280
445
  },
446
  {
447
+ "epoch": 0.56,
448
+ "grad_norm": 2320.8508484306008,
449
+ "learning_rate": 4.796935649368935e-10,
450
+ "logits/chosen": -1.574792504310608,
451
+ "logits/rejected": -1.4957481622695923,
452
+ "logps/chosen": -1.0472533702850342,
453
+ "logps/rejected": -0.993044376373291,
454
+ "loss": 0.7367,
455
+ "rewards/accuracies": 0.48750001192092896,
456
+ "rewards/chosen": 0.014282060787081718,
457
+ "rewards/margins": 0.02767084166407585,
458
+ "rewards/rejected": -0.013388775289058685,
459
  "step": 290
460
  },
461
  {
462
+ "epoch": 0.58,
463
+ "grad_norm": 1825.0797751905636,
464
+ "learning_rate": 4.4594049078802925e-10,
465
+ "logits/chosen": -1.5986864566802979,
466
+ "logits/rejected": -1.5016899108886719,
467
+ "logps/chosen": -0.9563978314399719,
468
+ "logps/rejected": -0.9198349118232727,
469
+ "loss": 0.7259,
470
+ "rewards/accuracies": 0.5375000238418579,
471
+ "rewards/chosen": 0.09224653244018555,
472
+ "rewards/margins": 0.07078908383846283,
473
+ "rewards/rejected": 0.021457448601722717,
474
  "step": 300
475
  },
476
  {
477
+ "epoch": 0.6,
478
+ "grad_norm": 2580.7259376510724,
479
+ "learning_rate": 4.1243514171423466e-10,
480
+ "logits/chosen": -1.5819472074508667,
481
+ "logits/rejected": -1.5410352945327759,
482
+ "logps/chosen": -0.9796406030654907,
483
+ "logps/rejected": -0.9497900009155273,
484
+ "loss": 0.7348,
485
+ "rewards/accuracies": 0.5,
486
+ "rewards/chosen": 0.03940460830926895,
487
+ "rewards/margins": -0.009207022376358509,
488
+ "rewards/rejected": 0.048611629754304886,
489
  "step": 310
490
  },
491
  {
492
+ "epoch": 0.62,
493
+ "grad_norm": 1729.5664068861677,
494
+ "learning_rate": 3.793310543501473e-10,
495
+ "logits/chosen": -1.6621357202529907,
496
+ "logits/rejected": -1.5896662473678589,
497
+ "logps/chosen": -0.9787474870681763,
498
+ "logps/rejected": -0.9559493064880371,
499
+ "loss": 0.7398,
500
+ "rewards/accuracies": 0.5249999761581421,
501
+ "rewards/chosen": -0.022534608840942383,
502
+ "rewards/margins": 0.009389793500304222,
503
+ "rewards/rejected": -0.031924404203891754,
504
  "step": 320
505
  },
506
  {
507
+ "epoch": 0.64,
508
+ "grad_norm": 2107.6254901282878,
509
+ "learning_rate": 3.4677992656811053e-10,
510
+ "logits/chosen": -1.6322323083877563,
511
+ "logits/rejected": -1.6013027429580688,
512
+ "logps/chosen": -1.018520712852478,
513
+ "logps/rejected": -0.9697484970092773,
514
+ "loss": 0.7397,
515
+ "rewards/accuracies": 0.512499988079071,
516
+ "rewards/chosen": -0.016207193955779076,
517
+ "rewards/margins": -0.05724817514419556,
518
+ "rewards/rejected": 0.04104097932577133,
519
  "step": 330
520
  },
521
  {
522
+ "epoch": 0.66,
523
+ "grad_norm": 2346.5024165631285,
524
+ "learning_rate": 3.149309223300428e-10,
525
+ "logits/chosen": -1.5328480005264282,
526
+ "logits/rejected": -1.5037202835083008,
527
+ "logps/chosen": -1.0620964765548706,
528
+ "logps/rejected": -0.9756827354431152,
529
+ "loss": 0.7423,
530
+ "rewards/accuracies": 0.5562499761581421,
531
+ "rewards/chosen": 0.051995206624269485,
532
+ "rewards/margins": 0.0909401923418045,
533
+ "rewards/rejected": -0.03894497826695442,
534
  "step": 340
535
  },
536
  {
537
+ "epoch": 0.68,
538
+ "grad_norm": 1860.3254470648908,
539
+ "learning_rate": 2.8392998815082717e-10,
540
+ "logits/chosen": -1.6585187911987305,
541
+ "logits/rejected": -1.5651119947433472,
542
+ "logps/chosen": -1.0541408061981201,
543
+ "logps/rejected": -1.0554534196853638,
544
+ "loss": 0.7345,
545
+ "rewards/accuracies": 0.6000000238418579,
546
+ "rewards/chosen": 0.07690231502056122,
547
+ "rewards/margins": 0.0986892357468605,
548
+ "rewards/rejected": -0.02178690955042839,
549
  "step": 350
550
  },
551
  {
552
+ "epoch": 0.7,
553
+ "grad_norm": 1848.6713797847476,
554
+ "learning_rate": 2.5391918430549634e-10,
555
+ "logits/chosen": -1.6919893026351929,
556
+ "logits/rejected": -1.630860686302185,
557
+ "logps/chosen": -1.0240895748138428,
558
+ "logps/rejected": -0.9422369003295898,
559
+ "loss": 0.7225,
560
+ "rewards/accuracies": 0.48124998807907104,
561
+ "rewards/chosen": 0.052596330642700195,
562
+ "rewards/margins": 0.016732681542634964,
563
+ "rewards/rejected": 0.03586364910006523,
564
  "step": 360
565
  },
566
  {
567
+ "epoch": 0.72,
568
+ "grad_norm": 1745.482314493582,
569
+ "learning_rate": 2.250360338449226e-10,
570
+ "logits/chosen": -1.7182960510253906,
571
+ "logits/rejected": -1.705248236656189,
572
+ "logps/chosen": -0.9747020602226257,
573
+ "logps/rejected": -0.927233874797821,
574
+ "loss": 0.716,
575
+ "rewards/accuracies": 0.5375000238418579,
576
+ "rewards/chosen": 0.005029407795518637,
577
+ "rewards/margins": 0.09355296194553375,
578
+ "rewards/rejected": -0.08852354437112808,
579
  "step": 370
580
  },
581
  {
582
+ "epoch": 0.74,
583
+ "grad_norm": 1796.573665803334,
584
+ "learning_rate": 1.9741289240311756e-10,
585
+ "logits/chosen": -1.6247894763946533,
586
+ "logits/rejected": -1.5761014223098755,
587
+ "logps/chosen": -0.9921888113021851,
588
+ "logps/rejected": -0.9521619081497192,
589
+ "loss": 0.7394,
590
+ "rewards/accuracies": 0.518750011920929,
591
+ "rewards/chosen": 0.03736250475049019,
592
+ "rewards/margins": -0.005258217919617891,
593
+ "rewards/rejected": 0.04262072592973709,
594
  "step": 380
595
  },
596
  {
597
+ "epoch": 0.76,
598
+ "grad_norm": 1721.8345915070997,
599
+ "learning_rate": 1.7117634168396773e-10,
600
+ "logits/chosen": -1.620558500289917,
601
+ "logits/rejected": -1.5566461086273193,
602
+ "logps/chosen": -1.003068208694458,
603
+ "logps/rejected": -0.9802311658859253,
604
+ "loss": 0.7272,
605
+ "rewards/accuracies": 0.518750011920929,
606
+ "rewards/chosen": 0.0402844175696373,
607
+ "rewards/margins": 0.06007431820034981,
608
+ "rewards/rejected": -0.01978989504277706,
609
  "step": 390
610
  },
611
  {
612
+ "epoch": 0.77,
613
+ "grad_norm": 1711.1281038444054,
614
+ "learning_rate": 1.4644660940672628e-10,
615
+ "logits/chosen": -1.6632341146469116,
616
+ "logits/rejected": -1.5769034624099731,
617
+ "logps/chosen": -0.996769905090332,
618
+ "logps/rejected": -0.9531647562980652,
619
+ "loss": 0.7362,
620
+ "rewards/accuracies": 0.4749999940395355,
621
+ "rewards/chosen": 0.02927534654736519,
622
+ "rewards/margins": 0.02135869301855564,
623
+ "rewards/rejected": 0.007916651666164398,
624
  "step": 400
625
  },
626
  {
627
+ "epoch": 0.79,
628
+ "grad_norm": 2141.9848630404285,
629
+ "learning_rate": 1.2333701836832813e-10,
630
+ "logits/chosen": -1.6260700225830078,
631
+ "logits/rejected": -1.5618007183074951,
632
+ "logps/chosen": -0.9725567698478699,
633
+ "logps/rejected": -0.9434686899185181,
634
+ "loss": 0.7235,
635
+ "rewards/accuracies": 0.53125,
636
+ "rewards/chosen": -0.033636633306741714,
637
+ "rewards/margins": -0.02703043445944786,
638
+ "rewards/rejected": -0.006606197915971279,
639
  "step": 410
640
  },
641
  {
642
+ "epoch": 0.81,
643
+ "grad_norm": 2481.4760711884373,
644
+ "learning_rate": 1.0195346714717813e-10,
645
+ "logits/chosen": -1.5375875234603882,
646
+ "logits/rejected": -1.5268070697784424,
647
+ "logps/chosen": -0.9649657011032104,
648
+ "logps/rejected": -0.9462583661079407,
649
+ "loss": 0.7543,
650
+ "rewards/accuracies": 0.4625000059604645,
651
+ "rewards/chosen": -0.02682650461792946,
652
+ "rewards/margins": -0.07992960512638092,
653
+ "rewards/rejected": 0.05310310050845146,
654
  "step": 420
655
  },
656
  {
657
+ "epoch": 0.83,
658
+ "grad_norm": 1967.5434793497254,
659
+ "learning_rate": 8.239394482805996e-11,
660
+ "logits/chosen": -1.5937187671661377,
661
+ "logits/rejected": -1.5494930744171143,
662
+ "logps/chosen": -1.0188677310943604,
663
+ "logps/rejected": -0.9621112942695618,
664
+ "loss": 0.7302,
665
+ "rewards/accuracies": 0.48124998807907104,
666
+ "rewards/chosen": 0.057189978659152985,
667
+ "rewards/margins": 0.0319669134914875,
668
+ "rewards/rejected": 0.02522306516766548,
669
  "step": 430
670
  },
671
  {
672
+ "epoch": 0.85,
673
+ "grad_norm": 1970.7510190413827,
674
+ "learning_rate": 6.474808197191401e-11,
675
+ "logits/chosen": -1.631608247756958,
676
+ "logits/rejected": -1.5823745727539062,
677
+ "logps/chosen": -1.0462229251861572,
678
+ "logps/rejected": -0.9751529693603516,
679
+ "loss": 0.7293,
680
+ "rewards/accuracies": 0.59375,
681
+ "rewards/chosen": 0.0702277421951294,
682
+ "rewards/margins": 0.0910101979970932,
683
+ "rewards/rejected": -0.020782459527254105,
684
  "step": 440
685
  },
686
  {
687
+ "epoch": 0.87,
688
+ "grad_norm": 2032.2295716484484,
689
+ "learning_rate": 4.9096739888146e-11,
690
+ "logits/chosen": -1.622164011001587,
691
+ "logits/rejected": -1.529624342918396,
692
+ "logps/chosen": -1.013816475868225,
693
+ "logps/rejected": -0.9728986620903015,
694
+ "loss": 0.7463,
695
+ "rewards/accuracies": 0.48124998807907104,
696
+ "rewards/chosen": 0.07482706010341644,
697
+ "rewards/margins": -0.03386348485946655,
698
+ "rewards/rejected": 0.10869055986404419,
699
  "step": 450
700
  },
701
  {
702
+ "epoch": 0.89,
703
+ "grad_norm": 1776.741650731322,
704
+ "learning_rate": 3.5511640091604293e-11,
705
+ "logits/chosen": -1.5639044046401978,
706
+ "logits/rejected": -1.5262724161148071,
707
+ "logps/chosen": -1.0560028553009033,
708
+ "logps/rejected": -0.9387346506118774,
709
+ "loss": 0.7216,
710
+ "rewards/accuracies": 0.5625,
711
+ "rewards/chosen": 0.04164951294660568,
712
+ "rewards/margins": 0.14367111027240753,
713
+ "rewards/rejected": -0.10202159732580185,
714
  "step": 460
715
  },
716
  {
717
+ "epoch": 0.91,
718
+ "grad_norm": 1907.6345668857712,
719
+ "learning_rate": 2.4055035642222225e-11,
720
+ "logits/chosen": -1.6381734609603882,
721
+ "logits/rejected": -1.5699491500854492,
722
+ "logps/chosen": -0.9839171171188354,
723
+ "logps/rejected": -0.9137029647827148,
724
+ "loss": 0.7333,
725
+ "rewards/accuracies": 0.48124998807907104,
726
+ "rewards/chosen": 0.05203929543495178,
727
+ "rewards/margins": 0.0004328913928475231,
728
+ "rewards/rejected": 0.05160640552639961,
729
  "step": 470
730
  },
731
  {
732
+ "epoch": 0.93,
733
+ "grad_norm": 1909.400973276575,
734
+ "learning_rate": 1.477942587339426e-11,
735
+ "logits/chosen": -1.609994649887085,
736
+ "logits/rejected": -1.5665223598480225,
737
+ "logps/chosen": -0.9485662579536438,
738
+ "logps/rejected": -0.9070123434066772,
739
+ "loss": 0.7389,
740
+ "rewards/accuracies": 0.5062500238418579,
741
+ "rewards/chosen": 0.026085853576660156,
742
+ "rewards/margins": -0.039593569934368134,
743
+ "rewards/rejected": 0.06567941606044769,
744
  "step": 480
745
  },
746
  {
747
+ "epoch": 0.95,
748
+ "grad_norm": 1986.3370377223296,
749
+ "learning_rate": 7.727315816331515e-12,
750
+ "logits/chosen": -1.656057357788086,
751
+ "logits/rejected": -1.605974793434143,
752
+ "logps/chosen": -1.0781848430633545,
753
+ "logps/rejected": -0.9999237060546875,
754
+ "loss": 0.733,
755
+ "rewards/accuracies": 0.4625000059604645,
756
+ "rewards/chosen": -0.023666301742196083,
757
+ "rewards/margins": -0.07170303910970688,
758
+ "rewards/rejected": 0.04803674668073654,
759
  "step": 490
760
  },
761
  {
762
+ "epoch": 0.97,
763
+ "grad_norm": 1606.2598352691068,
764
+ "learning_rate": 2.9310214228202016e-12,
765
+ "logits/chosen": -1.6220519542694092,
766
+ "logits/rejected": -1.5478252172470093,
767
+ "logps/chosen": -0.9590246081352234,
768
+ "logps/rejected": -0.9175594449043274,
769
+ "loss": 0.7359,
770
+ "rewards/accuracies": 0.45625001192092896,
771
+ "rewards/chosen": -0.058479368686676025,
772
+ "rewards/margins": -0.08338940888643265,
773
+ "rewards/rejected": 0.024910034611821175,
774
  "step": 500
775
  },
776
  {
777
+ "epoch": 0.99,
778
+ "grad_norm": 2004.3062556110438,
779
+ "learning_rate": 4.125214789427734e-13,
780
+ "logits/chosen": -1.5623807907104492,
781
+ "logits/rejected": -1.499871015548706,
782
+ "logps/chosen": -0.993099570274353,
783
+ "logps/rejected": -0.9391816854476929,
784
+ "loss": 0.7238,
785
+ "rewards/accuracies": 0.512499988079071,
786
+ "rewards/chosen": 0.05033453181385994,
787
+ "rewards/margins": 0.11875885725021362,
788
+ "rewards/rejected": -0.06842432916164398,
789
  "step": 510
790
  },
791
  {
792
+ "epoch": 1.0,
793
+ "step": 516,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  "total_flos": 0.0,
795
+ "train_loss": 0.7340851113084674,
796
+ "train_runtime": 8012.0764,
797
+ "train_samples_per_second": 8.248,
798
+ "train_steps_per_second": 0.064
799
  }
800
  ],
801
  "logging_steps": 10,
802
+ "max_steps": 516,
803
  "num_input_tokens_seen": 0,
804
  "num_train_epochs": 1,
805
  "save_steps": 100,
 
 
 
 
 
 
 
 
 
 
 
 
806
  "total_flos": 0.0,
807
  "train_batch_size": 4,
808
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ea88b5abf47b03971988b97f0d835c6b3e16e6b3ba4dd4ea50e5d4a14e5831b
3
  size 6328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b0f084c2495920c1f1c1e0d0165b9a5be48649d9a063da77553719561d5c998
3
  size 6328