wzhouad commited on
Commit
1868c31
1 Parent(s): a87993b

Model save

Browse files
README.md CHANGED
@@ -37,7 +37,7 @@ The following hyperparameters were used during training:
37
  - learning_rate: 5e-07
38
  - train_batch_size: 8
39
  - eval_batch_size: 8
40
- - seed: 4
41
  - distributed_type: multi-GPU
42
  - num_devices: 8
43
  - gradient_accumulation_steps: 2
 
37
  - learning_rate: 5e-07
38
  - train_batch_size: 8
39
  - eval_batch_size: 8
40
+ - seed: 2
41
  - distributed_type: multi-GPU
42
  - num_devices: 8
43
  - gradient_accumulation_steps: 2
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.3777129889306127,
4
- "train_runtime": 2383.9738,
5
- "train_samples": 39494,
6
- "train_samples_per_second": 16.566,
7
- "train_steps_per_second": 0.13
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.08395375216871263,
4
+ "train_runtime": 6990.4881,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 16.169,
7
+ "train_steps_per_second": 0.126
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e3cd7fb95d1f8e7496d67ae6da91127ac1b68d984262c173d0dbc9c4fdefec2
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b44ac21cf9cdfda1a74fca2e9fd5cf72676aa2a876bfbde6ce2b6be5cd738446
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51c8d3d5862def84d35eb99192c611bae08c9b3b157b6e6571ab9f2773bfc47d
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:833f242819a8426b1cff793b3130f0a72468fcea63b2a2626da5b05c7d7c7bf1
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2533672c62e1b1d85a9920d93d7222ed657bc44d2ceea09665a7bb19e029dc5d
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2af6c791fd1e731817d3206d95755c8d6dcfdc8d1654a6f08896b5156fe0547a
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.3777129889306127,
4
- "train_runtime": 2383.9738,
5
- "train_samples": 39494,
6
- "train_samples_per_second": 16.566,
7
- "train_steps_per_second": 0.13
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.08395375216871263,
4
+ "train_runtime": 6990.4881,
5
+ "train_samples": 113028,
6
+ "train_samples_per_second": 16.169,
7
+ "train_steps_per_second": 0.126
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 1000,
6
- "global_step": 309,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.6129032258064514e-08,
14
- "logits/chosen": -2.861602783203125,
15
- "logits/rejected": -2.8706541061401367,
16
- "logps/chosen": -108.31307983398438,
17
- "logps/rejected": -122.1865234375,
18
- "loss": 0.5468,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,437 +23,1249 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.03,
27
- "learning_rate": 1.6129032258064515e-07,
28
- "logits/chosen": -2.7833333015441895,
29
- "logits/rejected": -2.801340103149414,
30
- "logps/chosen": -103.0541000366211,
31
- "logps/rejected": -108.88811492919922,
32
- "loss": 0.5605,
33
- "rewards/accuracies": 0.5,
34
- "rewards/chosen": 0.0004288229683879763,
35
- "rewards/margins": 0.00026609853375703096,
36
- "rewards/rejected": 0.0001627243764232844,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.06,
41
- "learning_rate": 3.225806451612903e-07,
42
- "logits/chosen": -2.798549175262451,
43
- "logits/rejected": -2.7971768379211426,
44
- "logps/chosen": -99.93330383300781,
45
- "logps/rejected": -100.25025177001953,
46
- "loss": 0.5601,
47
- "rewards/accuracies": 0.5874999761581421,
48
- "rewards/chosen": 0.0004881693166680634,
49
- "rewards/margins": 0.0005250017857179046,
50
- "rewards/rejected": -3.683247996377759e-05,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.1,
55
- "learning_rate": 4.838709677419355e-07,
56
- "logits/chosen": -2.816439390182495,
57
- "logits/rejected": -2.815070867538452,
58
- "logps/chosen": -100.30064392089844,
59
- "logps/rejected": -93.25160217285156,
60
- "loss": 0.5575,
61
- "rewards/accuracies": 0.5375000238418579,
62
- "rewards/chosen": 0.0009209408308379352,
63
- "rewards/margins": 0.001128857722505927,
64
- "rewards/rejected": -0.00020791687711607665,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.13,
69
- "learning_rate": 4.987080943856886e-07,
70
- "logits/chosen": -2.8334474563598633,
71
- "logits/rejected": -2.8349099159240723,
72
- "logps/chosen": -99.36518859863281,
73
- "logps/rejected": -105.44466400146484,
74
- "loss": 0.5683,
75
- "rewards/accuracies": 0.59375,
76
- "rewards/chosen": 0.01019326876848936,
77
- "rewards/margins": 0.00500866025686264,
78
- "rewards/rejected": 0.005184608977288008,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.16,
83
- "learning_rate": 4.942593872763566e-07,
84
- "logits/chosen": -2.802186965942383,
85
- "logits/rejected": -2.7889404296875,
86
- "logps/chosen": -88.57516479492188,
87
- "logps/rejected": -89.25424194335938,
88
- "loss": 0.5728,
89
- "rewards/accuracies": 0.5874999761581421,
90
- "rewards/chosen": 0.017298558726906776,
91
- "rewards/margins": 0.008996127173304558,
92
- "rewards/rejected": 0.008302430622279644,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.19,
97
- "learning_rate": 4.866946677079314e-07,
98
- "logits/chosen": -2.828808307647705,
99
- "logits/rejected": -2.8317344188690186,
100
- "logps/chosen": -85.52920532226562,
101
- "logps/rejected": -90.01313781738281,
102
- "loss": 0.6048,
103
- "rewards/accuracies": 0.606249988079071,
104
- "rewards/chosen": 0.05936245992779732,
105
- "rewards/margins": 0.013923106715083122,
106
- "rewards/rejected": 0.045439351350069046,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.23,
111
- "learning_rate": 4.7611043866720737e-07,
112
- "logits/chosen": -2.7826099395751953,
113
- "logits/rejected": -2.7798409461975098,
114
- "logps/chosen": -91.76524353027344,
115
- "logps/rejected": -97.0423583984375,
116
- "loss": 0.5521,
117
- "rewards/accuracies": 0.65625,
118
- "rewards/chosen": 0.02179296873509884,
119
- "rewards/margins": 0.02971211075782776,
120
- "rewards/rejected": -0.007919139228761196,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.26,
125
- "learning_rate": 4.6264172296714e-07,
126
- "logits/chosen": -2.759918689727783,
127
- "logits/rejected": -2.766831874847412,
128
- "logps/chosen": -101.35821533203125,
129
- "logps/rejected": -112.35334777832031,
130
- "loss": 0.4972,
131
- "rewards/accuracies": 0.550000011920929,
132
- "rewards/chosen": -0.044755686074495316,
133
- "rewards/margins": 0.022299829870462418,
134
- "rewards/rejected": -0.06705550849437714,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.29,
139
- "learning_rate": 4.4646034076333254e-07,
140
- "logits/chosen": -2.7790584564208984,
141
- "logits/rejected": -2.76528263092041,
142
- "logps/chosen": -117.88861083984375,
143
- "logps/rejected": -124.70438385009766,
144
- "loss": 0.4598,
145
- "rewards/accuracies": 0.5375000238418579,
146
- "rewards/chosen": -0.11965729296207428,
147
- "rewards/margins": 0.023207509890198708,
148
- "rewards/rejected": -0.14286477863788605,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.32,
153
- "learning_rate": 4.27772717647508e-07,
154
- "logits/chosen": -2.737382650375366,
155
- "logits/rejected": -2.748401641845703,
156
- "logps/chosen": -119.01786804199219,
157
- "logps/rejected": -131.80020141601562,
158
- "loss": 0.4473,
159
- "rewards/accuracies": 0.6312500238418579,
160
- "rewards/chosen": -0.135704904794693,
161
- "rewards/margins": 0.061350900679826736,
162
- "rewards/rejected": -0.19705583155155182,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.36,
167
- "learning_rate": 4.068172512800759e-07,
168
- "logits/chosen": -2.703746795654297,
169
- "logits/rejected": -2.698978900909424,
170
- "logps/chosen": -125.7964096069336,
171
- "logps/rejected": -135.111328125,
172
- "loss": 0.4126,
173
- "rewards/accuracies": 0.5874999761581421,
174
- "rewards/chosen": -0.18998780846595764,
175
- "rewards/margins": 0.04047433286905289,
176
- "rewards/rejected": -0.23046214878559113,
177
  "step": 110
178
  },
179
  {
180
- "epoch": 0.39,
181
- "learning_rate": 3.8386127015561377e-07,
182
- "logits/chosen": -2.728740930557251,
183
- "logits/rejected": -2.703545331954956,
184
- "logps/chosen": -120.62841796875,
185
- "logps/rejected": -128.31187438964844,
186
- "loss": 0.4431,
187
- "rewards/accuracies": 0.606249988079071,
188
- "rewards/chosen": -0.15412963926792145,
189
- "rewards/margins": 0.07443422079086304,
190
- "rewards/rejected": -0.22856386005878448,
191
  "step": 120
192
  },
193
  {
194
- "epoch": 0.42,
195
- "learning_rate": 3.591976232982355e-07,
196
- "logits/chosen": -2.649157762527466,
197
- "logits/rejected": -2.644134044647217,
198
- "logps/chosen": -100.65953826904297,
199
- "logps/rejected": -113.25675964355469,
200
- "loss": 0.4739,
201
- "rewards/accuracies": 0.5874999761581421,
202
- "rewards/chosen": -0.10916206985712051,
203
- "rewards/margins": 0.07735568284988403,
204
- "rewards/rejected": -0.18651774525642395,
205
  "step": 130
206
  },
207
  {
208
- "epoch": 0.45,
209
- "learning_rate": 3.33140944392039e-07,
210
- "logits/chosen": -2.664492607116699,
211
- "logits/rejected": -2.631776809692383,
212
- "logps/chosen": -142.42654418945312,
213
- "logps/rejected": -146.1369171142578,
214
- "loss": 0.4342,
215
- "rewards/accuracies": 0.606249988079071,
216
- "rewards/chosen": -0.24194404482841492,
217
- "rewards/margins": 0.08811615407466888,
218
- "rewards/rejected": -0.3300601840019226,
219
  "step": 140
220
  },
221
  {
222
- "epoch": 0.49,
223
- "learning_rate": 3.060236380050519e-07,
224
- "logits/chosen": -2.662078380584717,
225
- "logits/rejected": -2.667717456817627,
226
- "logps/chosen": -139.26730346679688,
227
- "logps/rejected": -157.76010131835938,
228
- "loss": 0.3688,
229
- "rewards/accuracies": 0.5874999761581421,
230
- "rewards/chosen": -0.3599378764629364,
231
- "rewards/margins": 0.12340853363275528,
232
- "rewards/rejected": -0.48334646224975586,
233
  "step": 150
234
  },
235
  {
236
- "epoch": 0.52,
237
- "learning_rate": 2.781916391103417e-07,
238
- "logits/chosen": -2.6194424629211426,
239
- "logits/rejected": -2.588695526123047,
240
- "logps/chosen": -162.62075805664062,
241
- "logps/rejected": -163.50631713867188,
242
- "loss": 0.302,
243
- "rewards/accuracies": 0.5874999761581421,
244
- "rewards/chosen": -0.5319477319717407,
245
- "rewards/margins": 0.09751905500888824,
246
- "rewards/rejected": -0.6294667720794678,
247
  "step": 160
248
  },
249
  {
250
- "epoch": 0.55,
251
- "learning_rate": 2.5e-07,
252
- "logits/chosen": -2.631842851638794,
253
- "logits/rejected": -2.6272921562194824,
254
- "logps/chosen": -160.49258422851562,
255
- "logps/rejected": -172.91624450683594,
256
- "loss": 0.3113,
257
- "rewards/accuracies": 0.550000011920929,
258
- "rewards/chosen": -0.5451255440711975,
259
- "rewards/margins": 0.0745776817202568,
260
- "rewards/rejected": -0.6197031736373901,
261
  "step": 170
262
  },
263
  {
264
- "epoch": 0.58,
265
- "learning_rate": 2.218083608896583e-07,
266
- "logits/chosen": -2.5954980850219727,
267
- "logits/rejected": -2.6070828437805176,
268
- "logps/chosen": -163.6999969482422,
269
- "logps/rejected": -186.43154907226562,
270
- "loss": 0.2944,
271
- "rewards/accuracies": 0.5874999761581421,
272
- "rewards/chosen": -0.5556995868682861,
273
- "rewards/margins": 0.18364927172660828,
274
- "rewards/rejected": -0.7393488883972168,
275
  "step": 180
276
  },
277
  {
278
- "epoch": 0.61,
279
- "learning_rate": 1.9397636199494806e-07,
280
- "logits/chosen": -2.557199716567993,
281
- "logits/rejected": -2.5558159351348877,
282
- "logps/chosen": -162.37643432617188,
283
- "logps/rejected": -181.76522827148438,
284
- "loss": 0.2797,
285
- "rewards/accuracies": 0.5687500238418579,
286
- "rewards/chosen": -0.5982151627540588,
287
- "rewards/margins": 0.14243794977664948,
288
- "rewards/rejected": -0.7406530976295471,
289
  "step": 190
290
  },
291
  {
292
- "epoch": 0.65,
293
- "learning_rate": 1.6685905560796098e-07,
294
- "logits/chosen": -2.5707640647888184,
295
- "logits/rejected": -2.5763020515441895,
296
- "logps/chosen": -155.4540252685547,
297
- "logps/rejected": -169.99282836914062,
298
- "loss": 0.2629,
299
- "rewards/accuracies": 0.625,
300
- "rewards/chosen": -0.6057673692703247,
301
- "rewards/margins": 0.10265880823135376,
302
- "rewards/rejected": -0.7084261178970337,
303
  "step": 200
304
  },
305
  {
306
- "epoch": 0.68,
307
- "learning_rate": 1.4080237670176453e-07,
308
- "logits/chosen": -2.573857545852661,
309
- "logits/rejected": -2.563162088394165,
310
- "logps/chosen": -159.9458465576172,
311
- "logps/rejected": -175.49703979492188,
312
- "loss": 0.2737,
313
- "rewards/accuracies": 0.59375,
314
- "rewards/chosen": -0.6361742615699768,
315
- "rewards/margins": 0.15395954251289368,
316
- "rewards/rejected": -0.7901338338851929,
317
  "step": 210
318
  },
319
  {
320
- "epoch": 0.71,
321
- "learning_rate": 1.1613872984438628e-07,
322
- "logits/chosen": -2.5723612308502197,
323
- "logits/rejected": -2.554568290710449,
324
- "logps/chosen": -171.87423706054688,
325
- "logps/rejected": -192.27273559570312,
326
- "loss": 0.2636,
327
- "rewards/accuracies": 0.6187499761581421,
328
- "rewards/chosen": -0.7052888870239258,
329
- "rewards/margins": 0.1558634340763092,
330
- "rewards/rejected": -0.8611523509025574,
331
  "step": 220
332
  },
333
  {
334
- "epoch": 0.74,
335
- "learning_rate": 9.318274871992407e-08,
336
- "logits/chosen": -2.5391600131988525,
337
- "logits/rejected": -2.533364772796631,
338
- "logps/chosen": -184.72389221191406,
339
- "logps/rejected": -205.41574096679688,
340
- "loss": 0.2406,
341
- "rewards/accuracies": 0.6312500238418579,
342
- "rewards/chosen": -0.7657197713851929,
343
- "rewards/margins": 0.16964995861053467,
344
- "rewards/rejected": -0.9353697896003723,
345
  "step": 230
346
  },
347
  {
348
- "epoch": 0.78,
349
- "learning_rate": 7.222728235249195e-08,
350
- "logits/chosen": -2.4788975715637207,
351
- "logits/rejected": -2.4929776191711426,
352
- "logps/chosen": -185.28515625,
353
- "logps/rejected": -199.28814697265625,
354
- "loss": 0.2322,
355
- "rewards/accuracies": 0.6000000238418579,
356
- "rewards/chosen": -0.7988831996917725,
357
- "rewards/margins": 0.1449696272611618,
358
- "rewards/rejected": -0.9438527822494507,
359
  "step": 240
360
  },
361
  {
362
- "epoch": 0.81,
363
- "learning_rate": 5.353965923666742e-08,
364
- "logits/chosen": -2.5501561164855957,
365
- "logits/rejected": -2.537123441696167,
366
- "logps/chosen": -177.23464965820312,
367
- "logps/rejected": -201.24827575683594,
368
- "loss": 0.222,
369
- "rewards/accuracies": 0.612500011920929,
370
- "rewards/chosen": -0.8073725700378418,
371
- "rewards/margins": 0.1908084899187088,
372
- "rewards/rejected": -0.9981809854507446,
373
  "step": 250
374
  },
375
  {
376
- "epoch": 0.84,
377
- "learning_rate": 3.7358277032860016e-08,
378
- "logits/chosen": -2.6011037826538086,
379
- "logits/rejected": -2.5914735794067383,
380
- "logps/chosen": -193.97116088867188,
381
- "logps/rejected": -208.1275177001953,
382
- "loss": 0.2522,
383
- "rewards/accuracies": 0.6312500238418579,
384
- "rewards/chosen": -0.7856981158256531,
385
- "rewards/margins": 0.17516431212425232,
386
- "rewards/rejected": -0.9608623385429382,
387
  "step": 260
388
  },
389
  {
390
- "epoch": 0.87,
391
- "learning_rate": 2.3889561332792657e-08,
392
- "logits/chosen": -2.5599663257598877,
393
- "logits/rejected": -2.547809362411499,
394
- "logps/chosen": -166.72702026367188,
395
- "logps/rejected": -184.97462463378906,
396
- "loss": 0.2529,
397
- "rewards/accuracies": 0.606249988079071,
398
- "rewards/chosen": -0.6648862361907959,
399
- "rewards/margins": 0.1466326266527176,
400
- "rewards/rejected": -0.8115188479423523,
401
  "step": 270
402
  },
403
  {
404
- "epoch": 0.91,
405
- "learning_rate": 1.3305332292068705e-08,
406
- "logits/chosen": -2.482130527496338,
407
- "logits/rejected": -2.4812235832214355,
408
- "logps/chosen": -186.01287841796875,
409
- "logps/rejected": -200.09637451171875,
410
- "loss": 0.2508,
411
- "rewards/accuracies": 0.574999988079071,
412
- "rewards/chosen": -0.7596431970596313,
413
- "rewards/margins": 0.09221551567316055,
414
- "rewards/rejected": -0.8518587350845337,
415
  "step": 280
416
  },
417
  {
418
- "epoch": 0.94,
419
- "learning_rate": 5.740612723643401e-09,
420
- "logits/chosen": -2.5973570346832275,
421
- "logits/rejected": -2.5847537517547607,
422
- "logps/chosen": -185.65423583984375,
423
- "logps/rejected": -196.46435546875,
424
- "loss": 0.244,
425
- "rewards/accuracies": 0.512499988079071,
426
- "rewards/chosen": -0.770811915397644,
427
- "rewards/margins": 0.1569802314043045,
428
- "rewards/rejected": -0.9277920722961426,
429
  "step": 290
430
  },
431
  {
432
- "epoch": 0.97,
433
- "learning_rate": 1.2919056143113061e-09,
434
- "logits/chosen": -2.5530037879943848,
435
- "logits/rejected": -2.544957399368286,
436
- "logps/chosen": -184.2729034423828,
437
- "logps/rejected": -192.07211303710938,
438
- "loss": 0.2475,
439
- "rewards/accuracies": 0.6499999761581421,
440
- "rewards/chosen": -0.7763879895210266,
441
- "rewards/margins": 0.143040731549263,
442
- "rewards/rejected": -0.919428825378418,
443
  "step": 300
444
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  {
446
  "epoch": 1.0,
447
- "step": 309,
448
  "total_flos": 0.0,
449
- "train_loss": 0.3777129889306127,
450
- "train_runtime": 2383.9738,
451
- "train_samples_per_second": 16.566,
452
- "train_steps_per_second": 0.13
453
  }
454
  ],
455
  "logging_steps": 10,
456
- "max_steps": 309,
457
  "num_train_epochs": 1,
458
  "save_steps": 1000,
459
  "total_flos": 0.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9994340690435767,
5
  "eval_steps": 1000,
6
+ "global_step": 883,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 5.617977528089887e-09,
14
+ "logits/chosen": -2.7943434715270996,
15
+ "logits/rejected": -2.817823886871338,
16
+ "logps/chosen": -334.107666015625,
17
+ "logps/rejected": -197.05621337890625,
18
+ "loss": 0.353,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.01,
27
+ "learning_rate": 5.617977528089887e-08,
28
+ "logits/chosen": -2.8335936069488525,
29
+ "logits/rejected": -2.782947540283203,
30
+ "logps/chosen": -323.8160400390625,
31
+ "logps/rejected": -189.45599365234375,
32
+ "loss": 0.3374,
33
+ "rewards/accuracies": 0.4652777910232544,
34
+ "rewards/chosen": 0.0004737268900498748,
35
+ "rewards/margins": 0.000818206463009119,
36
+ "rewards/rejected": -0.00034447951475158334,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.02,
41
+ "learning_rate": 1.1235955056179774e-07,
42
+ "logits/chosen": -2.778644561767578,
43
+ "logits/rejected": -2.7627484798431396,
44
+ "logps/chosen": -323.32391357421875,
45
+ "logps/rejected": -168.40980529785156,
46
+ "loss": 0.3385,
47
+ "rewards/accuracies": 0.59375,
48
+ "rewards/chosen": 0.0011631561210379004,
49
+ "rewards/margins": 0.0020370460115373135,
50
+ "rewards/rejected": -0.0008738901233300567,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.03,
55
+ "learning_rate": 1.6853932584269663e-07,
56
+ "logits/chosen": -2.7863869667053223,
57
+ "logits/rejected": -2.7318475246429443,
58
+ "logps/chosen": -305.9979248046875,
59
+ "logps/rejected": -180.11251831054688,
60
+ "loss": 0.3398,
61
+ "rewards/accuracies": 0.6875,
62
+ "rewards/chosen": 0.009158268570899963,
63
+ "rewards/margins": 0.01635834574699402,
64
+ "rewards/rejected": -0.00720007810741663,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.05,
69
+ "learning_rate": 2.2471910112359549e-07,
70
+ "logits/chosen": -2.7181248664855957,
71
+ "logits/rejected": -2.71014142036438,
72
+ "logps/chosen": -314.97698974609375,
73
+ "logps/rejected": -178.74220275878906,
74
+ "loss": 0.3414,
75
+ "rewards/accuracies": 0.637499988079071,
76
+ "rewards/chosen": 0.033507008105516434,
77
+ "rewards/margins": 0.06880000978708267,
78
+ "rewards/rejected": -0.03529299795627594,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.06,
83
+ "learning_rate": 2.8089887640449437e-07,
84
+ "logits/chosen": -2.6565139293670654,
85
+ "logits/rejected": -2.6516811847686768,
86
+ "logps/chosen": -341.38330078125,
87
+ "logps/rejected": -193.6222381591797,
88
+ "loss": 0.352,
89
+ "rewards/accuracies": 0.65625,
90
+ "rewards/chosen": 0.03725530207157135,
91
+ "rewards/margins": 0.15392780303955078,
92
+ "rewards/rejected": -0.11667251586914062,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.07,
97
+ "learning_rate": 3.3707865168539325e-07,
98
+ "logits/chosen": -2.6464648246765137,
99
+ "logits/rejected": -2.6180057525634766,
100
+ "logps/chosen": -292.76690673828125,
101
+ "logps/rejected": -203.3881378173828,
102
+ "loss": 0.3395,
103
+ "rewards/accuracies": 0.6000000238418579,
104
+ "rewards/chosen": -0.06494523584842682,
105
+ "rewards/margins": 0.14356324076652527,
106
+ "rewards/rejected": -0.20850849151611328,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.08,
111
+ "learning_rate": 3.9325842696629214e-07,
112
+ "logits/chosen": -2.5645086765289307,
113
+ "logits/rejected": -2.5475826263427734,
114
+ "logps/chosen": -325.9661865234375,
115
+ "logps/rejected": -226.6337890625,
116
+ "loss": 0.289,
117
+ "rewards/accuracies": 0.6812499761581421,
118
+ "rewards/chosen": -0.1385149508714676,
119
+ "rewards/margins": 0.243437722325325,
120
+ "rewards/rejected": -0.3819526731967926,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.09,
125
+ "learning_rate": 4.4943820224719097e-07,
126
+ "logits/chosen": -2.5285604000091553,
127
+ "logits/rejected": -2.5033059120178223,
128
+ "logps/chosen": -408.848388671875,
129
+ "logps/rejected": -271.00811767578125,
130
+ "loss": 0.2518,
131
+ "rewards/accuracies": 0.7250000238418579,
132
+ "rewards/chosen": -0.13917537033557892,
133
+ "rewards/margins": 0.5459555387496948,
134
+ "rewards/rejected": -0.6851309537887573,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.1,
139
+ "learning_rate": 4.999980431020109e-07,
140
+ "logits/chosen": -2.5432324409484863,
141
+ "logits/rejected": -2.517310380935669,
142
+ "logps/chosen": -390.67340087890625,
143
+ "logps/rejected": -274.8931579589844,
144
+ "loss": 0.1964,
145
+ "rewards/accuracies": 0.6812499761581421,
146
+ "rewards/chosen": -0.34996914863586426,
147
+ "rewards/margins": 0.5996032953262329,
148
+ "rewards/rejected": -0.9495723843574524,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.11,
153
+ "learning_rate": 4.997632524101301e-07,
154
+ "logits/chosen": -2.5553176403045654,
155
+ "logits/rejected": -2.534501552581787,
156
+ "logps/chosen": -390.3453063964844,
157
+ "logps/rejected": -304.5262451171875,
158
+ "loss": 0.1877,
159
+ "rewards/accuracies": 0.6812499761581421,
160
+ "rewards/chosen": -0.45027050375938416,
161
+ "rewards/margins": 0.5258683562278748,
162
+ "rewards/rejected": -0.9761388897895813,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.12,
167
+ "learning_rate": 4.991375032514749e-07,
168
+ "logits/chosen": -2.485586643218994,
169
+ "logits/rejected": -2.4563088417053223,
170
+ "logps/chosen": -409.3569030761719,
171
+ "logps/rejected": -326.25390625,
172
+ "loss": 0.1404,
173
+ "rewards/accuracies": 0.7124999761581421,
174
+ "rewards/chosen": -0.779589056968689,
175
+ "rewards/margins": 0.6373748779296875,
176
+ "rewards/rejected": -1.4169639348983765,
177
  "step": 110
178
  },
179
  {
180
+ "epoch": 0.14,
181
+ "learning_rate": 4.98121775121344e-07,
182
+ "logits/chosen": -2.5598855018615723,
183
+ "logits/rejected": -2.524275541305542,
184
+ "logps/chosen": -468.26318359375,
185
+ "logps/rejected": -383.34808349609375,
186
+ "loss": 0.1197,
187
+ "rewards/accuracies": 0.6937500238418579,
188
+ "rewards/chosen": -0.8500933647155762,
189
+ "rewards/margins": 0.8599642515182495,
190
+ "rewards/rejected": -1.7100576162338257,
191
  "step": 120
192
  },
193
  {
194
+ "epoch": 0.15,
195
+ "learning_rate": 4.96717657955441e-07,
196
+ "logits/chosen": -2.5483665466308594,
197
+ "logits/rejected": -2.4886937141418457,
198
+ "logps/chosen": -449.65167236328125,
199
+ "logps/rejected": -355.64581298828125,
200
+ "loss": 0.1298,
201
+ "rewards/accuracies": 0.7562500238418579,
202
+ "rewards/chosen": -0.7365049719810486,
203
+ "rewards/margins": 0.864643394947052,
204
+ "rewards/rejected": -1.6011483669281006,
205
  "step": 130
206
  },
207
  {
208
+ "epoch": 0.16,
209
+ "learning_rate": 4.949273496411216e-07,
210
+ "logits/chosen": -2.4851553440093994,
211
+ "logits/rejected": -2.45190167427063,
212
+ "logps/chosen": -422.0953063964844,
213
+ "logps/rejected": -381.73773193359375,
214
+ "loss": 0.123,
215
+ "rewards/accuracies": 0.675000011920929,
216
+ "rewards/chosen": -0.8808937072753906,
217
+ "rewards/margins": 0.8638134002685547,
218
+ "rewards/rejected": -1.7447071075439453,
219
  "step": 140
220
  },
221
  {
222
+ "epoch": 0.17,
223
+ "learning_rate": 4.927536525770046e-07,
224
+ "logits/chosen": -2.4234509468078613,
225
+ "logits/rejected": -2.379763126373291,
226
+ "logps/chosen": -488.06671142578125,
227
+ "logps/rejected": -402.50836181640625,
228
+ "loss": 0.0874,
229
+ "rewards/accuracies": 0.7437499761581421,
230
+ "rewards/chosen": -1.1678707599639893,
231
+ "rewards/margins": 0.8644298315048218,
232
+ "rewards/rejected": -2.0323004722595215,
233
  "step": 150
234
  },
235
  {
236
+ "epoch": 0.18,
237
+ "learning_rate": 4.901999692863326e-07,
238
+ "logits/chosen": -2.438655138015747,
239
+ "logits/rejected": -2.369347095489502,
240
+ "logps/chosen": -527.1495361328125,
241
+ "logps/rejected": -413.07891845703125,
242
+ "loss": 0.0927,
243
+ "rewards/accuracies": 0.78125,
244
+ "rewards/chosen": -0.8715424537658691,
245
+ "rewards/margins": 1.1340700387954712,
246
+ "rewards/rejected": -2.0056121349334717,
247
  "step": 160
248
  },
249
  {
250
+ "epoch": 0.19,
251
+ "learning_rate": 4.872702970909464e-07,
252
+ "logits/chosen": -2.32255220413208,
253
+ "logits/rejected": -2.2418487071990967,
254
+ "logps/chosen": -489.12957763671875,
255
+ "logps/rejected": -404.12091064453125,
256
+ "loss": 0.0753,
257
+ "rewards/accuracies": 0.7437499761581421,
258
+ "rewards/chosen": -1.1407396793365479,
259
+ "rewards/margins": 1.0352163314819336,
260
+ "rewards/rejected": -2.1759560108184814,
261
  "step": 170
262
  },
263
  {
264
+ "epoch": 0.2,
265
+ "learning_rate": 4.839692218542131e-07,
266
+ "logits/chosen": -2.1711273193359375,
267
+ "logits/rejected": -2.1464552879333496,
268
+ "logps/chosen": -474.91253662109375,
269
+ "logps/rejected": -437.1839294433594,
270
+ "loss": 0.0542,
271
+ "rewards/accuracies": 0.706250011920929,
272
+ "rewards/chosen": -1.8761800527572632,
273
+ "rewards/margins": 0.8142536282539368,
274
+ "rewards/rejected": -2.690433979034424,
275
  "step": 180
276
  },
277
  {
278
+ "epoch": 0.22,
279
+ "learning_rate": 4.803019108026997e-07,
280
+ "logits/chosen": -2.144486904144287,
281
+ "logits/rejected": -2.093785285949707,
282
+ "logps/chosen": -481.8885192871094,
283
+ "logps/rejected": -427.78875732421875,
284
+ "loss": 0.0661,
285
+ "rewards/accuracies": 0.675000011920929,
286
+ "rewards/chosen": -1.5085182189941406,
287
+ "rewards/margins": 0.933529257774353,
288
+ "rewards/rejected": -2.442047595977783,
289
  "step": 190
290
  },
291
  {
292
+ "epoch": 0.23,
293
+ "learning_rate": 4.7627410443782887e-07,
294
+ "logits/chosen": -2.084139347076416,
295
+ "logits/rejected": -2.0504238605499268,
296
+ "logps/chosen": -442.94427490234375,
297
+ "logps/rejected": -425.2293395996094,
298
+ "loss": 0.0772,
299
+ "rewards/accuracies": 0.737500011920929,
300
+ "rewards/chosen": -1.3089433908462524,
301
+ "rewards/margins": 1.0439088344573975,
302
+ "rewards/rejected": -2.3528523445129395,
303
  "step": 200
304
  },
305
  {
306
+ "epoch": 0.24,
307
+ "learning_rate": 4.7189210755018034e-07,
308
+ "logits/chosen": -2.0335006713867188,
309
+ "logits/rejected": -1.9534251689910889,
310
+ "logps/chosen": -519.8855590820312,
311
+ "logps/rejected": -478.9068298339844,
312
+ "loss": 0.0603,
313
+ "rewards/accuracies": 0.78125,
314
+ "rewards/chosen": -1.4866015911102295,
315
+ "rewards/margins": 1.284881353378296,
316
+ "rewards/rejected": -2.7714829444885254,
317
  "step": 210
318
  },
319
  {
320
+ "epoch": 0.25,
321
+ "learning_rate": 4.671627793504988e-07,
322
+ "logits/chosen": -2.10530424118042,
323
+ "logits/rejected": -2.0339417457580566,
324
+ "logps/chosen": -527.8480834960938,
325
+ "logps/rejected": -493.0773010253906,
326
+ "loss": 0.0601,
327
+ "rewards/accuracies": 0.768750011920929,
328
+ "rewards/chosen": -1.485095500946045,
329
+ "rewards/margins": 1.381305456161499,
330
+ "rewards/rejected": -2.866400957107544,
331
  "step": 220
332
  },
333
  {
334
+ "epoch": 0.26,
335
+ "learning_rate": 4.6209352273286095e-07,
336
+ "logits/chosen": -2.0227694511413574,
337
+ "logits/rejected": -1.9594017267227173,
338
+ "logps/chosen": -497.4979553222656,
339
+ "logps/rejected": -502.72174072265625,
340
+ "loss": 0.0694,
341
+ "rewards/accuracies": 0.7437499761581421,
342
+ "rewards/chosen": -1.6086342334747314,
343
+ "rewards/margins": 1.0697346925735474,
344
+ "rewards/rejected": -2.6783690452575684,
345
  "step": 230
346
  },
347
  {
348
+ "epoch": 0.27,
349
+ "learning_rate": 4.56692272686805e-07,
350
+ "logits/chosen": -2.0572688579559326,
351
+ "logits/rejected": -2.007657289505005,
352
+ "logps/chosen": -469.2767028808594,
353
+ "logps/rejected": -439.0755920410156,
354
+ "loss": 0.0704,
355
+ "rewards/accuracies": 0.731249988079071,
356
+ "rewards/chosen": -1.490736961364746,
357
+ "rewards/margins": 1.089231252670288,
358
+ "rewards/rejected": -2.579967975616455,
359
  "step": 240
360
  },
361
  {
362
+ "epoch": 0.28,
363
+ "learning_rate": 4.5096748387656326e-07,
364
+ "logits/chosen": -1.9701417684555054,
365
+ "logits/rejected": -1.8864467144012451,
366
+ "logps/chosen": -502.487060546875,
367
+ "logps/rejected": -456.74957275390625,
368
+ "loss": 0.0586,
369
+ "rewards/accuracies": 0.6937500238418579,
370
+ "rewards/chosen": -1.8166824579238892,
371
+ "rewards/margins": 0.9140488505363464,
372
+ "rewards/rejected": -2.730731248855591,
373
  "step": 250
374
  },
375
  {
376
+ "epoch": 0.29,
377
+ "learning_rate": 4.4492811740683877e-07,
378
+ "logits/chosen": -1.9581239223480225,
379
+ "logits/rejected": -1.8596645593643188,
380
+ "logps/chosen": -496.4566345214844,
381
+ "logps/rejected": -464.6380310058594,
382
+ "loss": 0.0574,
383
+ "rewards/accuracies": 0.699999988079071,
384
+ "rewards/chosen": -1.9547615051269531,
385
+ "rewards/margins": 0.8960165977478027,
386
+ "rewards/rejected": -2.850778102874756,
387
  "step": 260
388
  },
389
  {
390
+ "epoch": 0.31,
391
+ "learning_rate": 4.3858362679584354e-07,
392
+ "logits/chosen": -2.0671660900115967,
393
+ "logits/rejected": -1.9386441707611084,
394
+ "logps/chosen": -496.42669677734375,
395
+ "logps/rejected": -445.92218017578125,
396
+ "loss": 0.0564,
397
+ "rewards/accuracies": 0.824999988079071,
398
+ "rewards/chosen": -1.5112464427947998,
399
+ "rewards/margins": 1.217174768447876,
400
+ "rewards/rejected": -2.728421688079834,
401
  "step": 270
402
  },
403
  {
404
+ "epoch": 0.32,
405
+ "learning_rate": 4.3194394317755245e-07,
406
+ "logits/chosen": -2.0471348762512207,
407
+ "logits/rejected": -1.9359214305877686,
408
+ "logps/chosen": -517.4642333984375,
409
+ "logps/rejected": -449.72607421875,
410
+ "loss": 0.059,
411
+ "rewards/accuracies": 0.6812499761581421,
412
+ "rewards/chosen": -1.718505859375,
413
+ "rewards/margins": 1.0684489011764526,
414
+ "rewards/rejected": -2.786954641342163,
415
  "step": 280
416
  },
417
  {
418
+ "epoch": 0.33,
419
+ "learning_rate": 4.2501945975633914e-07,
420
+ "logits/chosen": -2.1385433673858643,
421
+ "logits/rejected": -2.0321764945983887,
422
+ "logps/chosen": -524.6473388671875,
423
+ "logps/rejected": -449.234375,
424
+ "loss": 0.0705,
425
+ "rewards/accuracies": 0.78125,
426
+ "rewards/chosen": -1.6018638610839844,
427
+ "rewards/margins": 1.1273199319839478,
428
+ "rewards/rejected": -2.7291836738586426,
429
  "step": 290
430
  },
431
  {
432
+ "epoch": 0.34,
433
+ "learning_rate": 4.1782101553832405e-07,
434
+ "logits/chosen": -2.0962741374969482,
435
+ "logits/rejected": -2.0118086338043213,
436
+ "logps/chosen": -513.7835693359375,
437
+ "logps/rejected": -460.288330078125,
438
+ "loss": 0.0517,
439
+ "rewards/accuracies": 0.731249988079071,
440
+ "rewards/chosen": -1.9815715551376343,
441
+ "rewards/margins": 0.8931058645248413,
442
+ "rewards/rejected": -2.8746774196624756,
443
  "step": 300
444
  },
445
+ {
446
+ "epoch": 0.35,
447
+ "learning_rate": 4.103598783649029e-07,
448
+ "logits/chosen": -2.0219855308532715,
449
+ "logits/rejected": -1.9064286947250366,
450
+ "logps/chosen": -591.9143676757812,
451
+ "logps/rejected": -519.5982666015625,
452
+ "loss": 0.0463,
453
+ "rewards/accuracies": 0.6812499761581421,
454
+ "rewards/chosen": -2.2725110054016113,
455
+ "rewards/margins": 1.1850159168243408,
456
+ "rewards/rejected": -3.457526683807373,
457
+ "step": 310
458
+ },
459
+ {
460
+ "epoch": 0.36,
461
+ "learning_rate": 4.026477272750119e-07,
462
+ "logits/chosen": -1.9796558618545532,
463
+ "logits/rejected": -1.875626802444458,
464
+ "logps/chosen": -566.9561157226562,
465
+ "logps/rejected": -512.3988037109375,
466
+ "loss": 0.0417,
467
+ "rewards/accuracies": 0.699999988079071,
468
+ "rewards/chosen": -2.3572134971618652,
469
+ "rewards/margins": 0.9829473495483398,
470
+ "rewards/rejected": -3.340160846710205,
471
+ "step": 320
472
+ },
473
+ {
474
+ "epoch": 0.37,
475
+ "learning_rate": 3.9469663422373864e-07,
476
+ "logits/chosen": -2.0174968242645264,
477
+ "logits/rejected": -1.9458332061767578,
478
+ "logps/chosen": -552.6922607421875,
479
+ "logps/rejected": -521.9205322265625,
480
+ "loss": 0.0479,
481
+ "rewards/accuracies": 0.800000011920929,
482
+ "rewards/chosen": -2.164801836013794,
483
+ "rewards/margins": 1.167511224746704,
484
+ "rewards/rejected": -3.332312822341919,
485
+ "step": 330
486
+ },
487
+ {
488
+ "epoch": 0.38,
489
+ "learning_rate": 3.865190451858954e-07,
490
+ "logits/chosen": -2.0744147300720215,
491
+ "logits/rejected": -1.9883348941802979,
492
+ "logps/chosen": -588.4783935546875,
493
+ "logps/rejected": -547.9783325195312,
494
+ "loss": 0.0549,
495
+ "rewards/accuracies": 0.75,
496
+ "rewards/chosen": -2.0789949893951416,
497
+ "rewards/margins": 1.3585710525512695,
498
+ "rewards/rejected": -3.437565565109253,
499
+ "step": 340
500
+ },
501
+ {
502
+ "epoch": 0.4,
503
+ "learning_rate": 3.781277606741327e-07,
504
+ "logits/chosen": -2.0338993072509766,
505
+ "logits/rejected": -1.9896167516708374,
506
+ "logps/chosen": -492.18743896484375,
507
+ "logps/rejected": -475.5653381347656,
508
+ "loss": 0.0634,
509
+ "rewards/accuracies": 0.762499988079071,
510
+ "rewards/chosen": -1.845071792602539,
511
+ "rewards/margins": 1.0096709728240967,
512
+ "rewards/rejected": -2.854743003845215,
513
+ "step": 350
514
+ },
515
+ {
516
+ "epoch": 0.41,
517
+ "learning_rate": 3.6953591570208996e-07,
518
+ "logits/chosen": -2.158510684967041,
519
+ "logits/rejected": -2.0470757484436035,
520
+ "logps/chosen": -531.6489868164062,
521
+ "logps/rejected": -506.73388671875,
522
+ "loss": 0.0646,
523
+ "rewards/accuracies": 0.78125,
524
+ "rewards/chosen": -1.6158654689788818,
525
+ "rewards/margins": 1.420132040977478,
526
+ "rewards/rejected": -3.0359978675842285,
527
+ "step": 360
528
+ },
529
+ {
530
+ "epoch": 0.42,
531
+ "learning_rate": 3.607569592239452e-07,
532
+ "logits/chosen": -2.0580546855926514,
533
+ "logits/rejected": -1.9891626834869385,
534
+ "logps/chosen": -569.1202392578125,
535
+ "logps/rejected": -509.3130798339844,
536
+ "loss": 0.0481,
537
+ "rewards/accuracies": 0.768750011920929,
538
+ "rewards/chosen": -1.8223598003387451,
539
+ "rewards/margins": 1.3710863590240479,
540
+ "rewards/rejected": -3.193446636199951,
541
+ "step": 370
542
+ },
543
+ {
544
+ "epoch": 0.43,
545
+ "learning_rate": 3.518046330825494e-07,
546
+ "logits/chosen": -2.0791687965393066,
547
+ "logits/rejected": -1.981591820716858,
548
+ "logps/chosen": -581.3887329101562,
549
+ "logps/rejected": -514.6013793945312,
550
+ "loss": 0.0523,
551
+ "rewards/accuracies": 0.768750011920929,
552
+ "rewards/chosen": -2.0335278511047363,
553
+ "rewards/margins": 1.2213140726089478,
554
+ "rewards/rejected": -3.2548420429229736,
555
+ "step": 380
556
+ },
557
+ {
558
+ "epoch": 0.44,
559
+ "learning_rate": 3.4269295049909713e-07,
560
+ "logits/chosen": -2.034996747970581,
561
+ "logits/rejected": -1.9570029973983765,
562
+ "logps/chosen": -493.1822204589844,
563
+ "logps/rejected": -479.6783752441406,
564
+ "loss": 0.06,
565
+ "rewards/accuracies": 0.7875000238418579,
566
+ "rewards/chosen": -1.9078280925750732,
567
+ "rewards/margins": 1.182108759880066,
568
+ "rewards/rejected": -3.0899367332458496,
569
+ "step": 390
570
+ },
571
+ {
572
+ "epoch": 0.45,
573
+ "learning_rate": 3.3343617413800453e-07,
574
+ "logits/chosen": -2.1511921882629395,
575
+ "logits/rejected": -2.0434978008270264,
576
+ "logps/chosen": -550.9173583984375,
577
+ "logps/rejected": -495.8582458496094,
578
+ "loss": 0.0516,
579
+ "rewards/accuracies": 0.78125,
580
+ "rewards/chosen": -1.865997552871704,
581
+ "rewards/margins": 1.4241960048675537,
582
+ "rewards/rejected": -3.2901930809020996,
583
+ "step": 400
584
+ },
585
+ {
586
+ "epoch": 0.46,
587
+ "learning_rate": 3.2404879378132893e-07,
588
+ "logits/chosen": -2.0529592037200928,
589
+ "logits/rejected": -1.988389253616333,
590
+ "logps/chosen": -507.41534423828125,
591
+ "logps/rejected": -502.86517333984375,
592
+ "loss": 0.0495,
593
+ "rewards/accuracies": 0.737500011920929,
594
+ "rewards/chosen": -2.0051426887512207,
595
+ "rewards/margins": 1.3514044284820557,
596
+ "rewards/rejected": -3.3565471172332764,
597
+ "step": 410
598
+ },
599
+ {
600
+ "epoch": 0.48,
601
+ "learning_rate": 3.1454550364767894e-07,
602
+ "logits/chosen": -2.0849246978759766,
603
+ "logits/rejected": -1.9924323558807373,
604
+ "logps/chosen": -562.3327026367188,
605
+ "logps/rejected": -560.0533447265625,
606
+ "loss": 0.0558,
607
+ "rewards/accuracies": 0.706250011920929,
608
+ "rewards/chosen": -2.299450397491455,
609
+ "rewards/margins": 1.2988755702972412,
610
+ "rewards/rejected": -3.598325729370117,
611
+ "step": 420
612
+ },
613
+ {
614
+ "epoch": 0.49,
615
+ "learning_rate": 3.049411793911154e-07,
616
+ "logits/chosen": -2.082733631134033,
617
+ "logits/rejected": -2.014369487762451,
618
+ "logps/chosen": -515.5167236328125,
619
+ "logps/rejected": -504.70538330078125,
620
+ "loss": 0.0609,
621
+ "rewards/accuracies": 0.78125,
622
+ "rewards/chosen": -1.9115734100341797,
623
+ "rewards/margins": 1.2012805938720703,
624
+ "rewards/rejected": -3.112853765487671,
625
+ "step": 430
626
+ },
627
+ {
628
+ "epoch": 0.5,
629
+ "learning_rate": 2.9525085481604914e-07,
630
+ "logits/chosen": -2.0253331661224365,
631
+ "logits/rejected": -1.9336645603179932,
632
+ "logps/chosen": -499.57501220703125,
633
+ "logps/rejected": -480.38873291015625,
634
+ "loss": 0.0659,
635
+ "rewards/accuracies": 0.762499988079071,
636
+ "rewards/chosen": -1.7531890869140625,
637
+ "rewards/margins": 1.1956068277359009,
638
+ "rewards/rejected": -2.948795795440674,
639
+ "step": 440
640
+ },
641
+ {
642
+ "epoch": 0.51,
643
+ "learning_rate": 2.854896983445833e-07,
644
+ "logits/chosen": -1.9761031866073608,
645
+ "logits/rejected": -1.9000879526138306,
646
+ "logps/chosen": -585.724853515625,
647
+ "logps/rejected": -522.663818359375,
648
+ "loss": 0.0585,
649
+ "rewards/accuracies": 0.7124999761581421,
650
+ "rewards/chosen": -2.1158130168914795,
651
+ "rewards/margins": 1.2392785549163818,
652
+ "rewards/rejected": -3.3550915718078613,
653
+ "step": 450
654
+ },
655
+ {
656
+ "epoch": 0.52,
657
+ "learning_rate": 2.7567298927313654e-07,
658
+ "logits/chosen": -2.024075984954834,
659
+ "logits/rejected": -1.994297742843628,
660
+ "logps/chosen": -490.19671630859375,
661
+ "logps/rejected": -488.1590881347656,
662
+ "loss": 0.0698,
663
+ "rewards/accuracies": 0.737500011920929,
664
+ "rewards/chosen": -1.692265510559082,
665
+ "rewards/margins": 1.1264708042144775,
666
+ "rewards/rejected": -2.8187363147735596,
667
+ "step": 460
668
+ },
669
+ {
670
+ "epoch": 0.53,
671
+ "learning_rate": 2.658160938555123e-07,
672
+ "logits/chosen": -2.0123002529144287,
673
+ "logits/rejected": -1.9370654821395874,
674
+ "logps/chosen": -550.9243774414062,
675
+ "logps/rejected": -521.9213256835938,
676
+ "loss": 0.0535,
677
+ "rewards/accuracies": 0.78125,
678
+ "rewards/chosen": -1.8550266027450562,
679
+ "rewards/margins": 1.2366045713424683,
680
+ "rewards/rejected": -3.0916314125061035,
681
+ "step": 470
682
+ },
683
+ {
684
+ "epoch": 0.54,
685
+ "learning_rate": 2.559344412498532e-07,
686
+ "logits/chosen": -1.8818950653076172,
687
+ "logits/rejected": -1.8040504455566406,
688
+ "logps/chosen": -594.8109130859375,
689
+ "logps/rejected": -539.5858154296875,
690
+ "loss": 0.0436,
691
+ "rewards/accuracies": 0.668749988079071,
692
+ "rewards/chosen": -2.505542755126953,
693
+ "rewards/margins": 0.9630386233329773,
694
+ "rewards/rejected": -3.468581438064575,
695
+ "step": 480
696
+ },
697
+ {
698
+ "epoch": 0.55,
699
+ "learning_rate": 2.460434993671294e-07,
700
+ "logits/chosen": -1.8864244222640991,
701
+ "logits/rejected": -1.827561378479004,
702
+ "logps/chosen": -592.8574829101562,
703
+ "logps/rejected": -558.6998291015625,
704
+ "loss": 0.0362,
705
+ "rewards/accuracies": 0.731249988079071,
706
+ "rewards/chosen": -2.819347858428955,
707
+ "rewards/margins": 1.06614089012146,
708
+ "rewards/rejected": -3.885488510131836,
709
+ "step": 490
710
+ },
711
+ {
712
+ "epoch": 0.57,
713
+ "learning_rate": 2.361587506589672e-07,
714
+ "logits/chosen": -1.9717912673950195,
715
+ "logits/rejected": -1.8653414249420166,
716
+ "logps/chosen": -641.2883911132812,
717
+ "logps/rejected": -575.3276977539062,
718
+ "loss": 0.0414,
719
+ "rewards/accuracies": 0.768750011920929,
720
+ "rewards/chosen": -2.6623339653015137,
721
+ "rewards/margins": 1.177682638168335,
722
+ "rewards/rejected": -3.8400166034698486,
723
+ "step": 500
724
+ },
725
+ {
726
+ "epoch": 0.58,
727
+ "learning_rate": 2.2629566788271613e-07,
728
+ "logits/chosen": -1.9816091060638428,
729
+ "logits/rejected": -1.8679695129394531,
730
+ "logps/chosen": -567.7015991210938,
731
+ "logps/rejected": -527.6968383789062,
732
+ "loss": 0.0435,
733
+ "rewards/accuracies": 0.6875,
734
+ "rewards/chosen": -2.42421555519104,
735
+ "rewards/margins": 1.159529447555542,
736
+ "rewards/rejected": -3.583745241165161,
737
+ "step": 510
738
+ },
739
+ {
740
+ "epoch": 0.59,
741
+ "learning_rate": 2.1646968988169135e-07,
742
+ "logits/chosen": -2.013672113418579,
743
+ "logits/rejected": -1.9170252084732056,
744
+ "logps/chosen": -607.8392333984375,
745
+ "logps/rejected": -590.5955810546875,
746
+ "loss": 0.039,
747
+ "rewards/accuracies": 0.793749988079071,
748
+ "rewards/chosen": -2.4571421146392822,
749
+ "rewards/margins": 1.2694391012191772,
750
+ "rewards/rejected": -3.72658109664917,
751
+ "step": 520
752
+ },
753
+ {
754
+ "epoch": 0.6,
755
+ "learning_rate": 2.0669619741850232e-07,
756
+ "logits/chosen": -2.058300733566284,
757
+ "logits/rejected": -1.970088005065918,
758
+ "logps/chosen": -582.6038818359375,
759
+ "logps/rejected": -512.1895141601562,
760
+ "loss": 0.044,
761
+ "rewards/accuracies": 0.6875,
762
+ "rewards/chosen": -2.335970401763916,
763
+ "rewards/margins": 1.0607550144195557,
764
+ "rewards/rejected": -3.3967254161834717,
765
+ "step": 530
766
+ },
767
+ {
768
+ "epoch": 0.61,
769
+ "learning_rate": 1.9699048909929518e-07,
770
+ "logits/chosen": -2.0398476123809814,
771
+ "logits/rejected": -1.943453073501587,
772
+ "logps/chosen": -545.3948974609375,
773
+ "logps/rejected": -499.24444580078125,
774
+ "loss": 0.0497,
775
+ "rewards/accuracies": 0.7437499761581421,
776
+ "rewards/chosen": -2.121716022491455,
777
+ "rewards/margins": 0.9951685070991516,
778
+ "rewards/rejected": -3.116884231567383,
779
+ "step": 540
780
+ },
781
+ {
782
+ "epoch": 0.62,
783
+ "learning_rate": 1.8736775742659732e-07,
784
+ "logits/chosen": -2.01509952545166,
785
+ "logits/rejected": -1.9400924444198608,
786
+ "logps/chosen": -526.3953247070312,
787
+ "logps/rejected": -530.0167236328125,
788
+ "loss": 0.0524,
789
+ "rewards/accuracies": 0.793749988079071,
790
+ "rewards/chosen": -2.0396695137023926,
791
+ "rewards/margins": 1.3093147277832031,
792
+ "rewards/rejected": -3.3489837646484375,
793
+ "step": 550
794
+ },
795
+ {
796
+ "epoch": 0.63,
797
+ "learning_rate": 1.7784306501824616e-07,
798
+ "logits/chosen": -2.055781126022339,
799
+ "logits/rejected": -1.9646365642547607,
800
+ "logps/chosen": -597.9435424804688,
801
+ "logps/rejected": -532.0874633789062,
802
+ "loss": 0.0454,
803
+ "rewards/accuracies": 0.71875,
804
+ "rewards/chosen": -2.338007688522339,
805
+ "rewards/margins": 1.085502028465271,
806
+ "rewards/rejected": -3.4235095977783203,
807
+ "step": 560
808
+ },
809
+ {
810
+ "epoch": 0.65,
811
+ "learning_rate": 1.6843132102963025e-07,
812
+ "logits/chosen": -1.9799835681915283,
813
+ "logits/rejected": -1.9040464162826538,
814
+ "logps/chosen": -601.7384643554688,
815
+ "logps/rejected": -558.1231689453125,
816
+ "loss": 0.0397,
817
+ "rewards/accuracies": 0.768750011920929,
818
+ "rewards/chosen": -2.291151523590088,
819
+ "rewards/margins": 1.366019368171692,
820
+ "rewards/rejected": -3.657170534133911,
821
+ "step": 570
822
+ },
823
+ {
824
+ "epoch": 0.66,
825
+ "learning_rate": 1.591472578161458e-07,
826
+ "logits/chosen": -2.005979299545288,
827
+ "logits/rejected": -1.9207054376602173,
828
+ "logps/chosen": -568.8519287109375,
829
+ "logps/rejected": -540.73876953125,
830
+ "loss": 0.0424,
831
+ "rewards/accuracies": 0.793749988079071,
832
+ "rewards/chosen": -2.294837474822998,
833
+ "rewards/margins": 1.2771284580230713,
834
+ "rewards/rejected": -3.5719656944274902,
835
+ "step": 580
836
+ },
837
+ {
838
+ "epoch": 0.67,
839
+ "learning_rate": 1.5000540787240274e-07,
840
+ "logits/chosen": -2.0050690174102783,
841
+ "logits/rejected": -1.9278829097747803,
842
+ "logps/chosen": -566.6500244140625,
843
+ "logps/rejected": -545.9078369140625,
844
+ "loss": 0.045,
845
+ "rewards/accuracies": 0.699999988079071,
846
+ "rewards/chosen": -2.4012954235076904,
847
+ "rewards/margins": 1.2149646282196045,
848
+ "rewards/rejected": -3.616260051727295,
849
+ "step": 590
850
+ },
851
+ {
852
+ "epoch": 0.68,
853
+ "learning_rate": 1.410200810842749e-07,
854
+ "logits/chosen": -2.0124361515045166,
855
+ "logits/rejected": -1.9107856750488281,
856
+ "logps/chosen": -535.6881713867188,
857
+ "logps/rejected": -506.49676513671875,
858
+ "loss": 0.0511,
859
+ "rewards/accuracies": 0.737500011920929,
860
+ "rewards/chosen": -2.0316760540008545,
861
+ "rewards/margins": 1.1934245824813843,
862
+ "rewards/rejected": -3.2251009941101074,
863
+ "step": 600
864
+ },
865
+ {
866
+ "epoch": 0.69,
867
+ "learning_rate": 1.322053423294041e-07,
868
+ "logits/chosen": -2.0051183700561523,
869
+ "logits/rejected": -1.9159631729125977,
870
+ "logps/chosen": -529.7169799804688,
871
+ "logps/rejected": -532.310546875,
872
+ "loss": 0.0503,
873
+ "rewards/accuracies": 0.8187500238418579,
874
+ "rewards/chosen": -2.024324655532837,
875
+ "rewards/margins": 1.4423547983169556,
876
+ "rewards/rejected": -3.466679334640503,
877
+ "step": 610
878
+ },
879
+ {
880
+ "epoch": 0.7,
881
+ "learning_rate": 1.2357498946121905e-07,
882
+ "logits/chosen": -2.0541529655456543,
883
+ "logits/rejected": -1.964544653892517,
884
+ "logps/chosen": -566.3477783203125,
885
+ "logps/rejected": -529.6947021484375,
886
+ "loss": 0.0501,
887
+ "rewards/accuracies": 0.793749988079071,
888
+ "rewards/chosen": -2.169382095336914,
889
+ "rewards/margins": 1.2838075160980225,
890
+ "rewards/rejected": -3.4531898498535156,
891
+ "step": 620
892
+ },
893
+ {
894
+ "epoch": 0.71,
895
+ "learning_rate": 1.1514253171093161e-07,
896
+ "logits/chosen": -2.0569446086883545,
897
+ "logits/rejected": -1.9314994812011719,
898
+ "logps/chosen": -529.5950927734375,
899
+ "logps/rejected": -520.3991088867188,
900
+ "loss": 0.0504,
901
+ "rewards/accuracies": 0.768750011920929,
902
+ "rewards/chosen": -1.9739675521850586,
903
+ "rewards/margins": 1.362341284751892,
904
+ "rewards/rejected": -3.3363089561462402,
905
+ "step": 630
906
+ },
907
+ {
908
+ "epoch": 0.72,
909
+ "learning_rate": 1.0692116854131883e-07,
910
+ "logits/chosen": -2.0125861167907715,
911
+ "logits/rejected": -1.9420621395111084,
912
+ "logps/chosen": -524.7649536132812,
913
+ "logps/rejected": -528.4769287109375,
914
+ "loss": 0.0505,
915
+ "rewards/accuracies": 0.7749999761581421,
916
+ "rewards/chosen": -2.092562675476074,
917
+ "rewards/margins": 1.1686738729476929,
918
+ "rewards/rejected": -3.2612366676330566,
919
+ "step": 640
920
+ },
921
+ {
922
+ "epoch": 0.74,
923
+ "learning_rate": 9.89237689853889e-08,
924
+ "logits/chosen": -2.0095937252044678,
925
+ "logits/rejected": -1.942928671836853,
926
+ "logps/chosen": -512.7869873046875,
927
+ "logps/rejected": -487.248046875,
928
+ "loss": 0.0502,
929
+ "rewards/accuracies": 0.78125,
930
+ "rewards/chosen": -1.9598802328109741,
931
+ "rewards/margins": 1.2647455930709839,
932
+ "rewards/rejected": -3.2246253490448,
933
+ "step": 650
934
+ },
935
+ {
936
+ "epoch": 0.75,
937
+ "learning_rate": 9.11628515022765e-08,
938
+ "logits/chosen": -2.037865400314331,
939
+ "logits/rejected": -1.9250373840332031,
940
+ "logps/chosen": -533.2716674804688,
941
+ "logps/rejected": -528.3024291992188,
942
+ "loss": 0.0492,
943
+ "rewards/accuracies": 0.831250011920929,
944
+ "rewards/chosen": -1.9517757892608643,
945
+ "rewards/margins": 1.4488084316253662,
946
+ "rewards/rejected": -3.4005839824676514,
947
+ "step": 660
948
+ },
949
+ {
950
+ "epoch": 0.76,
951
+ "learning_rate": 8.365056438189486e-08,
952
+ "logits/chosen": -2.0413284301757812,
953
+ "logits/rejected": -1.9342155456542969,
954
+ "logps/chosen": -561.7138671875,
955
+ "logps/rejected": -549.7591552734375,
956
+ "loss": 0.0507,
957
+ "rewards/accuracies": 0.7562500238418579,
958
+ "rewards/chosen": -2.183668851852417,
959
+ "rewards/margins": 1.2829086780548096,
960
+ "rewards/rejected": -3.4665775299072266,
961
+ "step": 670
962
+ },
963
+ {
964
+ "epoch": 0.77,
965
+ "learning_rate": 7.639866672902101e-08,
966
+ "logits/chosen": -2.0364794731140137,
967
+ "logits/rejected": -1.9118995666503906,
968
+ "logps/chosen": -569.7603149414062,
969
+ "logps/rejected": -554.3267822265625,
970
+ "loss": 0.0559,
971
+ "rewards/accuracies": 0.8125,
972
+ "rewards/chosen": -2.052215099334717,
973
+ "rewards/margins": 1.4867579936981201,
974
+ "rewards/rejected": -3.538973569869995,
975
+ "step": 680
976
+ },
977
+ {
978
+ "epoch": 0.78,
979
+ "learning_rate": 6.941851005657851e-08,
980
+ "logits/chosen": -2.0689234733581543,
981
+ "logits/rejected": -1.9631000757217407,
982
+ "logps/chosen": -537.5634765625,
983
+ "logps/rejected": -517.525146484375,
984
+ "loss": 0.0508,
985
+ "rewards/accuracies": 0.8125,
986
+ "rewards/chosen": -2.1864445209503174,
987
+ "rewards/margins": 1.1380784511566162,
988
+ "rewards/rejected": -3.3245227336883545,
989
+ "step": 690
990
+ },
991
+ {
992
+ "epoch": 0.79,
993
+ "learning_rate": 6.272102051693051e-08,
994
+ "logits/chosen": -2.0574021339416504,
995
+ "logits/rejected": -1.998944640159607,
996
+ "logps/chosen": -598.1875,
997
+ "logps/rejected": -531.4752807617188,
998
+ "loss": 0.0525,
999
+ "rewards/accuracies": 0.75,
1000
+ "rewards/chosen": -2.1735377311706543,
1001
+ "rewards/margins": 1.160771369934082,
1002
+ "rewards/rejected": -3.3343091011047363,
1003
+ "step": 700
1004
+ },
1005
+ {
1006
+ "epoch": 0.8,
1007
+ "learning_rate": 5.6316681798995844e-08,
1008
+ "logits/chosen": -2.01491379737854,
1009
+ "logits/rejected": -1.9562265872955322,
1010
+ "logps/chosen": -530.241455078125,
1011
+ "logps/rejected": -535.0760498046875,
1012
+ "loss": 0.0428,
1013
+ "rewards/accuracies": 0.762499988079071,
1014
+ "rewards/chosen": -2.1387624740600586,
1015
+ "rewards/margins": 1.4437265396118164,
1016
+ "rewards/rejected": -3.582489013671875,
1017
+ "step": 710
1018
+ },
1019
+ {
1020
+ "epoch": 0.81,
1021
+ "learning_rate": 5.0215518717961256e-08,
1022
+ "logits/chosen": -1.996011734008789,
1023
+ "logits/rejected": -1.9084312915802002,
1024
+ "logps/chosen": -559.4378662109375,
1025
+ "logps/rejected": -546.0494995117188,
1026
+ "loss": 0.0449,
1027
+ "rewards/accuracies": 0.78125,
1028
+ "rewards/chosen": -2.1538968086242676,
1029
+ "rewards/margins": 1.5510085821151733,
1030
+ "rewards/rejected": -3.7049052715301514,
1031
+ "step": 720
1032
+ },
1033
+ {
1034
+ "epoch": 0.83,
1035
+ "learning_rate": 4.4427081523275925e-08,
1036
+ "logits/chosen": -1.9798328876495361,
1037
+ "logits/rejected": -1.915225625038147,
1038
+ "logps/chosen": -537.6295166015625,
1039
+ "logps/rejected": -551.8641357421875,
1040
+ "loss": 0.0414,
1041
+ "rewards/accuracies": 0.800000011920929,
1042
+ "rewards/chosen": -2.211373805999756,
1043
+ "rewards/margins": 1.3579763174057007,
1044
+ "rewards/rejected": -3.569350004196167,
1045
+ "step": 730
1046
+ },
1047
+ {
1048
+ "epoch": 0.84,
1049
+ "learning_rate": 3.896043094949061e-08,
1050
+ "logits/chosen": -2.0534861087799072,
1051
+ "logits/rejected": -1.947218894958496,
1052
+ "logps/chosen": -572.7033081054688,
1053
+ "logps/rejected": -552.8514404296875,
1054
+ "loss": 0.0491,
1055
+ "rewards/accuracies": 0.7749999761581421,
1056
+ "rewards/chosen": -2.2802767753601074,
1057
+ "rewards/margins": 1.3190699815750122,
1058
+ "rewards/rejected": -3.599346876144409,
1059
+ "step": 740
1060
+ },
1061
+ {
1062
+ "epoch": 0.85,
1063
+ "learning_rate": 3.3824124033343557e-08,
1064
+ "logits/chosen": -2.0076358318328857,
1065
+ "logits/rejected": -1.946637511253357,
1066
+ "logps/chosen": -583.5534057617188,
1067
+ "logps/rejected": -575.9400024414062,
1068
+ "loss": 0.0441,
1069
+ "rewards/accuracies": 0.768750011920929,
1070
+ "rewards/chosen": -2.3145105838775635,
1071
+ "rewards/margins": 1.4341070652008057,
1072
+ "rewards/rejected": -3.748617649078369,
1073
+ "step": 750
1074
+ },
1075
+ {
1076
+ "epoch": 0.86,
1077
+ "learning_rate": 2.9026200719291904e-08,
1078
+ "logits/chosen": -2.0077717304229736,
1079
+ "logits/rejected": -1.9336059093475342,
1080
+ "logps/chosen": -549.6090698242188,
1081
+ "logps/rejected": -544.0575561523438,
1082
+ "loss": 0.0506,
1083
+ "rewards/accuracies": 0.7437499761581421,
1084
+ "rewards/chosen": -2.3540844917297363,
1085
+ "rewards/margins": 1.253685712814331,
1086
+ "rewards/rejected": -3.6077704429626465,
1087
+ "step": 760
1088
+ },
1089
+ {
1090
+ "epoch": 0.87,
1091
+ "learning_rate": 2.4574171274456433e-08,
1092
+ "logits/chosen": -2.040750503540039,
1093
+ "logits/rejected": -1.956707239151001,
1094
+ "logps/chosen": -548.0321655273438,
1095
+ "logps/rejected": -525.2894287109375,
1096
+ "loss": 0.043,
1097
+ "rewards/accuracies": 0.78125,
1098
+ "rewards/chosen": -2.166091203689575,
1099
+ "rewards/margins": 1.3392621278762817,
1100
+ "rewards/rejected": -3.5053532123565674,
1101
+ "step": 770
1102
+ },
1103
+ {
1104
+ "epoch": 0.88,
1105
+ "learning_rate": 2.047500453267881e-08,
1106
+ "logits/chosen": -2.015049457550049,
1107
+ "logits/rejected": -1.9243285655975342,
1108
+ "logps/chosen": -562.6207275390625,
1109
+ "logps/rejected": -560.2086181640625,
1110
+ "loss": 0.0461,
1111
+ "rewards/accuracies": 0.7875000238418579,
1112
+ "rewards/chosen": -2.27638578414917,
1113
+ "rewards/margins": 1.389906644821167,
1114
+ "rewards/rejected": -3.666292190551758,
1115
+ "step": 780
1116
+ },
1117
+ {
1118
+ "epoch": 0.89,
1119
+ "learning_rate": 1.673511698609292e-08,
1120
+ "logits/chosen": -1.9878380298614502,
1121
+ "logits/rejected": -1.9071290493011475,
1122
+ "logps/chosen": -591.1964721679688,
1123
+ "logps/rejected": -575.9303588867188,
1124
+ "loss": 0.0436,
1125
+ "rewards/accuracies": 0.7562500238418579,
1126
+ "rewards/chosen": -2.2814955711364746,
1127
+ "rewards/margins": 1.4738110303878784,
1128
+ "rewards/rejected": -3.7553062438964844,
1129
+ "step": 790
1130
+ },
1131
+ {
1132
+ "epoch": 0.91,
1133
+ "learning_rate": 1.3360362741285769e-08,
1134
+ "logits/chosen": -1.998380422592163,
1135
+ "logits/rejected": -1.910592794418335,
1136
+ "logps/chosen": -541.6677856445312,
1137
+ "logps/rejected": -533.7774047851562,
1138
+ "loss": 0.0425,
1139
+ "rewards/accuracies": 0.7250000238418579,
1140
+ "rewards/chosen": -2.2717394828796387,
1141
+ "rewards/margins": 1.2483633756637573,
1142
+ "rewards/rejected": -3.5201034545898438,
1143
+ "step": 800
1144
+ },
1145
+ {
1146
+ "epoch": 0.92,
1147
+ "learning_rate": 1.0356024355769433e-08,
1148
+ "logits/chosen": -2.0231940746307373,
1149
+ "logits/rejected": -1.9754924774169922,
1150
+ "logps/chosen": -576.0597534179688,
1151
+ "logps/rejected": -548.6602172851562,
1152
+ "loss": 0.0447,
1153
+ "rewards/accuracies": 0.75,
1154
+ "rewards/chosen": -2.2617714405059814,
1155
+ "rewards/margins": 1.3260424137115479,
1156
+ "rewards/rejected": -3.58781361579895,
1157
+ "step": 810
1158
+ },
1159
+ {
1160
+ "epoch": 0.93,
1161
+ "learning_rate": 7.726804569108597e-09,
1162
+ "logits/chosen": -2.033327102661133,
1163
+ "logits/rejected": -1.9357963800430298,
1164
+ "logps/chosen": -587.4612426757812,
1165
+ "logps/rejected": -580.3048095703125,
1166
+ "loss": 0.0476,
1167
+ "rewards/accuracies": 0.71875,
1168
+ "rewards/chosen": -2.3224897384643555,
1169
+ "rewards/margins": 1.3850138187408447,
1170
+ "rewards/rejected": -3.7075035572052,
1171
+ "step": 820
1172
+ },
1173
+ {
1174
+ "epoch": 0.94,
1175
+ "learning_rate": 5.476818941645561e-09,
1176
+ "logits/chosen": -2.0493969917297363,
1177
+ "logits/rejected": -1.9231374263763428,
1178
+ "logps/chosen": -621.8226318359375,
1179
+ "logps/rejected": -563.610595703125,
1180
+ "loss": 0.0437,
1181
+ "rewards/accuracies": 0.7562500238418579,
1182
+ "rewards/chosen": -2.3366944789886475,
1183
+ "rewards/margins": 1.3244359493255615,
1184
+ "rewards/rejected": -3.661130428314209,
1185
+ "step": 830
1186
+ },
1187
+ {
1188
+ "epoch": 0.95,
1189
+ "learning_rate": 3.609589412347347e-09,
1190
+ "logits/chosen": -1.9982448816299438,
1191
+ "logits/rejected": -1.920509696006775,
1192
+ "logps/chosen": -563.5140380859375,
1193
+ "logps/rejected": -574.1109619140625,
1194
+ "loss": 0.0384,
1195
+ "rewards/accuracies": 0.7875000238418579,
1196
+ "rewards/chosen": -2.203871488571167,
1197
+ "rewards/margins": 1.5941288471221924,
1198
+ "rewards/rejected": -3.7980003356933594,
1199
+ "step": 840
1200
+ },
1201
+ {
1202
+ "epoch": 0.96,
1203
+ "learning_rate": 2.1280387858572667e-09,
1204
+ "logits/chosen": -2.0206615924835205,
1205
+ "logits/rejected": -1.9191792011260986,
1206
+ "logps/chosen": -529.5126342773438,
1207
+ "logps/rejected": -513.9107666015625,
1208
+ "loss": 0.0425,
1209
+ "rewards/accuracies": 0.71875,
1210
+ "rewards/chosen": -2.1540675163269043,
1211
+ "rewards/margins": 1.2586270570755005,
1212
+ "rewards/rejected": -3.4126949310302734,
1213
+ "step": 850
1214
+ },
1215
+ {
1216
+ "epoch": 0.97,
1217
+ "learning_rate": 1.03448615738172e-09,
1218
+ "logits/chosen": -1.9819049835205078,
1219
+ "logits/rejected": -1.8794002532958984,
1220
+ "logps/chosen": -554.608642578125,
1221
+ "logps/rejected": -551.7212524414062,
1222
+ "loss": 0.0506,
1223
+ "rewards/accuracies": 0.793749988079071,
1224
+ "rewards/chosen": -2.191262722015381,
1225
+ "rewards/margins": 1.487153172492981,
1226
+ "rewards/rejected": -3.6784160137176514,
1227
+ "step": 860
1228
+ },
1229
+ {
1230
+ "epoch": 0.98,
1231
+ "learning_rate": 3.3064328257259575e-10,
1232
+ "logits/chosen": -2.011251211166382,
1233
+ "logits/rejected": -1.9388446807861328,
1234
+ "logps/chosen": -545.1489868164062,
1235
+ "logps/rejected": -542.4119873046875,
1236
+ "loss": 0.0461,
1237
+ "rewards/accuracies": 0.7875000238418579,
1238
+ "rewards/chosen": -2.230153799057007,
1239
+ "rewards/margins": 1.3988468647003174,
1240
+ "rewards/rejected": -3.6290009021759033,
1241
+ "step": 870
1242
+ },
1243
+ {
1244
+ "epoch": 1.0,
1245
+ "learning_rate": 1.7611898088715216e-11,
1246
+ "logits/chosen": -1.9472382068634033,
1247
+ "logits/rejected": -1.8980674743652344,
1248
+ "logps/chosen": -579.8809814453125,
1249
+ "logps/rejected": -560.6386108398438,
1250
+ "loss": 0.0525,
1251
+ "rewards/accuracies": 0.75,
1252
+ "rewards/chosen": -2.2138962745666504,
1253
+ "rewards/margins": 1.272986650466919,
1254
+ "rewards/rejected": -3.4868831634521484,
1255
+ "step": 880
1256
+ },
1257
  {
1258
  "epoch": 1.0,
1259
+ "step": 883,
1260
  "total_flos": 0.0,
1261
+ "train_loss": 0.08395375216871263,
1262
+ "train_runtime": 6990.4881,
1263
+ "train_samples_per_second": 16.169,
1264
+ "train_steps_per_second": 0.126
1265
  }
1266
  ],
1267
  "logging_steps": 10,
1268
+ "max_steps": 883,
1269
  "num_train_epochs": 1,
1270
  "save_steps": 1000,
1271
  "total_flos": 0.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69f452db3f4d6246f461dfea40f35365fa560c9e1d85d96dab21f005c15bec92
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8842f3333f3a8ac784afd1829b17d717e3c9fd1d2f4fe455962584f6e4e1556d
3
  size 5944