RikkiXu commited on
Commit
29a92ad
1 Parent(s): 8b7fe7f

Model save

Browse files
README.md CHANGED
@@ -52,7 +52,7 @@ The following hyperparameters were used during training:
52
 
53
  ### Framework versions
54
 
55
- - Transformers 4.39.3
56
  - Pytorch 2.1.2+cu118
57
  - Datasets 2.16.1
58
- - Tokenizers 0.15.2
 
52
 
53
  ### Framework versions
54
 
55
+ - Transformers 4.41.1
56
  - Pytorch 2.1.2+cu118
57
  - Datasets 2.16.1
58
+ - Tokenizers 0.19.1
all_results.json CHANGED
@@ -1,8 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 1.2283447361731714,
4
- "train_runtime": 7984.1405,
 
5
  "train_samples": 66084,
6
- "train_samples_per_second": 8.277,
7
- "train_steps_per_second": 0.065
8
  }
 
1
  {
2
+ "epoch": 0.9990319457889641,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.2622329578843228,
5
+ "train_runtime": 8815.3772,
6
  "train_samples": 66084,
7
+ "train_samples_per_second": 7.496,
8
+ "train_steps_per_second": 0.059
9
  }
config.json CHANGED
@@ -20,7 +20,7 @@
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
- "transformers_version": "4.39.3",
24
  "use_cache": false,
25
  "vocab_size": 32000
26
  }
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.41.1",
24
  "use_cache": false,
25
  "vocab_size": 32000
26
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.39.3"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.41.1"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b6137f8b0700bad4f2654974773825d428b43e1035f3f4e0d37e8828850a29b
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea42776f0193c66b4e372e22097d2fc65400f7bdb2f3ca46f982c79196cf49f7
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1ed4892d990ee5ca205366b7df90cb53ab573a0b504df190226b7d026710941
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eb3a99a80f98514c49768d558a81159ceee2e5cf406379207b21f1de70e3733
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1213c9ea174e5541cc65d907a55897621ce9630ffd180bef0bfb80e5753ab0b
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c08bb1fbdcc4816d468af7fec45a822a9da800bb4f19eff2e0946970a0e8a2a
3
  size 4540516344
runs/Jul08_05-29-38_n136-082-130/events.out.tfevents.1720387900.n136-082-130.2711574.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03326fe07ddd4b507a3ec9ac088acef22cba1ea772f58653e02f8ea1833f74a1
3
- size 40126
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8233040679669edc5de74dbf2c369f093b9a8fb17949255be5d238d6a8a3326b
3
+ size 41168
tokenizer.json CHANGED
@@ -134,6 +134,7 @@
134
  "end_of_word_suffix": null,
135
  "fuse_unk": true,
136
  "byte_fallback": true,
 
137
  "vocab": {
138
  "<unk>": 0,
139
  "<s>": 1,
 
134
  "end_of_word_suffix": null,
135
  "fuse_unk": true,
136
  "byte_fallback": true,
137
+ "ignore_merges": false,
138
  "vocab": {
139
  "<unk>": 0,
140
  "<s>": 1,
train_results.json CHANGED
@@ -1,8 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 1.2283447361731714,
4
- "train_runtime": 7984.1405,
 
5
  "train_samples": 66084,
6
- "train_samples_per_second": 8.277,
7
- "train_steps_per_second": 0.065
8
  }
 
1
  {
2
+ "epoch": 0.9990319457889641,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.2622329578843228,
5
+ "train_runtime": 8815.3772,
6
  "train_samples": 66084,
7
+ "train_samples_per_second": 7.496,
8
+ "train_steps_per_second": 0.059
9
  }
trainer_state.json CHANGED
@@ -9,8 +9,8 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0,
13
- "grad_norm": 3433.252872729438,
14
  "learning_rate": 1.9230769230769234e-11,
15
  "logits/chosen": -1.8683955669403076,
16
  "logits/rejected": -1.7658718824386597,
@@ -24,778 +24,778 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.02,
28
- "grad_norm": 3412.8742259369806,
29
  "learning_rate": 1.9230769230769234e-10,
30
- "logits/chosen": -1.6615941524505615,
31
- "logits/rejected": -1.6194777488708496,
32
- "logps/chosen": -0.9486016035079956,
33
- "logps/rejected": -0.9299763441085815,
34
- "loss": 1.1995,
35
- "rewards/accuracies": 0.3541666567325592,
36
- "rewards/chosen": -0.02906438708305359,
37
- "rewards/margins": -0.07331237941980362,
38
- "rewards/rejected": 0.044248003512620926,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.04,
43
- "grad_norm": 2858.7837493194793,
44
  "learning_rate": 3.8461538461538467e-10,
45
- "logits/chosen": -1.5834972858428955,
46
- "logits/rejected": -1.535605549812317,
47
- "logps/chosen": -1.024587631225586,
48
- "logps/rejected": -0.970314621925354,
49
- "loss": 1.2258,
50
- "rewards/accuracies": 0.5,
51
- "rewards/chosen": -0.014270206913352013,
52
- "rewards/margins": 0.010384765453636646,
53
- "rewards/rejected": -0.02465497888624668,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.06,
58
- "grad_norm": 3802.89735188374,
59
  "learning_rate": 5.769230769230769e-10,
60
- "logits/chosen": -1.5293736457824707,
61
- "logits/rejected": -1.4700099229812622,
62
- "logps/chosen": -1.003350019454956,
63
- "logps/rejected": -0.9810702204704285,
64
- "loss": 1.2405,
65
- "rewards/accuracies": 0.4937500059604645,
66
- "rewards/chosen": -0.04970189183950424,
67
- "rewards/margins": 0.030842384323477745,
68
- "rewards/rejected": -0.08054427057504654,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.08,
73
- "grad_norm": 3128.321622751937,
74
  "learning_rate": 7.692307692307693e-10,
75
- "logits/chosen": -1.554677963256836,
76
- "logits/rejected": -1.496991515159607,
77
- "logps/chosen": -1.0003422498703003,
78
- "logps/rejected": -0.9374068379402161,
79
- "loss": 1.216,
80
- "rewards/accuracies": 0.45625001192092896,
81
- "rewards/chosen": -0.05540431663393974,
82
- "rewards/margins": -0.12218408286571503,
83
- "rewards/rejected": 0.06677977740764618,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.1,
88
- "grad_norm": 3052.7964454970747,
89
  "learning_rate": 9.615384615384616e-10,
90
- "logits/chosen": -1.649513840675354,
91
- "logits/rejected": -1.6041488647460938,
92
- "logps/chosen": -0.9904917478561401,
93
- "logps/rejected": -0.9689529538154602,
94
- "loss": 1.2316,
95
- "rewards/accuracies": 0.5562499761581421,
96
- "rewards/chosen": -0.022712722420692444,
97
- "rewards/margins": 0.059487856924533844,
98
- "rewards/rejected": -0.08220058679580688,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.12,
103
- "grad_norm": 2902.252196246418,
104
  "learning_rate": 9.99266706925562e-10,
105
- "logits/chosen": -1.6003596782684326,
106
- "logits/rejected": -1.547429084777832,
107
- "logps/chosen": -0.9953937530517578,
108
- "logps/rejected": -0.9311510324478149,
109
- "loss": 1.2409,
110
- "rewards/accuracies": 0.5249999761581421,
111
- "rewards/chosen": 0.011818816885352135,
112
- "rewards/margins": -0.030796002596616745,
113
- "rewards/rejected": 0.04261482506990433,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.14,
118
- "grad_norm": 3820.124830724004,
119
  "learning_rate": 9.96291389741603e-10,
120
- "logits/chosen": -1.5929685831069946,
121
- "logits/rejected": -1.5064890384674072,
122
- "logps/chosen": -0.9960752725601196,
123
- "logps/rejected": -0.9465099573135376,
124
- "loss": 1.2095,
125
- "rewards/accuracies": 0.5249999761581421,
126
- "rewards/chosen": -0.005996152758598328,
127
- "rewards/margins": 0.0470072403550148,
128
- "rewards/rejected": -0.05300339311361313,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.15,
133
- "grad_norm": 3594.9924550893647,
134
  "learning_rate": 9.91041841371078e-10,
135
- "logits/chosen": -1.5460221767425537,
136
- "logits/rejected": -1.5286107063293457,
137
- "logps/chosen": -1.0364538431167603,
138
- "logps/rejected": -0.9859156608581543,
139
- "loss": 1.2414,
140
- "rewards/accuracies": 0.5249999761581421,
141
- "rewards/chosen": -0.10518498718738556,
142
- "rewards/margins": -0.0029987855814397335,
143
- "rewards/rejected": -0.10218620300292969,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.17,
148
- "grad_norm": 3142.656864448476,
149
  "learning_rate": 9.835421176144035e-10,
150
- "logits/chosen": -1.6783430576324463,
151
- "logits/rejected": -1.6198842525482178,
152
- "logps/chosen": -1.0106076002120972,
153
- "logps/rejected": -0.9245842099189758,
154
- "loss": 1.2421,
155
- "rewards/accuracies": 0.53125,
156
- "rewards/chosen": 0.11870207637548447,
157
- "rewards/margins": 0.038075316697359085,
158
- "rewards/rejected": 0.08062675595283508,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.19,
163
- "grad_norm": 3296.038760800627,
164
  "learning_rate": 9.738265855914014e-10,
165
- "logits/chosen": -1.6321983337402344,
166
- "logits/rejected": -1.5683090686798096,
167
- "logps/chosen": -0.9782571792602539,
168
- "logps/rejected": -0.9384816288948059,
169
- "loss": 1.2618,
170
- "rewards/accuracies": 0.46875,
171
- "rewards/chosen": -0.02403583563864231,
172
- "rewards/margins": -0.07467617094516754,
173
- "rewards/rejected": 0.05064033344388008,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.21,
178
- "grad_norm": 4108.091542633096,
179
  "learning_rate": 9.619397662556434e-10,
180
- "logits/chosen": -1.6637523174285889,
181
- "logits/rejected": -1.5871461629867554,
182
- "logps/chosen": -0.8984575271606445,
183
- "logps/rejected": -0.8728235363960266,
184
- "loss": 1.2356,
185
- "rewards/accuracies": 0.518750011920929,
186
- "rewards/chosen": 0.010543391108512878,
187
- "rewards/margins": -0.026898717507719994,
188
- "rewards/rejected": 0.03744211047887802,
189
  "step": 110
190
  },
191
  {
192
- "epoch": 0.23,
193
- "grad_norm": 3179.4902720939494,
194
  "learning_rate": 9.47936130379344e-10,
195
- "logits/chosen": -1.542689323425293,
196
- "logits/rejected": -1.5236142873764038,
197
- "logps/chosen": -0.9706069827079773,
198
- "logps/rejected": -0.9399257898330688,
199
- "loss": 1.2417,
200
- "rewards/accuracies": 0.53125,
201
- "rewards/chosen": 0.06870836764574051,
202
- "rewards/margins": 0.023814253509044647,
203
- "rewards/rejected": 0.04489411413669586,
204
  "step": 120
205
  },
206
  {
207
- "epoch": 0.25,
208
- "grad_norm": 3098.1125119687267,
209
  "learning_rate": 9.318798489436919e-10,
210
- "logits/chosen": -1.5796587467193604,
211
- "logits/rejected": -1.489713430404663,
212
- "logps/chosen": -0.9658042788505554,
213
- "logps/rejected": -0.9291502237319946,
214
- "loss": 1.2341,
215
- "rewards/accuracies": 0.5874999761581421,
216
- "rewards/chosen": 0.04142928868532181,
217
- "rewards/margins": 0.09168253093957901,
218
- "rewards/rejected": -0.050253234803676605,
219
  "step": 130
220
  },
221
  {
222
- "epoch": 0.27,
223
- "grad_norm": 3014.945040340012,
224
  "learning_rate": 9.138444990784454e-10,
225
- "logits/chosen": -1.575457215309143,
226
- "logits/rejected": -1.5276520252227783,
227
- "logps/chosen": -0.993288516998291,
228
- "logps/rejected": -0.9939154386520386,
229
- "loss": 1.2323,
230
- "rewards/accuracies": 0.46875,
231
- "rewards/chosen": -0.0799776241183281,
232
- "rewards/margins": -0.10535556077957153,
233
- "rewards/rejected": 0.025377947837114334,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.29,
238
- "grad_norm": 3223.119338557624,
239
  "learning_rate": 8.939127268983109e-10,
240
- "logits/chosen": -1.5617806911468506,
241
- "logits/rejected": -1.5375396013259888,
242
- "logps/chosen": -1.0731405019760132,
243
- "logps/rejected": -0.9959096908569336,
244
- "loss": 1.2408,
245
- "rewards/accuracies": 0.512499988079071,
246
- "rewards/chosen": 0.13223543763160706,
247
- "rewards/margins": 0.09327533096075058,
248
- "rewards/rejected": 0.038960110396146774,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.31,
253
- "grad_norm": 3104.4317123654555,
254
  "learning_rate": 8.721758687811352e-10,
255
- "logits/chosen": -1.6764981746673584,
256
- "logits/rejected": -1.6017143726348877,
257
- "logps/chosen": -0.9720491170883179,
258
- "logps/rejected": -0.9432562589645386,
259
- "loss": 1.2573,
260
- "rewards/accuracies": 0.41874998807907104,
261
- "rewards/chosen": -0.04389803856611252,
262
- "rewards/margins": -0.03539767116308212,
263
- "rewards/rejected": -0.008500367403030396,
264
  "step": 160
265
  },
266
  {
267
- "epoch": 0.33,
268
- "grad_norm": 3100.0404111969096,
269
  "learning_rate": 8.487335328233912e-10,
270
- "logits/chosen": -1.5355565547943115,
271
- "logits/rejected": -1.4408550262451172,
272
- "logps/chosen": -0.9955608248710632,
273
- "logps/rejected": -0.9735702276229858,
274
- "loss": 1.2433,
275
- "rewards/accuracies": 0.512499988079071,
276
- "rewards/chosen": 0.03805801272392273,
277
- "rewards/margins": 0.007754004094749689,
278
- "rewards/rejected": 0.030304009094834328,
279
  "step": 170
280
  },
281
  {
282
- "epoch": 0.35,
283
- "grad_norm": 3504.3078457085194,
284
  "learning_rate": 8.236931423909139e-10,
285
- "logits/chosen": -1.6720529794692993,
286
- "logits/rejected": -1.574114203453064,
287
- "logps/chosen": -0.9757201075553894,
288
- "logps/rejected": -0.9518011808395386,
289
- "loss": 1.2196,
290
- "rewards/accuracies": 0.512499988079071,
291
- "rewards/chosen": 0.01711990498006344,
292
- "rewards/margins": 0.07902441173791885,
293
- "rewards/rejected": -0.06190451234579086,
294
  "step": 180
295
  },
296
  {
297
- "epoch": 0.37,
298
- "grad_norm": 3090.0873072741374,
299
  "learning_rate": 7.971694438565449e-10,
300
- "logits/chosen": -1.6263911724090576,
301
- "logits/rejected": -1.563900113105774,
302
- "logps/chosen": -0.973171055316925,
303
- "logps/rejected": -0.9747421145439148,
304
- "loss": 1.226,
305
- "rewards/accuracies": 0.5375000238418579,
306
- "rewards/chosen": -0.01540590263903141,
307
- "rewards/margins": 0.06777185946702957,
308
- "rewards/rejected": -0.08317776024341583,
309
  "step": 190
310
  },
311
  {
312
- "epoch": 0.39,
313
- "grad_norm": 3280.208428429287,
314
  "learning_rate": 7.692839807804521e-10,
315
- "logits/chosen": -1.630990982055664,
316
- "logits/rejected": -1.5965864658355713,
317
- "logps/chosen": -0.973315417766571,
318
- "logps/rejected": -0.9266045689582825,
319
- "loss": 1.2517,
320
- "rewards/accuracies": 0.40625,
321
- "rewards/chosen": -0.06684601306915283,
322
- "rewards/margins": -0.1311662197113037,
323
- "rewards/rejected": 0.06432018429040909,
324
  "step": 200
325
  },
326
  {
327
- "epoch": 0.41,
328
- "grad_norm": 2910.1442572596316,
329
  "learning_rate": 7.401645369426697e-10,
330
- "logits/chosen": -1.606448769569397,
331
- "logits/rejected": -1.540171504020691,
332
- "logps/chosen": -0.9637352824211121,
333
- "logps/rejected": -0.9145343899726868,
334
- "loss": 1.215,
335
- "rewards/accuracies": 0.45625001192092896,
336
- "rewards/chosen": 0.009862110018730164,
337
- "rewards/margins": -0.023509711027145386,
338
- "rewards/rejected": 0.03337181732058525,
339
  "step": 210
340
  },
341
  {
342
- "epoch": 0.43,
343
- "grad_norm": 2974.409845511936,
344
  "learning_rate": 7.099445507801324e-10,
345
- "logits/chosen": -1.6375468969345093,
346
- "logits/rejected": -1.5900648832321167,
347
- "logps/chosen": -0.9990792274475098,
348
- "logps/rejected": -0.9409869909286499,
349
- "loss": 1.2144,
350
- "rewards/accuracies": 0.5249999761581421,
351
- "rewards/chosen": 0.09886778891086578,
352
- "rewards/margins": 0.0811978206038475,
353
- "rewards/rejected": 0.01766996458172798,
354
  "step": 220
355
  },
356
  {
357
- "epoch": 0.45,
358
- "grad_norm": 3525.800329043237,
359
  "learning_rate": 6.7876250391152e-10,
360
- "logits/chosen": -1.5894132852554321,
361
- "logits/rejected": -1.5480855703353882,
362
- "logps/chosen": -0.951377272605896,
363
- "logps/rejected": -0.9713876843452454,
364
- "loss": 1.2267,
365
- "rewards/accuracies": 0.53125,
366
- "rewards/chosen": -0.002721360418945551,
367
- "rewards/margins": 0.05332719162106514,
368
- "rewards/rejected": -0.05604856088757515,
369
  "step": 230
370
  },
371
  {
372
- "epoch": 0.46,
373
- "grad_norm": 3442.0681577443684,
374
  "learning_rate": 6.467612865519674e-10,
375
- "logits/chosen": -1.6259033679962158,
376
- "logits/rejected": -1.600040078163147,
377
- "logps/chosen": -0.9880603551864624,
378
- "logps/rejected": -0.9100335836410522,
379
- "loss": 1.2605,
380
- "rewards/accuracies": 0.4749999940395355,
381
- "rewards/chosen": -0.07632036507129669,
382
- "rewards/margins": -0.12639155983924866,
383
- "rewards/rejected": 0.050071217119693756,
384
  "step": 240
385
  },
386
  {
387
- "epoch": 0.48,
388
- "grad_norm": 3065.812622616001,
389
  "learning_rate": 6.14087542725593e-10,
390
- "logits/chosen": -1.6481239795684814,
391
- "logits/rejected": -1.6146821975708008,
392
- "logps/chosen": -1.0201656818389893,
393
- "logps/rejected": -0.9347532391548157,
394
- "loss": 1.2724,
395
- "rewards/accuracies": 0.4312500059604645,
396
- "rewards/chosen": -0.04044584929943085,
397
- "rewards/margins": -0.13707883656024933,
398
- "rewards/rejected": 0.09663298726081848,
399
  "step": 250
400
  },
401
  {
402
- "epoch": 0.5,
403
- "grad_norm": 2828.71184697319,
404
  "learning_rate": 5.808909982763825e-10,
405
- "logits/chosen": -1.6454213857650757,
406
- "logits/rejected": -1.5548063516616821,
407
- "logps/chosen": -0.9938017129898071,
408
- "logps/rejected": -0.9485760927200317,
409
- "loss": 1.2281,
410
- "rewards/accuracies": 0.4937500059604645,
411
- "rewards/chosen": 0.030869582667946815,
412
- "rewards/margins": 0.039691779762506485,
413
- "rewards/rejected": -0.00882219523191452,
414
  "step": 260
415
  },
416
  {
417
- "epoch": 0.52,
418
- "grad_norm": 3024.1321839759044,
419
  "learning_rate": 5.473237747567806e-10,
420
- "logits/chosen": -1.6354761123657227,
421
- "logits/rejected": -1.5714191198349,
422
- "logps/chosen": -0.9696062803268433,
423
- "logps/rejected": -0.9574581980705261,
424
- "loss": 1.2206,
425
- "rewards/accuracies": 0.512499988079071,
426
- "rewards/chosen": -0.027052313089370728,
427
- "rewards/margins": -0.07983547449111938,
428
- "rewards/rejected": 0.05278315395116806,
429
  "step": 270
430
  },
431
  {
432
- "epoch": 0.54,
433
- "grad_norm": 3087.829887959573,
434
  "learning_rate": 5.135396923380673e-10,
435
- "logits/chosen": -1.568947434425354,
436
- "logits/rejected": -1.495872974395752,
437
- "logps/chosen": -0.9841231107711792,
438
- "logps/rejected": -0.9465670585632324,
439
- "loss": 1.2426,
440
  "rewards/accuracies": 0.4749999940395355,
441
- "rewards/chosen": 0.05226878449320793,
442
- "rewards/margins": -0.027065182104706764,
443
- "rewards/rejected": 0.07933396100997925,
444
  "step": 280
445
  },
446
  {
447
- "epoch": 0.56,
448
- "grad_norm": 3324.8251041085505,
449
  "learning_rate": 4.796935649368935e-10,
450
- "logits/chosen": -1.574679970741272,
451
- "logits/rejected": -1.4959328174591064,
452
- "logps/chosen": -1.0472538471221924,
453
- "logps/rejected": -0.993445098400116,
454
- "loss": 1.2037,
455
- "rewards/accuracies": 0.574999988079071,
456
- "rewards/chosen": 0.01891358196735382,
457
- "rewards/margins": 0.11695466190576553,
458
- "rewards/rejected": -0.09804105758666992,
459
  "step": 290
460
  },
461
  {
462
- "epoch": 0.58,
463
- "grad_norm": 3258.5962797542716,
464
  "learning_rate": 4.4594049078802925e-10,
465
- "logits/chosen": -1.599393606185913,
466
- "logits/rejected": -1.5013492107391357,
467
- "logps/chosen": -0.9566828012466431,
468
- "logps/rejected": -0.9199537038803101,
469
- "loss": 1.216,
470
- "rewards/accuracies": 0.574999988079071,
471
- "rewards/chosen": 0.06600774824619293,
472
- "rewards/margins": 0.06115701049566269,
473
- "rewards/rejected": 0.004850737750530243,
474
  "step": 300
475
  },
476
  {
477
- "epoch": 0.6,
478
- "grad_norm": 4156.011936794382,
479
  "learning_rate": 4.1243514171423466e-10,
480
- "logits/chosen": -1.5815703868865967,
481
- "logits/rejected": -1.5408540964126587,
482
- "logps/chosen": -0.9799707531929016,
483
- "logps/rejected": -0.9496101140975952,
484
- "loss": 1.2371,
485
- "rewards/accuracies": 0.45625001192092896,
486
- "rewards/chosen": -0.013506487011909485,
487
- "rewards/margins": -0.11432155221700668,
488
- "rewards/rejected": 0.10081508010625839,
489
  "step": 310
490
  },
491
  {
492
- "epoch": 0.62,
493
- "grad_norm": 3302.7976660213035,
494
  "learning_rate": 3.793310543501473e-10,
495
- "logits/chosen": -1.6618106365203857,
496
- "logits/rejected": -1.5896971225738525,
497
- "logps/chosen": -0.9786826968193054,
498
- "logps/rejected": -0.9558340907096863,
499
- "loss": 1.2255,
500
- "rewards/accuracies": 0.543749988079071,
501
- "rewards/chosen": -0.017104968428611755,
502
- "rewards/margins": 0.0024414777290076017,
503
- "rewards/rejected": -0.019546449184417725,
504
  "step": 320
505
  },
506
  {
507
- "epoch": 0.64,
508
- "grad_norm": 3509.209967592506,
509
  "learning_rate": 3.4677992656811053e-10,
510
- "logits/chosen": -1.6321818828582764,
511
- "logits/rejected": -1.6005859375,
512
- "logps/chosen": -1.0184519290924072,
513
- "logps/rejected": -0.9701558947563171,
514
- "loss": 1.2375,
515
- "rewards/accuracies": 0.518750011920929,
516
- "rewards/chosen": -0.007846951484680176,
517
- "rewards/margins": 0.018898997455835342,
518
- "rewards/rejected": -0.026745948940515518,
519
  "step": 330
520
  },
521
  {
522
- "epoch": 0.66,
523
- "grad_norm": 3710.6450277616004,
524
  "learning_rate": 3.149309223300428e-10,
525
- "logits/chosen": -1.5329395532608032,
526
- "logits/rejected": -1.5036195516586304,
527
- "logps/chosen": -1.0616767406463623,
528
- "logps/rejected": -0.9750264883041382,
529
- "loss": 1.2303,
530
- "rewards/accuracies": 0.53125,
531
- "rewards/chosen": 0.15331539511680603,
532
- "rewards/margins": 0.07396470010280609,
533
- "rewards/rejected": 0.07935069501399994,
534
  "step": 340
535
  },
536
  {
537
- "epoch": 0.68,
538
- "grad_norm": 3547.547904460223,
539
  "learning_rate": 2.8392998815082717e-10,
540
- "logits/chosen": -1.65792715549469,
541
- "logits/rejected": -1.564651608467102,
542
- "logps/chosen": -1.0541561841964722,
543
- "logps/rejected": -1.0552833080291748,
544
- "loss": 1.2058,
545
- "rewards/accuracies": 0.574999988079071,
546
- "rewards/chosen": 0.09947662055492401,
547
- "rewards/margins": 0.0944891944527626,
548
- "rewards/rejected": 0.0049874186515808105,
549
  "step": 350
550
  },
551
  {
552
- "epoch": 0.7,
553
- "grad_norm": 3220.5618942165656,
554
  "learning_rate": 2.5391918430549634e-10,
555
- "logits/chosen": -1.691556692123413,
556
- "logits/rejected": -1.629815697669983,
557
- "logps/chosen": -1.0243303775787354,
558
- "logps/rejected": -0.9425641894340515,
559
- "loss": 1.2104,
560
- "rewards/accuracies": 0.5375000238418579,
561
- "rewards/chosen": 0.0219818577170372,
562
- "rewards/margins": 0.0395941361784935,
563
- "rewards/rejected": -0.017612282186746597,
564
  "step": 360
565
  },
566
  {
567
- "epoch": 0.72,
568
- "grad_norm": 3066.9468052938237,
569
  "learning_rate": 2.250360338449226e-10,
570
- "logits/chosen": -1.7176358699798584,
571
- "logits/rejected": -1.705020546913147,
572
- "logps/chosen": -0.9747244119644165,
573
- "logps/rejected": -0.9268182516098022,
574
- "loss": 1.2136,
575
- "rewards/accuracies": 0.518750011920929,
576
- "rewards/chosen": 0.00225704163312912,
577
- "rewards/margins": 0.037154071033000946,
578
- "rewards/rejected": -0.034897033125162125,
579
  "step": 370
580
  },
581
  {
582
- "epoch": 0.74,
583
- "grad_norm": 3016.00166770142,
584
  "learning_rate": 1.9741289240311756e-10,
585
- "logits/chosen": -1.6244051456451416,
586
- "logits/rejected": -1.5759707689285278,
587
- "logps/chosen": -0.9922860860824585,
588
- "logps/rejected": -0.9526718854904175,
589
- "loss": 1.2207,
590
- "rewards/accuracies": 0.543749988079071,
591
- "rewards/chosen": 0.030346695333719254,
592
- "rewards/margins": 0.0755487009882927,
593
- "rewards/rejected": -0.04520200192928314,
594
  "step": 380
595
  },
596
  {
597
- "epoch": 0.76,
598
- "grad_norm": 2468.5679131308602,
599
  "learning_rate": 1.7117634168396773e-10,
600
- "logits/chosen": -1.6217002868652344,
601
- "logits/rejected": -1.5572141408920288,
602
- "logps/chosen": -1.0028107166290283,
603
- "logps/rejected": -0.9800904989242554,
604
- "loss": 1.224,
605
- "rewards/accuracies": 0.543749988079071,
606
- "rewards/chosen": 0.1051943302154541,
607
- "rewards/margins": 0.10344062000513077,
608
- "rewards/rejected": 0.0017537251114845276,
609
  "step": 390
610
  },
611
  {
612
- "epoch": 0.77,
613
- "grad_norm": 2795.143859667373,
614
  "learning_rate": 1.4644660940672628e-10,
615
- "logits/chosen": -1.6635463237762451,
616
- "logits/rejected": -1.5782734155654907,
617
- "logps/chosen": -0.9968119859695435,
618
- "logps/rejected": -0.95280522108078,
619
- "loss": 1.2085,
620
- "rewards/accuracies": 0.518750011920929,
621
- "rewards/chosen": 0.030616816133260727,
622
- "rewards/margins": -0.05183488130569458,
623
- "rewards/rejected": 0.08245169371366501,
624
  "step": 400
625
  },
626
  {
627
- "epoch": 0.79,
628
- "grad_norm": 4061.2855167991283,
629
  "learning_rate": 1.2333701836832813e-10,
630
- "logits/chosen": -1.6263242959976196,
631
- "logits/rejected": -1.561570405960083,
632
- "logps/chosen": -0.9721932411193848,
633
- "logps/rejected": -0.9435796737670898,
634
- "loss": 1.207,
635
- "rewards/accuracies": 0.48750001192092896,
636
- "rewards/chosen": 0.027876421809196472,
637
- "rewards/margins": 0.05892045423388481,
638
- "rewards/rejected": -0.031044036149978638,
639
  "step": 410
640
  },
641
  {
642
- "epoch": 0.81,
643
- "grad_norm": 3787.0389105243557,
644
  "learning_rate": 1.0195346714717813e-10,
645
- "logits/chosen": -1.5375678539276123,
646
- "logits/rejected": -1.5268394947052002,
647
- "logps/chosen": -0.9648769497871399,
648
- "logps/rejected": -0.9464458227157593,
649
- "loss": 1.2424,
650
- "rewards/accuracies": 0.4937500059604645,
651
- "rewards/chosen": -0.017999447882175446,
652
- "rewards/margins": -0.051319561898708344,
653
- "rewards/rejected": 0.0333201140165329,
654
  "step": 420
655
  },
656
  {
657
- "epoch": 0.83,
658
- "grad_norm": 3059.7951831954097,
659
  "learning_rate": 8.239394482805996e-11,
660
- "logits/chosen": -1.593246340751648,
661
- "logits/rejected": -1.5492476224899292,
662
- "logps/chosen": -1.0189164876937866,
663
- "logps/rejected": -0.9625579118728638,
664
- "loss": 1.2289,
665
- "rewards/accuracies": 0.5874999761581421,
666
- "rewards/chosen": 0.06651192903518677,
667
- "rewards/margins": 0.12220799922943115,
668
- "rewards/rejected": -0.05569607764482498,
669
  "step": 430
670
  },
671
  {
672
- "epoch": 0.85,
673
- "grad_norm": 3247.488164594565,
674
  "learning_rate": 6.474808197191401e-11,
675
- "logits/chosen": -1.6315078735351562,
676
- "logits/rejected": -1.5818345546722412,
677
- "logps/chosen": -1.046481966972351,
678
- "logps/rejected": -0.9752964973449707,
679
- "loss": 1.2091,
680
- "rewards/accuracies": 0.550000011920929,
681
- "rewards/chosen": 0.041832178831100464,
682
- "rewards/margins": 0.09823881089687347,
683
- "rewards/rejected": -0.05640662834048271,
684
  "step": 440
685
  },
686
  {
687
- "epoch": 0.87,
688
- "grad_norm": 3377.303480206014,
689
  "learning_rate": 4.9096739888146e-11,
690
- "logits/chosen": -1.6218818426132202,
691
- "logits/rejected": -1.5289857387542725,
692
- "logps/chosen": -1.0135427713394165,
693
- "logps/rejected": -0.9735710024833679,
694
- "loss": 1.2589,
695
- "rewards/accuracies": 0.550000011920929,
696
- "rewards/chosen": 0.15450438857078552,
697
- "rewards/margins": 0.1440540999174118,
698
- "rewards/rejected": 0.01045025885105133,
699
  "step": 450
700
  },
701
  {
702
- "epoch": 0.89,
703
- "grad_norm": 3083.4878845586436,
704
  "learning_rate": 3.5511640091604293e-11,
705
- "logits/chosen": -1.5636273622512817,
706
- "logits/rejected": -1.5262852907180786,
707
- "logps/chosen": -1.056396722793579,
708
- "logps/rejected": -0.9384132623672485,
709
- "loss": 1.2066,
710
- "rewards/accuracies": 0.48750001192092896,
711
- "rewards/chosen": -0.023233562707901,
712
- "rewards/margins": 0.048523545265197754,
713
- "rewards/rejected": -0.07175710052251816,
714
  "step": 460
715
  },
716
  {
717
- "epoch": 0.91,
718
- "grad_norm": 3283.918271980478,
719
  "learning_rate": 2.4055035642222225e-11,
720
- "logits/chosen": -1.637670874595642,
721
- "logits/rejected": -1.5689923763275146,
722
- "logps/chosen": -0.9840338826179504,
723
- "logps/rejected": -0.9136601686477661,
724
- "loss": 1.2087,
725
- "rewards/accuracies": 0.4625000059604645,
726
- "rewards/chosen": 0.046017058193683624,
727
- "rewards/margins": -0.03135695680975914,
728
- "rewards/rejected": 0.07737401872873306,
729
  "step": 470
730
  },
731
  {
732
- "epoch": 0.93,
733
- "grad_norm": 2984.402833421987,
734
  "learning_rate": 1.477942587339426e-11,
735
- "logits/chosen": -1.6090877056121826,
736
- "logits/rejected": -1.5660176277160645,
737
- "logps/chosen": -0.9489718675613403,
738
- "logps/rejected": -0.9073503613471985,
739
- "loss": 1.2087,
740
- "rewards/accuracies": 0.5249999761581421,
741
- "rewards/chosen": -0.04631403833627701,
742
- "rewards/margins": -0.06630536168813705,
743
- "rewards/rejected": 0.019991323351860046,
744
  "step": 480
745
  },
746
  {
747
- "epoch": 0.95,
748
- "grad_norm": 3305.4541171804303,
749
  "learning_rate": 7.727315816331515e-12,
750
- "logits/chosen": -1.657076120376587,
751
- "logits/rejected": -1.607134461402893,
752
- "logps/chosen": -1.0778063535690308,
753
- "logps/rejected": -1.0003712177276611,
754
- "loss": 1.2118,
755
- "rewards/accuracies": 0.518750011920929,
756
- "rewards/chosen": 0.044107623398303986,
757
- "rewards/margins": 0.06955888122320175,
758
- "rewards/rejected": -0.025451254099607468,
759
  "step": 490
760
  },
761
  {
762
- "epoch": 0.97,
763
- "grad_norm": 3282.0354146265427,
764
  "learning_rate": 2.9310214228202016e-12,
765
- "logits/chosen": -1.6225248575210571,
766
- "logits/rejected": -1.5484001636505127,
767
- "logps/chosen": -0.9589713215827942,
768
- "logps/rejected": -0.9175511598587036,
769
- "loss": 1.2114,
770
- "rewards/accuracies": 0.42500001192092896,
771
- "rewards/chosen": -0.06731443107128143,
772
- "rewards/margins": -0.1021970883011818,
773
- "rewards/rejected": 0.034882646054029465,
774
  "step": 500
775
  },
776
  {
777
- "epoch": 0.99,
778
- "grad_norm": 3258.514052979456,
779
  "learning_rate": 4.125214789427734e-13,
780
- "logits/chosen": -1.564023733139038,
781
- "logits/rejected": -1.50169837474823,
782
- "logps/chosen": -0.9931696653366089,
783
- "logps/rejected": -0.9385188221931458,
784
- "loss": 1.2461,
785
- "rewards/accuracies": 0.45625001192092896,
786
- "rewards/chosen": 0.05308717489242554,
787
- "rewards/margins": 0.01174162793904543,
788
- "rewards/rejected": 0.041345540434122086,
789
  "step": 510
790
  },
791
  {
792
- "epoch": 1.0,
793
  "step": 516,
794
  "total_flos": 0.0,
795
- "train_loss": 1.2283447361731714,
796
- "train_runtime": 7984.1405,
797
- "train_samples_per_second": 8.277,
798
- "train_steps_per_second": 0.065
799
  }
800
  ],
801
  "logging_steps": 10,
@@ -803,6 +803,18 @@
803
  "num_input_tokens_seen": 0,
804
  "num_train_epochs": 1,
805
  "save_steps": 100,
 
 
 
 
 
 
 
 
 
 
 
 
806
  "total_flos": 0.0,
807
  "train_batch_size": 4,
808
  "trial_name": null,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.001936108422071636,
13
+ "grad_norm": 4289.771574316242,
14
  "learning_rate": 1.9230769230769234e-11,
15
  "logits/chosen": -1.8683955669403076,
16
  "logits/rejected": -1.7658718824386597,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.01936108422071636,
28
+ "grad_norm": 4190.035270648744,
29
  "learning_rate": 1.9230769230769234e-10,
30
+ "logits/chosen": -1.6616647243499756,
31
+ "logits/rejected": -1.6193790435791016,
32
+ "logps/chosen": -0.9486603140830994,
33
+ "logps/rejected": -0.9298955202102661,
34
+ "loss": 1.2029,
35
+ "rewards/accuracies": 0.3333333432674408,
36
+ "rewards/chosen": -0.0510055311024189,
37
+ "rewards/margins": -0.1265178769826889,
38
+ "rewards/rejected": 0.0755123570561409,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.03872216844143272,
43
+ "grad_norm": 3083.054373722184,
44
  "learning_rate": 3.8461538461538467e-10,
45
+ "logits/chosen": -1.5834848880767822,
46
+ "logits/rejected": -1.5355803966522217,
47
+ "logps/chosen": -1.0245015621185303,
48
+ "logps/rejected": -0.9704240560531616,
49
+ "loss": 1.2623,
50
+ "rewards/accuracies": 0.5687500238418579,
51
+ "rewards/chosen": 0.0036691308487206697,
52
+ "rewards/margins": 0.06183544546365738,
53
+ "rewards/rejected": -0.05816630274057388,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.05808325266214908,
58
+ "grad_norm": 4908.628774557542,
59
  "learning_rate": 5.769230769230769e-10,
60
+ "logits/chosen": -1.5294702053070068,
61
+ "logits/rejected": -1.4707310199737549,
62
+ "logps/chosen": -1.0035126209259033,
63
+ "logps/rejected": -0.9810468554496765,
64
+ "loss": 1.2911,
65
+ "rewards/accuracies": 0.512499988079071,
66
+ "rewards/chosen": -0.10282345861196518,
67
+ "rewards/margins": -0.00801654439419508,
68
+ "rewards/rejected": -0.09480690211057663,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.07744433688286544,
73
+ "grad_norm": 3634.399853197314,
74
  "learning_rate": 7.692307692307693e-10,
75
+ "logits/chosen": -1.5545880794525146,
76
+ "logits/rejected": -1.4971392154693604,
77
+ "logps/chosen": -1.0002901554107666,
78
+ "logps/rejected": -0.9374505281448364,
79
+ "loss": 1.2544,
80
+ "rewards/accuracies": 0.4375,
81
+ "rewards/chosen": -0.056221622973680496,
82
+ "rewards/margins": -0.12876693904399872,
83
+ "rewards/rejected": 0.07254532724618912,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.0968054211035818,
88
+ "grad_norm": 3743.017714256134,
89
  "learning_rate": 9.615384615384616e-10,
90
+ "logits/chosen": -1.6496717929840088,
91
+ "logits/rejected": -1.6047435998916626,
92
+ "logps/chosen": -0.990554928779602,
93
+ "logps/rejected": -0.968321681022644,
94
+ "loss": 1.2784,
95
+ "rewards/accuracies": 0.512499988079071,
96
+ "rewards/chosen": -0.044189296662807465,
97
+ "rewards/margins": -0.09925868362188339,
98
+ "rewards/rejected": 0.05506938695907593,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.11616650532429816,
103
+ "grad_norm": 3989.4789916346926,
104
  "learning_rate": 9.99266706925562e-10,
105
+ "logits/chosen": -1.6014108657836914,
106
+ "logits/rejected": -1.5485652685165405,
107
+ "logps/chosen": -0.9952412843704224,
108
+ "logps/rejected": -0.9312202334403992,
109
+ "loss": 1.2834,
110
+ "rewards/accuracies": 0.53125,
111
+ "rewards/chosen": 0.05288747698068619,
112
+ "rewards/margins": 0.01693376898765564,
113
+ "rewards/rejected": 0.03595370799303055,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.1355275895450145,
118
+ "grad_norm": 4801.740697750488,
119
  "learning_rate": 9.96291389741603e-10,
120
+ "logits/chosen": -1.5917268991470337,
121
+ "logits/rejected": -1.505897045135498,
122
+ "logps/chosen": -0.9961442947387695,
123
+ "logps/rejected": -0.946877658367157,
124
+ "loss": 1.2574,
125
+ "rewards/accuracies": 0.550000011920929,
126
+ "rewards/chosen": -0.024737417697906494,
127
+ "rewards/margins": 0.13343551754951477,
128
+ "rewards/rejected": -0.15817293524742126,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.15488867376573087,
133
+ "grad_norm": 4317.0356676312595,
134
  "learning_rate": 9.91041841371078e-10,
135
+ "logits/chosen": -1.5460880994796753,
136
+ "logits/rejected": -1.5285828113555908,
137
+ "logps/chosen": -1.0357868671417236,
138
+ "logps/rejected": -0.985648512840271,
139
+ "loss": 1.2938,
140
+ "rewards/accuracies": 0.512499988079071,
141
+ "rewards/chosen": 0.03525885194540024,
142
+ "rewards/margins": 0.09618574380874634,
143
+ "rewards/rejected": -0.0609268844127655,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.17424975798644723,
148
+ "grad_norm": 4079.0129090665787,
149
  "learning_rate": 9.835421176144035e-10,
150
+ "logits/chosen": -1.678056001663208,
151
+ "logits/rejected": -1.6193243265151978,
152
+ "logps/chosen": -1.0110713243484497,
153
+ "logps/rejected": -0.9249661564826965,
154
+ "loss": 1.2586,
155
+ "rewards/accuracies": 0.512499988079071,
156
+ "rewards/chosen": 0.03245333582162857,
157
+ "rewards/margins": 0.027137070894241333,
158
+ "rewards/rejected": 0.005316261202096939,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.1936108422071636,
163
+ "grad_norm": 4056.2821430675544,
164
  "learning_rate": 9.738265855914014e-10,
165
+ "logits/chosen": -1.6319992542266846,
166
+ "logits/rejected": -1.5686506032943726,
167
+ "logps/chosen": -0.978125274181366,
168
+ "logps/rejected": -0.9383082389831543,
169
+ "loss": 1.2723,
170
+ "rewards/accuracies": 0.42500001192092896,
171
+ "rewards/chosen": 0.002950614783912897,
172
+ "rewards/margins": -0.10372404754161835,
173
+ "rewards/rejected": 0.10667465627193451,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.21297192642787996,
178
+ "grad_norm": 5228.540901226267,
179
  "learning_rate": 9.619397662556434e-10,
180
+ "logits/chosen": -1.6645421981811523,
181
+ "logits/rejected": -1.588181495666504,
182
+ "logps/chosen": -0.8982473611831665,
183
+ "logps/rejected": -0.8726890683174133,
184
+ "loss": 1.2749,
185
+ "rewards/accuracies": 0.5062500238418579,
186
+ "rewards/chosen": 0.06572394073009491,
187
+ "rewards/margins": -0.014699941501021385,
188
+ "rewards/rejected": 0.08042389899492264,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.23233301064859632,
193
+ "grad_norm": 3837.5381027526573,
194
  "learning_rate": 9.47936130379344e-10,
195
+ "logits/chosen": -1.5425523519515991,
196
+ "logits/rejected": -1.5234339237213135,
197
+ "logps/chosen": -0.9705616235733032,
198
+ "logps/rejected": -0.9401592016220093,
199
+ "loss": 1.2288,
200
+ "rewards/accuracies": 0.543749988079071,
201
+ "rewards/chosen": 0.0972176343202591,
202
+ "rewards/margins": 0.09945462644100189,
203
+ "rewards/rejected": -0.0022369951475411654,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.25169409486931266,
208
+ "grad_norm": 3814.5542415994746,
209
  "learning_rate": 9.318798489436919e-10,
210
+ "logits/chosen": -1.579369306564331,
211
+ "logits/rejected": -1.4889588356018066,
212
+ "logps/chosen": -0.9654415845870972,
213
+ "logps/rejected": -0.9286600351333618,
214
+ "loss": 1.2977,
215
+ "rewards/accuracies": 0.53125,
216
+ "rewards/chosen": 0.14246916770935059,
217
+ "rewards/margins": 0.08274303376674652,
218
+ "rewards/rejected": 0.05972614139318466,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.271055179090029,
223
+ "grad_norm": 3751.478373618925,
224
  "learning_rate": 9.138444990784454e-10,
225
+ "logits/chosen": -1.5760021209716797,
226
+ "logits/rejected": -1.5278505086898804,
227
+ "logps/chosen": -0.9930068850517273,
228
+ "logps/rejected": -0.9939811825752258,
229
+ "loss": 1.2583,
230
+ "rewards/accuracies": 0.4625000059604645,
231
+ "rewards/chosen": -0.029570287093520164,
232
+ "rewards/margins": -0.044873449951410294,
233
+ "rewards/rejected": 0.015303166583180428,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.2904162633107454,
238
+ "grad_norm": 3907.5968331867316,
239
  "learning_rate": 8.939127268983109e-10,
240
+ "logits/chosen": -1.5609729290008545,
241
+ "logits/rejected": -1.5375111103057861,
242
+ "logps/chosen": -1.0732953548431396,
243
+ "logps/rejected": -0.9959889650344849,
244
+ "loss": 1.2631,
245
+ "rewards/accuracies": 0.53125,
246
+ "rewards/chosen": 0.12664008140563965,
247
+ "rewards/margins": 0.0977608785033226,
248
+ "rewards/rejected": 0.02887919172644615,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.30977734753146174,
253
+ "grad_norm": 4306.071797965924,
254
  "learning_rate": 8.721758687811352e-10,
255
+ "logits/chosen": -1.6766704320907593,
256
+ "logits/rejected": -1.6015819311141968,
257
+ "logps/chosen": -0.9719392657279968,
258
+ "logps/rejected": -0.9425565600395203,
259
+ "loss": 1.3024,
260
+ "rewards/accuracies": 0.44999998807907104,
261
+ "rewards/chosen": -0.02740299701690674,
262
+ "rewards/margins": -0.19174733757972717,
263
+ "rewards/rejected": 0.16434435546398163,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 0.3291384317521781,
268
+ "grad_norm": 4085.902013165705,
269
  "learning_rate": 8.487335328233912e-10,
270
+ "logits/chosen": -1.5358489751815796,
271
+ "logits/rejected": -1.4411264657974243,
272
+ "logps/chosen": -0.9960495829582214,
273
+ "logps/rejected": -0.9736671447753906,
274
+ "loss": 1.2834,
275
+ "rewards/accuracies": 0.5,
276
+ "rewards/chosen": -0.07462203502655029,
277
+ "rewards/margins": -0.08827298879623413,
278
+ "rewards/rejected": 0.013650953769683838,
279
  "step": 170
280
  },
281
  {
282
+ "epoch": 0.34849951597289447,
283
+ "grad_norm": 4830.939949744352,
284
  "learning_rate": 8.236931423909139e-10,
285
+ "logits/chosen": -1.6721004247665405,
286
+ "logits/rejected": -1.5736545324325562,
287
+ "logps/chosen": -0.9753511548042297,
288
+ "logps/rejected": -0.9517275094985962,
289
+ "loss": 1.2611,
290
+ "rewards/accuracies": 0.5562499761581421,
291
+ "rewards/chosen": 0.11362428963184357,
292
+ "rewards/margins": 0.17257475852966309,
293
+ "rewards/rejected": -0.05895046144723892,
294
  "step": 180
295
  },
296
  {
297
+ "epoch": 0.36786060019361083,
298
+ "grad_norm": 4117.362535725707,
299
  "learning_rate": 7.971694438565449e-10,
300
+ "logits/chosen": -1.6258817911148071,
301
+ "logits/rejected": -1.563320517539978,
302
+ "logps/chosen": -0.9729933738708496,
303
+ "logps/rejected": -0.9748104810714722,
304
+ "loss": 1.2509,
305
+ "rewards/accuracies": 0.5562499761581421,
306
+ "rewards/chosen": 0.025180306285619736,
307
+ "rewards/margins": 0.146215558052063,
308
+ "rewards/rejected": -0.12103524059057236,
309
  "step": 190
310
  },
311
  {
312
+ "epoch": 0.3872216844143272,
313
+ "grad_norm": 4321.444634590991,
314
  "learning_rate": 7.692839807804521e-10,
315
+ "logits/chosen": -1.6311115026474,
316
+ "logits/rejected": -1.5964945554733276,
317
+ "logps/chosen": -0.9733870625495911,
318
+ "logps/rejected": -0.9268864393234253,
319
+ "loss": 1.2991,
320
+ "rewards/accuracies": 0.4937500059604645,
321
+ "rewards/chosen": -0.10145823657512665,
322
+ "rewards/margins": -0.11137993633747101,
323
+ "rewards/rejected": 0.009921704418957233,
324
  "step": 200
325
  },
326
  {
327
+ "epoch": 0.40658276863504356,
328
+ "grad_norm": 4028.0205254352395,
329
  "learning_rate": 7.401645369426697e-10,
330
+ "logits/chosen": -1.6063648462295532,
331
+ "logits/rejected": -1.5398415327072144,
332
+ "logps/chosen": -0.9638331532478333,
333
+ "logps/rejected": -0.9147375822067261,
334
+ "loss": 1.2499,
335
+ "rewards/accuracies": 0.512499988079071,
336
+ "rewards/chosen": -0.012107854709029198,
337
+ "rewards/margins": -0.00303336838260293,
338
+ "rewards/rejected": -0.009074489586055279,
339
  "step": 210
340
  },
341
  {
342
+ "epoch": 0.4259438528557599,
343
+ "grad_norm": 4531.775441431939,
344
  "learning_rate": 7.099445507801324e-10,
345
+ "logits/chosen": -1.6380093097686768,
346
+ "logits/rejected": -1.589648962020874,
347
+ "logps/chosen": -0.999239444732666,
348
+ "logps/rejected": -0.9409993886947632,
349
+ "loss": 1.2661,
350
+ "rewards/accuracies": 0.543749988079071,
351
+ "rewards/chosen": 0.08351825177669525,
352
+ "rewards/margins": 0.06452666223049164,
353
+ "rewards/rejected": 0.018991602584719658,
354
  "step": 220
355
  },
356
  {
357
+ "epoch": 0.4453049370764763,
358
+ "grad_norm": 4190.546129288723,
359
  "learning_rate": 6.7876250391152e-10,
360
+ "logits/chosen": -1.5896263122558594,
361
+ "logits/rejected": -1.5483357906341553,
362
+ "logps/chosen": -0.9513187408447266,
363
+ "logps/rejected": -0.9711192846298218,
364
+ "loss": 1.2465,
365
+ "rewards/accuracies": 0.512499988079071,
366
+ "rewards/chosen": 0.011254754848778248,
367
+ "rewards/margins": 0.014234659262001514,
368
+ "rewards/rejected": -0.002979907440021634,
369
  "step": 230
370
  },
371
  {
372
+ "epoch": 0.46466602129719264,
373
+ "grad_norm": 4241.882144515242,
374
  "learning_rate": 6.467612865519674e-10,
375
+ "logits/chosen": -1.6262489557266235,
376
+ "logits/rejected": -1.6008937358856201,
377
+ "logps/chosen": -0.9875959157943726,
378
+ "logps/rejected": -0.9101887941360474,
379
+ "loss": 1.2863,
380
+ "rewards/accuracies": 0.48124998807907104,
381
+ "rewards/chosen": 0.02070781961083412,
382
+ "rewards/margins": -0.0031043351627886295,
383
+ "rewards/rejected": 0.023812144994735718,
384
  "step": 240
385
  },
386
  {
387
+ "epoch": 0.484027105517909,
388
+ "grad_norm": 3918.096198448725,
389
  "learning_rate": 6.14087542725593e-10,
390
+ "logits/chosen": -1.6483112573623657,
391
+ "logits/rejected": -1.6147336959838867,
392
+ "logps/chosen": -1.020185112953186,
393
+ "logps/rejected": -0.9349339604377747,
394
+ "loss": 1.2945,
395
+ "rewards/accuracies": 0.42500001192092896,
396
+ "rewards/chosen": -0.05541396886110306,
397
+ "rewards/margins": -0.13103903830051422,
398
+ "rewards/rejected": 0.07562507688999176,
399
  "step": 250
400
  },
401
  {
402
+ "epoch": 0.5033881897386253,
403
+ "grad_norm": 3626.964808344265,
404
  "learning_rate": 5.808909982763825e-10,
405
+ "logits/chosen": -1.6457252502441406,
406
+ "logits/rejected": -1.5552377700805664,
407
+ "logps/chosen": -0.9940276145935059,
408
+ "logps/rejected": -0.9484678506851196,
409
+ "loss": 1.274,
410
+ "rewards/accuracies": 0.45625001192092896,
411
+ "rewards/chosen": -0.017910266295075417,
412
+ "rewards/margins": -0.03398443013429642,
413
+ "rewards/rejected": 0.016074160113930702,
414
  "step": 260
415
  },
416
  {
417
+ "epoch": 0.5227492739593417,
418
+ "grad_norm": 3993.5474645065597,
419
  "learning_rate": 5.473237747567806e-10,
420
+ "logits/chosen": -1.6354029178619385,
421
+ "logits/rejected": -1.5715049505233765,
422
+ "logps/chosen": -0.9696807861328125,
423
+ "logps/rejected": -0.9575467109680176,
424
+ "loss": 1.2403,
425
+ "rewards/accuracies": 0.5249999761581421,
426
+ "rewards/chosen": -0.052422285079956055,
427
+ "rewards/margins": -0.0962902158498764,
428
+ "rewards/rejected": 0.04386794939637184,
429
  "step": 270
430
  },
431
  {
432
+ "epoch": 0.542110358180058,
433
+ "grad_norm": 4125.713570850562,
434
  "learning_rate": 5.135396923380673e-10,
435
+ "logits/chosen": -1.5687922239303589,
436
+ "logits/rejected": -1.495790719985962,
437
+ "logps/chosen": -0.9840999841690063,
438
+ "logps/rejected": -0.9467118978500366,
439
+ "loss": 1.274,
440
  "rewards/accuracies": 0.4749999940395355,
441
+ "rewards/chosen": 0.07109468430280685,
442
+ "rewards/margins": 0.008125528693199158,
443
+ "rewards/rejected": 0.0629691407084465,
444
  "step": 280
445
  },
446
  {
447
+ "epoch": 0.5614714424007744,
448
+ "grad_norm": 4172.164367513521,
449
  "learning_rate": 4.796935649368935e-10,
450
+ "logits/chosen": -1.5752254724502563,
451
+ "logits/rejected": -1.4961917400360107,
452
+ "logps/chosen": -1.047147512435913,
453
+ "logps/rejected": -0.9933904409408569,
454
+ "loss": 1.2562,
455
+ "rewards/accuracies": 0.6000000238418579,
456
+ "rewards/chosen": 0.0501960925757885,
457
+ "rewards/margins": 0.15902431309223175,
458
+ "rewards/rejected": -0.10882820934057236,
459
  "step": 290
460
  },
461
  {
462
+ "epoch": 0.5808325266214908,
463
+ "grad_norm": 4075.4312369744443,
464
  "learning_rate": 4.4594049078802925e-10,
465
+ "logits/chosen": -1.5983613729476929,
466
+ "logits/rejected": -1.501022458076477,
467
+ "logps/chosen": -0.9565431475639343,
468
+ "logps/rejected": -0.9199585914611816,
469
+ "loss": 1.251,
470
+ "rewards/accuracies": 0.518750011920929,
471
+ "rewards/chosen": 0.11743637174367905,
472
+ "rewards/margins": 0.11259187757968903,
473
+ "rewards/rejected": 0.004844509996473789,
474
  "step": 300
475
  },
476
  {
477
+ "epoch": 0.6001936108422071,
478
+ "grad_norm": 4415.771404967329,
479
  "learning_rate": 4.1243514171423466e-10,
480
+ "logits/chosen": -1.5818376541137695,
481
+ "logits/rejected": -1.5412604808807373,
482
+ "logps/chosen": -0.9797090291976929,
483
+ "logps/rejected": -0.9499006271362305,
484
+ "loss": 1.2733,
485
+ "rewards/accuracies": 0.46875,
486
+ "rewards/chosen": 0.04855315759778023,
487
+ "rewards/margins": -0.004827280528843403,
488
+ "rewards/rejected": 0.053380437195301056,
489
  "step": 310
490
  },
491
  {
492
+ "epoch": 0.6195546950629235,
493
+ "grad_norm": 4020.4622713375675,
494
  "learning_rate": 3.793310543501473e-10,
495
+ "logits/chosen": -1.661425232887268,
496
+ "logits/rejected": -1.589224100112915,
497
+ "logps/chosen": -0.9785275459289551,
498
+ "logps/rejected": -0.9554710388183594,
499
+ "loss": 1.281,
500
+ "rewards/accuracies": 0.48124998807907104,
501
+ "rewards/chosen": 0.017457595095038414,
502
+ "rewards/margins": -0.04888144135475159,
503
+ "rewards/rejected": 0.06633903831243515,
504
  "step": 320
505
  },
506
  {
507
+ "epoch": 0.6389157792836399,
508
+ "grad_norm": 4011.0955669502655,
509
  "learning_rate": 3.4677992656811053e-10,
510
+ "logits/chosen": -1.6328061819076538,
511
+ "logits/rejected": -1.6010538339614868,
512
+ "logps/chosen": -1.0184727907180786,
513
+ "logps/rejected": -0.9700371623039246,
514
+ "loss": 1.2624,
515
+ "rewards/accuracies": 0.46875,
516
+ "rewards/chosen": -0.015060502104461193,
517
+ "rewards/margins": -0.0113009512424469,
518
+ "rewards/rejected": -0.0037595562171190977,
519
  "step": 330
520
  },
521
  {
522
+ "epoch": 0.6582768635043562,
523
+ "grad_norm": 4234.390545263189,
524
  "learning_rate": 3.149309223300428e-10,
525
+ "logits/chosen": -1.532965064048767,
526
+ "logits/rejected": -1.5039539337158203,
527
+ "logps/chosen": -1.0615012645721436,
528
+ "logps/rejected": -0.975587010383606,
529
+ "loss": 1.2536,
530
+ "rewards/accuracies": 0.6187499761581421,
531
+ "rewards/chosen": 0.23547323048114777,
532
+ "rewards/margins": 0.2764241397380829,
533
+ "rewards/rejected": -0.04095090553164482,
534
  "step": 340
535
  },
536
  {
537
+ "epoch": 0.6776379477250726,
538
+ "grad_norm": 4310.225886680266,
539
  "learning_rate": 2.8392998815082717e-10,
540
+ "logits/chosen": -1.6574161052703857,
541
+ "logits/rejected": -1.5642915964126587,
542
+ "logps/chosen": -1.0545518398284912,
543
+ "logps/rejected": -1.055479645729065,
544
+ "loss": 1.2634,
545
+ "rewards/accuracies": 0.518750011920929,
546
+ "rewards/chosen": 0.025425389409065247,
547
+ "rewards/margins": 0.06826835870742798,
548
+ "rewards/rejected": -0.04284298047423363,
549
  "step": 350
550
  },
551
  {
552
+ "epoch": 0.6969990319457889,
553
+ "grad_norm": 3627.3654619519734,
554
  "learning_rate": 2.5391918430549634e-10,
555
+ "logits/chosen": -1.6917625665664673,
556
+ "logits/rejected": -1.6304069757461548,
557
+ "logps/chosen": -1.0242336988449097,
558
+ "logps/rejected": -0.9425589442253113,
559
+ "loss": 1.2358,
560
+ "rewards/accuracies": 0.550000011920929,
561
+ "rewards/chosen": 0.0516384020447731,
562
+ "rewards/margins": 0.07235859334468842,
563
+ "rewards/rejected": -0.020720209926366806,
564
  "step": 360
565
  },
566
  {
567
+ "epoch": 0.7163601161665053,
568
+ "grad_norm": 3729.350909018597,
569
  "learning_rate": 2.250360338449226e-10,
570
+ "logits/chosen": -1.7181346416473389,
571
+ "logits/rejected": -1.7049201726913452,
572
+ "logps/chosen": -0.9749780893325806,
573
+ "logps/rejected": -0.9269036054611206,
574
+ "loss": 1.2295,
575
+ "rewards/accuracies": 0.5249999761581421,
576
+ "rewards/chosen": -0.060578178614377975,
577
+ "rewards/margins": 0.004380314145237207,
578
+ "rewards/rejected": -0.06495849788188934,
579
  "step": 370
580
  },
581
  {
582
+ "epoch": 0.7357212003872217,
583
+ "grad_norm": 3635.364845851746,
584
  "learning_rate": 1.9741289240311756e-10,
585
+ "logits/chosen": -1.6250957250595093,
586
+ "logits/rejected": -1.5766202211380005,
587
+ "logps/chosen": -0.9923038482666016,
588
+ "logps/rejected": -0.95228511095047,
589
+ "loss": 1.279,
590
+ "rewards/accuracies": 0.5062500238418579,
591
+ "rewards/chosen": 0.03353533893823624,
592
+ "rewards/margins": -0.006679633166640997,
593
+ "rewards/rejected": 0.04021497443318367,
594
  "step": 380
595
  },
596
  {
597
+ "epoch": 0.755082284607938,
598
+ "grad_norm": 3489.267034208794,
599
  "learning_rate": 1.7117634168396773e-10,
600
+ "logits/chosen": -1.6205106973648071,
601
+ "logits/rejected": -1.5565264225006104,
602
+ "logps/chosen": -1.0025845766067505,
603
+ "logps/rejected": -0.9802389144897461,
604
+ "loss": 1.2588,
605
+ "rewards/accuracies": 0.53125,
606
+ "rewards/chosen": 0.1880512535572052,
607
+ "rewards/margins": 0.2229468822479248,
608
+ "rewards/rejected": -0.034895628690719604,
609
  "step": 390
610
  },
611
  {
612
+ "epoch": 0.7744433688286544,
613
+ "grad_norm": 3578.1741607281892,
614
  "learning_rate": 1.4644660940672628e-10,
615
+ "logits/chosen": -1.6632875204086304,
616
+ "logits/rejected": -1.5778895616531372,
617
+ "logps/chosen": -0.9968886375427246,
618
+ "logps/rejected": -0.9527746438980103,
619
+ "loss": 1.2605,
620
+ "rewards/accuracies": 0.48750001192092896,
621
+ "rewards/chosen": 0.01907891035079956,
622
+ "rewards/margins": -0.09163277596235275,
623
+ "rewards/rejected": 0.11071167141199112,
624
  "step": 400
625
  },
626
  {
627
+ "epoch": 0.7938044530493708,
628
+ "grad_norm": 4697.104208557154,
629
  "learning_rate": 1.2333701836832813e-10,
630
+ "logits/chosen": -1.6258773803710938,
631
+ "logits/rejected": -1.5613044500350952,
632
+ "logps/chosen": -0.9721547365188599,
633
+ "logps/rejected": -0.9434949159622192,
634
+ "loss": 1.2668,
635
+ "rewards/accuracies": 0.5249999761581421,
636
+ "rewards/chosen": 0.04442809149622917,
637
+ "rewards/margins": 0.06200051307678223,
638
+ "rewards/rejected": -0.01757242903113365,
639
  "step": 410
640
  },
641
  {
642
+ "epoch": 0.8131655372700871,
643
+ "grad_norm": 4633.7276813164935,
644
  "learning_rate": 1.0195346714717813e-10,
645
+ "logits/chosen": -1.537536859512329,
646
+ "logits/rejected": -1.5266730785369873,
647
+ "logps/chosen": -0.965029239654541,
648
+ "logps/rejected": -0.9463868141174316,
649
+ "loss": 1.2845,
650
+ "rewards/accuracies": 0.4437499940395355,
651
+ "rewards/chosen": -0.0605890154838562,
652
+ "rewards/margins": -0.1169745922088623,
653
+ "rewards/rejected": 0.05638556554913521,
654
  "step": 420
655
  },
656
  {
657
+ "epoch": 0.8325266214908035,
658
+ "grad_norm": 4035.95185171689,
659
  "learning_rate": 8.239394482805996e-11,
660
+ "logits/chosen": -1.5938284397125244,
661
+ "logits/rejected": -1.5492713451385498,
662
+ "logps/chosen": -1.0192432403564453,
663
+ "logps/rejected": -0.9626362919807434,
664
+ "loss": 1.2589,
665
+ "rewards/accuracies": 0.518750011920929,
666
+ "rewards/chosen": 0.001451274729333818,
667
+ "rewards/margins": 0.09064897149801254,
668
+ "rewards/rejected": -0.08919770270586014,
669
  "step": 430
670
  },
671
  {
672
+ "epoch": 0.8518877057115198,
673
+ "grad_norm": 4317.7906947065,
674
  "learning_rate": 6.474808197191401e-11,
675
+ "logits/chosen": -1.631870985031128,
676
+ "logits/rejected": -1.5819487571716309,
677
+ "logps/chosen": -1.0460567474365234,
678
+ "logps/rejected": -0.9752202033996582,
679
+ "loss": 1.2476,
680
+ "rewards/accuracies": 0.53125,
681
+ "rewards/chosen": 0.15858839452266693,
682
+ "rewards/margins": 0.21004195511341095,
683
+ "rewards/rejected": -0.051453519612550735,
684
  "step": 440
685
  },
686
  {
687
+ "epoch": 0.8712487899322362,
688
+ "grad_norm": 4211.087790819232,
689
  "learning_rate": 4.9096739888146e-11,
690
+ "logits/chosen": -1.6219565868377686,
691
+ "logits/rejected": -1.5289008617401123,
692
+ "logps/chosen": -1.013887643814087,
693
+ "logps/rejected": -0.9734300374984741,
694
+ "loss": 1.2654,
695
+ "rewards/accuracies": 0.518750011920929,
696
+ "rewards/chosen": 0.10689739137887955,
697
+ "rewards/margins": 0.05857623741030693,
698
+ "rewards/rejected": 0.04832116514444351,
699
  "step": 450
700
  },
701
  {
702
+ "epoch": 0.8906098741529526,
703
+ "grad_norm": 3769.26467222043,
704
  "learning_rate": 3.5511640091604293e-11,
705
+ "logits/chosen": -1.5646103620529175,
706
+ "logits/rejected": -1.5271342992782593,
707
+ "logps/chosen": -1.0559688806533813,
708
+ "logps/rejected": -0.9385896921157837,
709
+ "loss": 1.2187,
710
+ "rewards/accuracies": 0.543749988079071,
711
+ "rewards/chosen": 0.07790811359882355,
712
+ "rewards/margins": 0.21173524856567383,
713
+ "rewards/rejected": -0.13382713496685028,
714
  "step": 460
715
  },
716
  {
717
+ "epoch": 0.9099709583736689,
718
+ "grad_norm": 4757.142899979977,
719
  "learning_rate": 2.4055035642222225e-11,
720
+ "logits/chosen": -1.6382324695587158,
721
+ "logits/rejected": -1.5700418949127197,
722
+ "logps/chosen": -0.9839082956314087,
723
+ "logps/rejected": -0.9138771295547485,
724
+ "loss": 1.2532,
725
+ "rewards/accuracies": 0.4749999940395355,
726
+ "rewards/chosen": 0.08895201981067657,
727
+ "rewards/margins": 0.04646927863359451,
728
+ "rewards/rejected": 0.04248274117708206,
729
  "step": 470
730
  },
731
  {
732
+ "epoch": 0.9293320425943853,
733
+ "grad_norm": 3836.5764604419264,
734
  "learning_rate": 1.477942587339426e-11,
735
+ "logits/chosen": -1.6090694665908813,
736
+ "logits/rejected": -1.5662223100662231,
737
+ "logps/chosen": -0.9489457011222839,
738
+ "logps/rejected": -0.90757817029953,
739
+ "loss": 1.2642,
740
+ "rewards/accuracies": 0.48750001192092896,
741
+ "rewards/chosen": -0.05137147754430771,
742
+ "rewards/margins": -0.01941109262406826,
743
+ "rewards/rejected": -0.0319603867828846,
744
  "step": 480
745
  },
746
  {
747
+ "epoch": 0.9486931268151017,
748
+ "grad_norm": 4250.965954777766,
749
  "learning_rate": 7.727315816331515e-12,
750
+ "logits/chosen": -1.6554877758026123,
751
+ "logits/rejected": -1.6055545806884766,
752
+ "logps/chosen": -1.0777404308319092,
753
+ "logps/rejected": -1.0004560947418213,
754
+ "loss": 1.2376,
755
+ "rewards/accuracies": 0.543749988079071,
756
+ "rewards/chosen": 0.07162754982709885,
757
+ "rewards/margins": 0.12462921440601349,
758
+ "rewards/rejected": -0.05300166457891464,
759
  "step": 490
760
  },
761
  {
762
+ "epoch": 0.968054211035818,
763
+ "grad_norm": 3914.5437511702235,
764
  "learning_rate": 2.9310214228202016e-12,
765
+ "logits/chosen": -1.6229422092437744,
766
+ "logits/rejected": -1.5482664108276367,
767
+ "logps/chosen": -0.9587628245353699,
768
+ "logps/rejected": -0.9172463417053223,
769
+ "loss": 1.2265,
770
+ "rewards/accuracies": 0.44999998807907104,
771
+ "rewards/chosen": -0.032034747302532196,
772
+ "rewards/margins": -0.1518467366695404,
773
+ "rewards/rejected": 0.1198119968175888,
774
  "step": 500
775
  },
776
  {
777
+ "epoch": 0.9874152952565344,
778
+ "grad_norm": 4236.531801010516,
779
  "learning_rate": 4.125214789427734e-13,
780
+ "logits/chosen": -1.5640289783477783,
781
+ "logits/rejected": -1.5025882720947266,
782
+ "logps/chosen": -0.9930515289306641,
783
+ "logps/rejected": -0.9385469555854797,
784
+ "loss": 1.254,
785
+ "rewards/accuracies": 0.48750001192092896,
786
+ "rewards/chosen": 0.09590280055999756,
787
+ "rewards/margins": 0.0512530580163002,
788
+ "rewards/rejected": 0.04464975371956825,
789
  "step": 510
790
  },
791
  {
792
+ "epoch": 0.9990319457889641,
793
  "step": 516,
794
  "total_flos": 0.0,
795
+ "train_loss": 1.2622329578843228,
796
+ "train_runtime": 8815.3772,
797
+ "train_samples_per_second": 7.496,
798
+ "train_steps_per_second": 0.059
799
  }
800
  ],
801
  "logging_steps": 10,
 
803
  "num_input_tokens_seen": 0,
804
  "num_train_epochs": 1,
805
  "save_steps": 100,
806
+ "stateful_callbacks": {
807
+ "TrainerControl": {
808
+ "args": {
809
+ "should_epoch_stop": false,
810
+ "should_evaluate": false,
811
+ "should_log": false,
812
+ "should_save": true,
813
+ "should_training_stop": false
814
+ },
815
+ "attributes": {}
816
+ }
817
+ },
818
  "total_flos": 0.0,
819
  "train_batch_size": 4,
820
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b98ffc796b97b2fbb0115b6523bcf5cf2f91af7851d4af544ee956eaa778a5f3
3
- size 6328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd72d8e0cd286217c9a7fddf58a057686f97c0f94432feaca0ca148ca8dbc12e
3
+ size 6520