wzhouad committed
Commit
8a50c97
1 Parent(s): e378454

Model save

README.md CHANGED
@@ -35,14 +35,14 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 5e-07
38
- - train_batch_size: 8
39
- - eval_batch_size: 8
40
- - seed: 5
41
  - distributed_type: multi-GPU
42
  - num_devices: 8
43
- - gradient_accumulation_steps: 2
44
  - total_train_batch_size: 128
45
- - total_eval_batch_size: 64
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.1
@@ -54,7 +54,7 @@ The following hyperparameters were used during training:
54
 
55
  ### Framework versions
56
 
57
- - Transformers 4.35.2
58
  - Pytorch 2.1.2+cu121
59
  - Datasets 2.14.6
60
- - Tokenizers 0.14.1
 
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 5e-07
38
+ - train_batch_size: 1
39
+ - eval_batch_size: 1
40
+ - seed: 42
41
  - distributed_type: multi-GPU
42
  - num_devices: 8
43
+ - gradient_accumulation_steps: 16
44
  - total_train_batch_size: 128
45
+ - total_eval_batch_size: 8
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.1
 
54
 
55
  ### Framework versions
56
 
57
+ - Transformers 4.41.1
58
  - Pytorch 2.1.2+cu121
59
  - Datasets 2.14.6
60
+ - Tokenizers 0.19.1
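Note: the updated effective batch sizes follow from the per-device settings above. A minimal sketch of the arithmetic (all values copied from the README hyperparameters; the script itself is purely illustrative):

```python
# Sketch: how the README totals follow from the per-device settings.
train_batch_size = 1           # per device
eval_batch_size = 1            # per device
num_devices = 8
gradient_accumulation_steps = 16

total_train_batch_size = train_batch_size * num_devices * gradient_accumulation_steps
total_eval_batch_size = eval_batch_size * num_devices

assert total_train_batch_size == 128   # matches "total_train_batch_size: 128"
assert total_eval_batch_size == 8      # matches "total_eval_batch_size: 8"
```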
all_results.json CHANGED
@@ -1,8 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.0972615369308142,
4
- "train_runtime": 3208.985,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 16.171,
7
- "train_steps_per_second": 0.126
 
8
  }
 
1
  {
2
+ "epoch": 0.9998009950248756,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.56497666932595,
5
+ "train_runtime": 62670.8183,
6
+ "train_samples": 160800,
7
+ "train_samples_per_second": 2.566,
8
+ "train_steps_per_second": 0.02
9
  }
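The updated figures are mutually consistent. A short sketch of how the fractional epoch and throughput numbers relate, assuming the effective batch size of 128 from the README and that the final partial batch is dropped (both assumptions, not stated in this commit):

```python
# Sketch: consistency check of the updated all_results.json values.
train_samples = 160800
total_train_batch_size = 128        # from the README hyperparameters
train_runtime = 62670.8183          # seconds

steps_per_epoch = train_samples / total_train_batch_size   # 1256.25
max_steps = int(steps_per_epoch)                            # 1256 if the partial batch is dropped

print(max_steps / steps_per_epoch)                 # ~0.9998009950248756, the logged "epoch"
print(round(train_samples / train_runtime, 3))     # ~2.566 train_samples_per_second
print(round(max_steps / train_runtime, 3))         # ~0.02  train_steps_per_second
```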
config.json CHANGED
@@ -3,6 +3,7 @@
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
 
6
  "bos_token_id": 1,
7
  "eos_token_id": 2,
8
  "hidden_act": "silu",
@@ -19,7 +20,7 @@
19
  "sliding_window": 4096,
20
  "tie_word_embeddings": false,
21
  "torch_dtype": "bfloat16",
22
- "transformers_version": "4.35.2",
23
  "use_cache": false,
24
  "vocab_size": 32000
25
  }
 
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
6
+ "attention_dropout": 0.0,
7
  "bos_token_id": 1,
8
  "eos_token_id": 2,
9
  "hidden_act": "silu",
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.41.1",
24
  "use_cache": false,
25
  "vocab_size": 32000
26
  }
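The config.json change only adds `attention_dropout: 0.0` and bumps `transformers_version`; loading is unaffected. A minimal, hypothetical loading sketch (the path below is a placeholder, not the actual repo id):

```python
# Sketch: loading the checkpoint described by this config.json.
# "path/to/this-checkpoint" is a placeholder; substitute the real repo id or local dir.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/this-checkpoint")
assert config.architectures == ["MistralForCausalLM"]
assert config.vocab_size == 32000 and config.sliding_window == 4096

model = AutoModelForCausalLM.from_pretrained(
    "path/to/this-checkpoint",
    torch_dtype=torch.bfloat16,   # matches "torch_dtype": "bfloat16"
)
```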
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.35.2"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.41.1"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e0a7d68eaea3c4ea34548f08b3bfd85cbc9c97f6f470fa5af241332e1cd329c
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e11dcd61cc10207ffa429232e22f4cd1209b869b363230b1b23c8229fec53ac
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c22cbc782090250ae06de6e3181fa820c3c731fc028cf167d49f33ecfb1be66c
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:182675acf06d7733583a038c80e4ca834644ab7b1ee0b07cac182d9ff6d6bfce
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f4ac57d104c68b576a9438ec037c2727070a309956439d4dd7f1e28baf706ff
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b33509881c7f28a3653c4f33545c0462135ed2ffe7316c0e4ac0f29f06a94027
3
  size 4540516344
tokenizer.json CHANGED
@@ -134,6 +134,7 @@
134
  "end_of_word_suffix": null,
135
  "fuse_unk": true,
136
  "byte_fallback": true,
 
137
  "vocab": {
138
  "<unk>": 0,
139
  "<s>": 1,
 
134
  "end_of_word_suffix": null,
135
  "fuse_unk": true,
136
  "byte_fallback": true,
137
+ "ignore_merges": false,
138
  "vocab": {
139
  "<unk>": 0,
140
  "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
train_results.json CHANGED
@@ -1,8 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.0972615369308142,
4
- "train_runtime": 3208.985,
5
- "train_samples": 51894,
6
- "train_samples_per_second": 16.171,
7
- "train_steps_per_second": 0.126
 
8
  }
 
1
  {
2
+ "epoch": 0.9998009950248756,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.56497666932595,
5
+ "train_runtime": 62670.8183,
6
+ "train_samples": 160800,
7
+ "train_samples_per_second": 2.566,
8
+ "train_steps_per_second": 0.02
9
  }
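The trainer_state.json log below tracks preference-optimization metrics (logps/chosen, logps/rejected, rewards/margins, ...), and the new run's initial loss of ~0.6931 ≈ ln 2 is what a pairwise logistic objective gives at zero margin. A minimal, generic sketch of such a DPO-style loss; the beta value and the exact objective used for this checkpoint are assumptions, not taken from this commit:

```python
# Sketch: generic DPO-style pairwise loss on (chosen, rejected) log-probabilities.
# beta and the objective are illustrative assumptions; at zero margin the loss is
# -log(sigmoid(0)) = ln 2 ≈ 0.6931, matching the first logged step below.
import torch
import torch.nn.functional as F

def preference_loss(policy_chosen_logps, policy_rejected_logps,
                    ref_chosen_logps, ref_rejected_logps, beta=0.01):
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)      # "rewards/chosen"
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)  # "rewards/rejected"
    margins = chosen_rewards - rejected_rewards                           # "rewards/margins"
    loss = -F.logsigmoid(margins).mean()
    return loss, chosen_rewards, rejected_rewards

# At step 0 the policy equals the reference, so margins are 0 and loss = ln 2.
zero = torch.zeros(4)
loss, _, _ = preference_loss(zero, zero, zero, zero)
print(float(loss))  # ~0.6931
```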
trainer_state.json CHANGED
@@ -1,21 +1,22 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.998766954377312,
5
- "eval_steps": 1000,
6
- "global_step": 405,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0,
13
- "learning_rate": 1.2195121951219512e-08,
14
- "logits/chosen": -2.8695335388183594,
15
- "logits/rejected": -2.8522377014160156,
16
- "logps/chosen": -537.80126953125,
17
- "logps/rejected": -108.91968536376953,
18
- "loss": 0.4013,
 
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,580 +24,1909 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.02,
27
- "learning_rate": 1.219512195121951e-07,
28
- "logits/chosen": -2.800579786300659,
29
- "logits/rejected": -2.7510969638824463,
30
- "logps/chosen": -339.13104248046875,
31
- "logps/rejected": -113.41000366210938,
32
- "loss": 0.4182,
33
- "rewards/accuracies": 0.5486111044883728,
34
- "rewards/chosen": 0.0010713347001001239,
35
- "rewards/margins": 0.0017763269133865833,
36
- "rewards/rejected": -0.0007049919222481549,
 
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.05,
41
- "learning_rate": 2.439024390243902e-07,
42
- "logits/chosen": -2.816149950027466,
43
- "logits/rejected": -2.8076975345611572,
44
- "logps/chosen": -435.25836181640625,
45
- "logps/rejected": -116.08283996582031,
46
- "loss": 0.4089,
47
- "rewards/accuracies": 0.800000011920929,
48
- "rewards/chosen": 0.02127786912024021,
49
- "rewards/margins": 0.03858271613717079,
50
- "rewards/rejected": -0.01730484515428543,
 
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.07,
55
- "learning_rate": 3.6585365853658536e-07,
56
- "logits/chosen": -2.7240824699401855,
57
- "logits/rejected": -2.686880111694336,
58
- "logps/chosen": -437.80010986328125,
59
- "logps/rejected": -141.4352264404297,
60
- "loss": 0.392,
61
- "rewards/accuracies": 0.831250011920929,
62
- "rewards/chosen": 0.08666273951530457,
63
- "rewards/margins": 0.2057490050792694,
64
- "rewards/rejected": -0.11908626556396484,
 
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.1,
69
- "learning_rate": 4.878048780487804e-07,
70
- "logits/chosen": -2.5924689769744873,
71
- "logits/rejected": -2.5725345611572266,
72
- "logps/chosen": -414.5943908691406,
73
- "logps/rejected": -170.07090759277344,
74
- "loss": 0.3413,
75
- "rewards/accuracies": 0.7749999761581421,
76
- "rewards/chosen": 0.019387617707252502,
77
- "rewards/margins": 0.47896456718444824,
78
- "rewards/rejected": -0.45957690477371216,
 
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.12,
83
- "learning_rate": 4.992461696250783e-07,
84
- "logits/chosen": -2.465122938156128,
85
- "logits/rejected": -2.4514966011047363,
86
- "logps/chosen": -422.89739990234375,
87
- "logps/rejected": -215.4567108154297,
88
- "loss": 0.2558,
89
- "rewards/accuracies": 0.7250000238418579,
90
- "rewards/chosen": -0.1378343403339386,
91
- "rewards/margins": 0.8191972970962524,
92
- "rewards/rejected": -0.9570316076278687,
 
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.15,
97
- "learning_rate": 4.966461721767899e-07,
98
- "logits/chosen": -2.404264450073242,
99
- "logits/rejected": -2.3718972206115723,
100
- "logps/chosen": -380.6066589355469,
101
- "logps/rejected": -213.6814727783203,
102
- "loss": 0.1652,
103
- "rewards/accuracies": 0.75,
104
- "rewards/chosen": -0.3525749742984772,
105
- "rewards/margins": 0.8845361471176147,
106
- "rewards/rejected": -1.2371110916137695,
 
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.17,
111
- "learning_rate": 4.922100518015975e-07,
112
- "logits/chosen": -2.402127742767334,
113
- "logits/rejected": -2.364607572555542,
114
- "logps/chosen": -452.73553466796875,
115
- "logps/rejected": -278.8863220214844,
116
- "loss": 0.1213,
117
- "rewards/accuracies": 0.762499988079071,
118
- "rewards/chosen": -0.5289198160171509,
119
- "rewards/margins": 1.2455707788467407,
120
- "rewards/rejected": -1.7744905948638916,
 
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.2,
125
- "learning_rate": 4.859708325770919e-07,
126
- "logits/chosen": -2.3720970153808594,
127
- "logits/rejected": -2.31769061088562,
128
- "logps/chosen": -442.95330810546875,
129
- "logps/rejected": -290.1528625488281,
130
- "loss": 0.1225,
131
- "rewards/accuracies": 0.8374999761581421,
132
- "rewards/chosen": -0.32007747888565063,
133
- "rewards/margins": 1.4891936779022217,
134
- "rewards/rejected": -1.809271216392517,
 
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.22,
139
- "learning_rate": 4.779749614980225e-07,
140
- "logits/chosen": -2.404604434967041,
141
- "logits/rejected": -2.3565943241119385,
142
- "logps/chosen": -460.10040283203125,
143
- "logps/rejected": -335.51055908203125,
144
- "loss": 0.0885,
145
- "rewards/accuracies": 0.793749988079071,
146
- "rewards/chosen": -0.7478345632553101,
147
- "rewards/margins": 1.5196093320846558,
148
- "rewards/rejected": -2.2674436569213867,
 
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.25,
153
- "learning_rate": 4.682819627081427e-07,
154
- "logits/chosen": -2.369783878326416,
155
- "logits/rejected": -2.2862191200256348,
156
- "logps/chosen": -513.9479370117188,
157
- "logps/rejected": -387.48724365234375,
158
- "loss": 0.0749,
159
- "rewards/accuracies": 0.7250000238418579,
160
- "rewards/chosen": -0.9055101275444031,
161
- "rewards/margins": 1.8521873950958252,
162
- "rewards/rejected": -2.757697582244873,
 
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.27,
167
- "learning_rate": 4.569639943810477e-07,
168
- "logits/chosen": -2.3708443641662598,
169
- "logits/rejected": -2.2844862937927246,
170
- "logps/chosen": -536.1383056640625,
171
- "logps/rejected": -396.71923828125,
172
- "loss": 0.0627,
173
- "rewards/accuracies": 0.8187500238418579,
174
- "rewards/chosen": -0.9352647066116333,
175
- "rewards/margins": 1.9346840381622314,
176
- "rewards/rejected": -2.869948625564575,
 
177
  "step": 110
178
  },
179
  {
180
- "epoch": 0.3,
181
- "learning_rate": 4.4410531154874543e-07,
182
- "logits/chosen": -2.360198497772217,
183
- "logits/rejected": -2.2735414505004883,
184
- "logps/chosen": -555.21142578125,
185
- "logps/rejected": -448.2564392089844,
186
- "loss": 0.0447,
187
- "rewards/accuracies": 0.7875000238418579,
188
- "rewards/chosen": -1.2120835781097412,
189
- "rewards/margins": 2.0949184894561768,
190
- "rewards/rejected": -3.307002305984497,
 
191
  "step": 120
192
  },
193
  {
194
- "epoch": 0.32,
195
- "learning_rate": 4.298016388768561e-07,
196
- "logits/chosen": -2.346752643585205,
197
- "logits/rejected": -2.2457778453826904,
198
- "logps/chosen": -547.1868286132812,
199
- "logps/rejected": -450.1656188964844,
200
- "loss": 0.0567,
201
- "rewards/accuracies": 0.800000011920929,
202
- "rewards/chosen": -1.2032897472381592,
203
- "rewards/margins": 2.0359203815460205,
204
- "rewards/rejected": -3.2392101287841797,
 
205
  "step": 130
206
  },
207
  {
208
- "epoch": 0.35,
209
- "learning_rate": 4.1415945805573005e-07,
210
- "logits/chosen": -2.378035306930542,
211
- "logits/rejected": -2.312330484390259,
212
- "logps/chosen": -524.4827880859375,
213
- "logps/rejected": -409.43292236328125,
214
- "loss": 0.0564,
215
- "rewards/accuracies": 0.824999988079071,
216
- "rewards/chosen": -0.7949830889701843,
217
- "rewards/margins": 2.1438651084899902,
218
- "rewards/rejected": -2.9388484954833984,
 
219
  "step": 140
220
  },
221
  {
222
- "epoch": 0.37,
223
- "learning_rate": 3.972952151123984e-07,
224
- "logits/chosen": -2.367128849029541,
225
- "logits/rejected": -2.2979280948638916,
226
- "logps/chosen": -505.3435974121094,
227
- "logps/rejected": -405.11370849609375,
228
- "loss": 0.0673,
229
- "rewards/accuracies": 0.8812500238418579,
230
- "rewards/chosen": -0.8093282580375671,
231
- "rewards/margins": 1.9671752452850342,
232
- "rewards/rejected": -2.776503324508667,
 
233
  "step": 150
234
  },
235
  {
236
- "epoch": 0.39,
237
- "learning_rate": 3.793344535444142e-07,
238
- "logits/chosen": -2.397578239440918,
239
- "logits/rejected": -2.326962947845459,
240
- "logps/chosen": -546.9036865234375,
241
- "logps/rejected": -425.72119140625,
242
- "loss": 0.0522,
243
- "rewards/accuracies": 0.856249988079071,
244
- "rewards/chosen": -0.8053304553031921,
245
- "rewards/margins": 2.2930407524108887,
246
- "rewards/rejected": -3.098371744155884,
 
247
  "step": 160
248
  },
249
  {
250
- "epoch": 0.42,
251
- "learning_rate": 3.604108797288461e-07,
252
- "logits/chosen": -2.3404340744018555,
253
- "logits/rejected": -2.2825286388397217,
254
- "logps/chosen": -510.0309143066406,
255
- "logps/rejected": -448.40496826171875,
256
- "loss": 0.0549,
257
- "rewards/accuracies": 0.8125,
258
- "rewards/chosen": -1.0596367120742798,
259
- "rewards/margins": 2.231541633605957,
260
- "rewards/rejected": -3.2911782264709473,
 
261
  "step": 170
262
  },
263
  {
264
- "epoch": 0.44,
265
- "learning_rate": 3.40665367563858e-07,
266
- "logits/chosen": -2.36769437789917,
267
- "logits/rejected": -2.2915241718292236,
268
- "logps/chosen": -449.4844665527344,
269
- "logps/rejected": -359.50042724609375,
270
- "loss": 0.0779,
271
- "rewards/accuracies": 0.800000011920929,
272
- "rewards/chosen": -0.7590703964233398,
273
- "rewards/margins": 1.8968210220336914,
274
- "rewards/rejected": -2.655891180038452,
 
275
  "step": 180
276
  },
277
  {
278
- "epoch": 0.47,
279
- "learning_rate": 3.202449097526798e-07,
280
- "logits/chosen": -2.353964328765869,
281
- "logits/rejected": -2.2665138244628906,
282
- "logps/chosen": -505.6192932128906,
283
- "logps/rejected": -436.03594970703125,
284
- "loss": 0.0627,
285
- "rewards/accuracies": 0.800000011920929,
286
- "rewards/chosen": -1.1873983144760132,
287
- "rewards/margins": 2.045551300048828,
288
- "rewards/rejected": -3.2329494953155518,
 
289
  "step": 190
290
  },
291
  {
292
- "epoch": 0.49,
293
- "learning_rate": 2.993015235369905e-07,
294
- "logits/chosen": -2.31779146194458,
295
- "logits/rejected": -2.2251808643341064,
296
- "logps/chosen": -563.7548217773438,
297
- "logps/rejected": -458.24029541015625,
298
- "loss": 0.047,
299
- "rewards/accuracies": 0.8125,
300
- "rewards/chosen": -1.2938554286956787,
301
- "rewards/margins": 2.102210283279419,
302
- "rewards/rejected": -3.3960654735565186,
 
303
  "step": 200
304
  },
305
  {
306
- "epoch": 0.52,
307
- "learning_rate": 2.7799111902582693e-07,
308
- "logits/chosen": -2.2480106353759766,
309
- "logits/rejected": -2.1538870334625244,
310
- "logps/chosen": -541.3148193359375,
311
- "logps/rejected": -456.4246520996094,
312
- "loss": 0.0443,
313
- "rewards/accuracies": 0.7749999761581421,
314
- "rewards/chosen": -1.5714356899261475,
315
- "rewards/margins": 1.8790645599365234,
316
- "rewards/rejected": -3.45050048828125,
 
317
  "step": 210
318
  },
319
  {
320
- "epoch": 0.54,
321
- "learning_rate": 2.564723385445869e-07,
322
- "logits/chosen": -2.252617597579956,
323
- "logits/rejected": -2.1605441570281982,
324
- "logps/chosen": -563.362060546875,
325
- "logps/rejected": -470.49188232421875,
326
- "loss": 0.0489,
327
- "rewards/accuracies": 0.8187500238418579,
328
- "rewards/chosen": -1.2414028644561768,
329
- "rewards/margins": 2.133410930633545,
330
- "rewards/rejected": -3.3748135566711426,
 
331
  "step": 220
332
  },
333
  {
334
- "epoch": 0.57,
335
- "learning_rate": 2.3490537564442845e-07,
336
- "logits/chosen": -2.3041675090789795,
337
- "logits/rejected": -2.227656602859497,
338
- "logps/chosen": -516.6383056640625,
339
- "logps/rejected": -429.8155212402344,
340
- "loss": 0.0476,
341
- "rewards/accuracies": 0.7749999761581421,
342
- "rewards/chosen": -1.1023118495941162,
343
- "rewards/margins": 2.013040542602539,
344
- "rewards/rejected": -3.1153526306152344,
 
345
  "step": 230
346
  },
347
  {
348
- "epoch": 0.59,
349
- "learning_rate": 2.1345078256378801e-07,
350
- "logits/chosen": -2.3148865699768066,
351
- "logits/rejected": -2.2272677421569824,
352
- "logps/chosen": -514.4750366210938,
353
- "logps/rejected": -460.95562744140625,
354
- "loss": 0.0404,
355
- "rewards/accuracies": 0.831250011920929,
356
- "rewards/chosen": -1.0974665880203247,
357
- "rewards/margins": 2.3115158081054688,
358
- "rewards/rejected": -3.408982753753662,
 
359
  "step": 240
360
  },
361
  {
362
- "epoch": 0.62,
363
- "learning_rate": 1.9226827501969865e-07,
364
- "logits/chosen": -2.286684513092041,
365
- "logits/rejected": -2.1823794841766357,
366
- "logps/chosen": -561.267333984375,
367
- "logps/rejected": -515.2468872070312,
368
- "loss": 0.0491,
369
- "rewards/accuracies": 0.84375,
370
- "rewards/chosen": -1.3303524255752563,
371
- "rewards/margins": 2.644932985305786,
372
- "rewards/rejected": -3.975285291671753,
 
373
  "step": 250
374
  },
375
  {
376
- "epoch": 0.64,
377
- "learning_rate": 1.715155432264775e-07,
378
- "logits/chosen": -2.276379108428955,
379
- "logits/rejected": -2.1837644577026367,
380
- "logps/chosen": -511.8189392089844,
381
- "logps/rejected": -438.93511962890625,
382
- "loss": 0.0623,
383
- "rewards/accuracies": 0.831250011920929,
384
- "rewards/chosen": -1.2629977464675903,
385
- "rewards/margins": 2.0327014923095703,
386
- "rewards/rejected": -3.2956995964050293,
 
387
  "step": 260
388
  },
389
  {
390
- "epoch": 0.67,
391
- "learning_rate": 1.51347077992983e-07,
392
- "logits/chosen": -2.321298360824585,
393
- "logits/rejected": -2.2319440841674805,
394
- "logps/chosen": -495.376708984375,
395
- "logps/rejected": -422.41448974609375,
396
- "loss": 0.0556,
397
- "rewards/accuracies": 0.84375,
398
- "rewards/chosen": -0.9975100755691528,
399
- "rewards/margins": 2.0017402172088623,
400
- "rewards/rejected": -2.9992504119873047,
 
401
  "step": 270
402
  },
403
  {
404
- "epoch": 0.69,
405
- "learning_rate": 1.3191302063739906e-07,
406
- "logits/chosen": -2.3268749713897705,
407
- "logits/rejected": -2.2400355339050293,
408
- "logps/chosen": -513.5865478515625,
409
- "logps/rejected": -432.70721435546875,
410
- "loss": 0.0528,
411
- "rewards/accuracies": 0.824999988079071,
412
- "rewards/chosen": -1.1590217351913452,
413
- "rewards/margins": 2.0167365074157715,
414
- "rewards/rejected": -3.175758123397827,
 
415
  "step": 280
416
  },
417
  {
418
- "epoch": 0.72,
419
- "learning_rate": 1.1335804528119475e-07,
420
- "logits/chosen": -2.3036351203918457,
421
- "logits/rejected": -2.1903886795043945,
422
- "logps/chosen": -558.5098876953125,
423
- "logps/rejected": -453.27960205078125,
424
- "loss": 0.0408,
425
- "rewards/accuracies": 0.862500011920929,
426
- "rewards/chosen": -1.1243644952774048,
427
- "rewards/margins": 2.320279598236084,
428
- "rewards/rejected": -3.4446442127227783,
 
429
  "step": 290
430
  },
431
  {
432
- "epoch": 0.74,
433
- "learning_rate": 9.582028184286423e-08,
434
- "logits/chosen": -2.2981297969818115,
435
- "logits/rejected": -2.222533702850342,
436
- "logps/chosen": -494.6380310058594,
437
- "logps/rejected": -438.770263671875,
438
- "loss": 0.0439,
439
- "rewards/accuracies": 0.768750011920929,
440
- "rewards/chosen": -1.3560435771942139,
441
- "rewards/margins": 1.9616880416870117,
442
- "rewards/rejected": -3.3177313804626465,
 
443
  "step": 300
444
  },
445
  {
446
- "epoch": 0.76,
447
- "learning_rate": 7.943028774907065e-08,
448
- "logits/chosen": -2.288895845413208,
449
- "logits/rejected": -2.2124645709991455,
450
- "logps/chosen": -521.3748168945312,
451
- "logps/rejected": -440.52191162109375,
452
- "loss": 0.0439,
453
- "rewards/accuracies": 0.8062499761581421,
454
- "rewards/chosen": -1.1570526361465454,
455
- "rewards/margins": 2.175691604614258,
456
- "rewards/rejected": -3.3327438831329346,
 
457
  "step": 310
458
  },
459
  {
460
- "epoch": 0.79,
461
- "learning_rate": 6.431007601814637e-08,
462
- "logits/chosen": -2.3875412940979004,
463
- "logits/rejected": -2.293565511703491,
464
- "logps/chosen": -557.4725341796875,
465
- "logps/rejected": -475.80682373046875,
466
- "loss": 0.0443,
467
- "rewards/accuracies": 0.8062499761581421,
468
- "rewards/chosen": -1.295273780822754,
469
- "rewards/margins": 2.2585067749023438,
470
- "rewards/rejected": -3.5537807941436768,
 
471
  "step": 320
472
  },
473
  {
474
- "epoch": 0.81,
475
- "learning_rate": 5.0572206951246e-08,
476
- "logits/chosen": -2.3252406120300293,
477
- "logits/rejected": -2.215536594390869,
478
- "logps/chosen": -561.9892578125,
479
- "logps/rejected": -471.8111267089844,
480
- "loss": 0.0462,
481
- "rewards/accuracies": 0.856249988079071,
482
- "rewards/chosen": -1.0945953130722046,
483
- "rewards/margins": 2.469850540161133,
484
- "rewards/rejected": -3.564445972442627,
 
485
  "step": 330
486
  },
487
  {
488
- "epoch": 0.84,
489
- "learning_rate": 3.831895019292897e-08,
490
- "logits/chosen": -2.359334707260132,
491
- "logits/rejected": -2.2646141052246094,
492
- "logps/chosen": -575.1267700195312,
493
- "logps/rejected": -495.489990234375,
494
- "loss": 0.0449,
495
- "rewards/accuracies": 0.84375,
496
- "rewards/chosen": -1.101499080657959,
497
- "rewards/margins": 2.532172679901123,
498
- "rewards/rejected": -3.633671283721924,
 
499
  "step": 340
500
  },
501
  {
502
- "epoch": 0.86,
503
- "learning_rate": 2.764152339909756e-08,
504
- "logits/chosen": -2.3170971870422363,
505
- "logits/rejected": -2.2212605476379395,
506
- "logps/chosen": -514.919677734375,
507
- "logps/rejected": -437.05328369140625,
508
- "loss": 0.0519,
509
- "rewards/accuracies": 0.8374999761581421,
510
- "rewards/chosen": -1.214154601097107,
511
- "rewards/margins": 2.114192008972168,
512
- "rewards/rejected": -3.3283467292785645,
 
513
  "step": 350
514
  },
515
  {
516
- "epoch": 0.89,
517
- "learning_rate": 1.861941317991664e-08,
518
- "logits/chosen": -2.298218011856079,
519
- "logits/rejected": -2.2111034393310547,
520
- "logps/chosen": -522.8499755859375,
521
- "logps/rejected": -448.1224060058594,
522
- "loss": 0.0456,
523
- "rewards/accuracies": 0.7875000238418579,
524
- "rewards/chosen": -1.3821567296981812,
525
- "rewards/margins": 1.8705806732177734,
526
- "rewards/rejected": -3.252737522125244,
 
527
  "step": 360
528
  },
529
  {
530
- "epoch": 0.91,
531
- "learning_rate": 1.13197833728636e-08,
532
- "logits/chosen": -2.302549362182617,
533
- "logits/rejected": -2.2120444774627686,
534
- "logps/chosen": -537.9904174804688,
535
- "logps/rejected": -457.0699768066406,
536
- "loss": 0.0438,
537
- "rewards/accuracies": 0.8187500238418579,
538
- "rewards/chosen": -1.2071287631988525,
539
- "rewards/margins": 2.221251964569092,
540
- "rewards/rejected": -3.4283804893493652,
 
541
  "step": 370
542
  },
543
  {
544
- "epoch": 0.94,
545
- "learning_rate": 5.79697505093521e-09,
546
- "logits/chosen": -2.279075860977173,
547
- "logits/rejected": -2.203881025314331,
548
- "logps/chosen": -499.07574462890625,
549
- "logps/rejected": -420.45391845703125,
550
- "loss": 0.0403,
551
- "rewards/accuracies": 0.8187500238418579,
552
- "rewards/chosen": -1.144665241241455,
553
- "rewards/margins": 2.001462459564209,
554
- "rewards/rejected": -3.146127939224243,
 
555
  "step": 380
556
  },
557
  {
558
- "epoch": 0.96,
559
- "learning_rate": 2.092101988131256e-09,
560
- "logits/chosen": -2.362111806869507,
561
- "logits/rejected": -2.266643524169922,
562
- "logps/chosen": -548.8084716796875,
563
- "logps/rejected": -460.02392578125,
564
- "loss": 0.0407,
565
- "rewards/accuracies": 0.800000011920929,
566
- "rewards/chosen": -1.122992753982544,
567
- "rewards/margins": 2.259247064590454,
568
- "rewards/rejected": -3.382239818572998,
 
569
  "step": 390
570
  },
571
  {
572
- "epoch": 0.99,
573
- "learning_rate": 2.327445937151673e-10,
574
- "logits/chosen": -2.2871899604797363,
575
- "logits/rejected": -2.2090656757354736,
576
- "logps/chosen": -536.6136474609375,
577
- "logps/rejected": -451.55950927734375,
578
- "loss": 0.0506,
579
- "rewards/accuracies": 0.8374999761581421,
580
- "rewards/chosen": -1.1807712316513062,
581
- "rewards/margins": 2.100186824798584,
582
- "rewards/rejected": -3.2809581756591797,
 
583
  "step": 400
584
  },
585
  {
586
- "epoch": 1.0,
587
- "step": 405,
 
 
588
  "total_flos": 0.0,
589
- "train_loss": 0.0972615369308142,
590
- "train_runtime": 3208.985,
591
- "train_samples_per_second": 16.171,
592
- "train_steps_per_second": 0.126
593
  }
594
  ],
595
  "logging_steps": 10,
596
- "max_steps": 405,
 
597
  "num_train_epochs": 1,
598
- "save_steps": 1000,
 
599
  "total_flos": 0.0,
 
600
  "trial_name": null,
601
  "trial_params": null
602
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9998009950248756,
5
+ "eval_steps": 100,
6
+ "global_step": 1256,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0007960199004975124,
13
+ "grad_norm": 4.987965979355318,
14
+ "learning_rate": 3.968253968253968e-09,
15
+ "logits/chosen": -2.866555690765381,
16
+ "logits/rejected": -2.8678386211395264,
17
+ "logps/chosen": -150.79409790039062,
18
+ "logps/rejected": -167.60751342773438,
19
+ "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.007960199004975124,
28
+ "grad_norm": 5.158661804258714,
29
+ "learning_rate": 3.968253968253968e-08,
30
+ "logits/chosen": -2.905927896499634,
31
+ "logits/rejected": -2.877537250518799,
32
+ "logps/chosen": -169.23851013183594,
33
+ "logps/rejected": -156.21502685546875,
34
+ "loss": 0.6932,
35
+ "rewards/accuracies": 0.4444444477558136,
36
+ "rewards/chosen": 0.00022146817354951054,
37
+ "rewards/margins": 7.194941281341016e-05,
38
+ "rewards/rejected": 0.00014951876073610038,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.015920398009950248,
43
+ "grad_norm": 4.941389073277953,
44
+ "learning_rate": 7.936507936507936e-08,
45
+ "logits/chosen": -2.8461222648620605,
46
+ "logits/rejected": -2.8314127922058105,
47
+ "logps/chosen": -143.2448272705078,
48
+ "logps/rejected": -139.14114379882812,
49
+ "loss": 0.6932,
50
+ "rewards/accuracies": 0.48750001192092896,
51
+ "rewards/chosen": -3.118843596894294e-05,
52
+ "rewards/margins": -0.00014748479588888586,
53
+ "rewards/rejected": 0.00011629634536802769,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.023880597014925373,
58
+ "grad_norm": 4.789407789219409,
59
+ "learning_rate": 1.1904761904761903e-07,
60
+ "logits/chosen": -2.8516290187835693,
61
+ "logits/rejected": -2.8340375423431396,
62
+ "logps/chosen": -133.89068603515625,
63
+ "logps/rejected": -137.05783081054688,
64
+ "loss": 0.6931,
65
+ "rewards/accuracies": 0.5375000238418579,
66
+ "rewards/chosen": 0.00035750039387494326,
67
+ "rewards/margins": 0.00034277202212251723,
68
+ "rewards/rejected": 1.4728342648595572e-05,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.031840796019900496,
73
+ "grad_norm": 5.0684575947852935,
74
+ "learning_rate": 1.5873015873015872e-07,
75
+ "logits/chosen": -2.8869121074676514,
76
+ "logits/rejected": -2.8650405406951904,
77
+ "logps/chosen": -149.87741088867188,
78
+ "logps/rejected": -154.5808563232422,
79
+ "loss": 0.6928,
80
+ "rewards/accuracies": 0.5562499761581421,
81
+ "rewards/chosen": 0.0008229534141719341,
82
+ "rewards/margins": 0.0007549519650638103,
83
+ "rewards/rejected": 6.800144910812378e-05,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.03980099502487562,
88
+ "grad_norm": 4.886701576208831,
89
+ "learning_rate": 1.984126984126984e-07,
90
+ "logits/chosen": -2.8915724754333496,
91
+ "logits/rejected": -2.8401710987091064,
92
+ "logps/chosen": -153.5601348876953,
93
+ "logps/rejected": -127.82198333740234,
94
+ "loss": 0.6923,
95
+ "rewards/accuracies": 0.5562499761581421,
96
+ "rewards/chosen": -0.00028051590197719634,
97
+ "rewards/margins": 0.0019810814410448074,
98
+ "rewards/rejected": -0.002261597430333495,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.04776119402985075,
103
+ "grad_norm": 4.80539942483475,
104
+ "learning_rate": 2.3809523809523806e-07,
105
+ "logits/chosen": -2.8685171604156494,
106
+ "logits/rejected": -2.885859966278076,
107
+ "logps/chosen": -138.50997924804688,
108
+ "logps/rejected": -153.02711486816406,
109
+ "loss": 0.6913,
110
+ "rewards/accuracies": 0.625,
111
+ "rewards/chosen": 0.0011361455544829369,
112
+ "rewards/margins": 0.003960339818149805,
113
+ "rewards/rejected": -0.002824194496497512,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.05572139303482587,
118
+ "grad_norm": 4.993634749933866,
119
+ "learning_rate": 2.7777777777777776e-07,
120
+ "logits/chosen": -2.8937714099884033,
121
+ "logits/rejected": -2.8720123767852783,
122
+ "logps/chosen": -140.58717346191406,
123
+ "logps/rejected": -159.0962677001953,
124
+ "loss": 0.6893,
125
+ "rewards/accuracies": 0.612500011920929,
126
+ "rewards/chosen": -0.006692352704703808,
127
+ "rewards/margins": 0.008710912428796291,
128
+ "rewards/rejected": -0.015403266064822674,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.06368159203980099,
133
+ "grad_norm": 5.018829367878226,
134
+ "learning_rate": 3.1746031746031743e-07,
135
+ "logits/chosen": -2.8606536388397217,
136
+ "logits/rejected": -2.8511064052581787,
137
+ "logps/chosen": -143.28701782226562,
138
+ "logps/rejected": -143.73817443847656,
139
+ "loss": 0.6878,
140
+ "rewards/accuracies": 0.6000000238418579,
141
+ "rewards/chosen": -0.019417399540543556,
142
+ "rewards/margins": 0.015563729219138622,
143
+ "rewards/rejected": -0.034981124103069305,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.07164179104477612,
148
+ "grad_norm": 5.3232578633563445,
149
+ "learning_rate": 3.5714285714285716e-07,
150
+ "logits/chosen": -2.8441901206970215,
151
+ "logits/rejected": -2.841014862060547,
152
+ "logps/chosen": -154.00717163085938,
153
+ "logps/rejected": -169.87318420410156,
154
+ "loss": 0.683,
155
+ "rewards/accuracies": 0.6187499761581421,
156
+ "rewards/chosen": -0.03319092467427254,
157
+ "rewards/margins": 0.028958678245544434,
158
+ "rewards/rejected": -0.06214960291981697,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.07960199004975124,
163
+ "grad_norm": 5.9751016265015515,
164
+ "learning_rate": 3.968253968253968e-07,
165
+ "logits/chosen": -2.8674731254577637,
166
+ "logits/rejected": -2.819610834121704,
167
+ "logps/chosen": -168.8125,
168
+ "logps/rejected": -154.88980102539062,
169
+ "loss": 0.6789,
170
+ "rewards/accuracies": 0.606249988079071,
171
+ "rewards/chosen": -0.09129344671964645,
172
+ "rewards/margins": 0.039778389036655426,
173
+ "rewards/rejected": -0.13107183575630188,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.08756218905472637,
178
+ "grad_norm": 7.043583746287595,
179
+ "learning_rate": 4.365079365079365e-07,
180
+ "logits/chosen": -2.8453006744384766,
181
+ "logits/rejected": -2.8008811473846436,
182
+ "logps/chosen": -166.57568359375,
183
+ "logps/rejected": -160.43548583984375,
184
+ "loss": 0.6753,
185
+ "rewards/accuracies": 0.6000000238418579,
186
+ "rewards/chosen": -0.1897437423467636,
187
+ "rewards/margins": 0.05553770065307617,
188
+ "rewards/rejected": -0.2452814280986786,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.0955223880597015,
193
+ "grad_norm": 8.93236047182562,
194
+ "learning_rate": 4.761904761904761e-07,
195
+ "logits/chosen": -2.861156940460205,
196
+ "logits/rejected": -2.8308346271514893,
197
+ "logps/chosen": -169.21116638183594,
198
+ "logps/rejected": -174.7342071533203,
199
+ "loss": 0.6668,
200
+ "rewards/accuracies": 0.637499988079071,
201
+ "rewards/chosen": -0.2347549945116043,
202
+ "rewards/margins": 0.07360492646694183,
203
+ "rewards/rejected": -0.30835992097854614,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.10348258706467661,
208
+ "grad_norm": 12.11276981143796,
209
+ "learning_rate": 4.999845414634076e-07,
210
+ "logits/chosen": -2.8432881832122803,
211
+ "logits/rejected": -2.8032021522521973,
212
+ "logps/chosen": -176.11865234375,
213
+ "logps/rejected": -188.36691284179688,
214
+ "loss": 0.644,
215
+ "rewards/accuracies": 0.71875,
216
+ "rewards/chosen": -0.360163152217865,
217
+ "rewards/margins": 0.17286832630634308,
218
+ "rewards/rejected": -0.5330314636230469,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.11144278606965174,
223
+ "grad_norm": 14.823561517874804,
224
+ "learning_rate": 4.998106548810311e-07,
225
+ "logits/chosen": -2.8333194255828857,
226
+ "logits/rejected": -2.786397695541382,
227
+ "logps/chosen": -193.32913208007812,
228
+ "logps/rejected": -193.43296813964844,
229
+ "loss": 0.639,
230
+ "rewards/accuracies": 0.6625000238418579,
231
+ "rewards/chosen": -0.42469319701194763,
232
+ "rewards/margins": 0.16398563981056213,
233
+ "rewards/rejected": -0.5886788964271545,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.11940298507462686,
238
+ "grad_norm": 18.490476889353154,
239
+ "learning_rate": 4.994436933879359e-07,
240
+ "logits/chosen": -2.742354393005371,
241
+ "logits/rejected": -2.6894731521606445,
242
+ "logps/chosen": -206.05099487304688,
243
+ "logps/rejected": -226.1792449951172,
244
+ "loss": 0.6134,
245
+ "rewards/accuracies": 0.7250000238418579,
246
+ "rewards/chosen": -0.5303625464439392,
247
+ "rewards/margins": 0.2593725323677063,
248
+ "rewards/rejected": -0.7897351384162903,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.12736318407960198,
253
+ "grad_norm": 16.59996040063754,
254
+ "learning_rate": 4.988839406031596e-07,
255
+ "logits/chosen": -2.64296555519104,
256
+ "logits/rejected": -2.6306753158569336,
257
+ "logps/chosen": -180.86181640625,
258
+ "logps/rejected": -229.1710968017578,
259
+ "loss": 0.6168,
260
+ "rewards/accuracies": 0.6937500238418579,
261
+ "rewards/chosen": -0.510300874710083,
262
+ "rewards/margins": 0.35416826605796814,
263
+ "rewards/rejected": -0.8644691705703735,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 0.13532338308457711,
268
+ "grad_norm": 12.427120919609093,
269
+ "learning_rate": 4.981318291512395e-07,
270
+ "logits/chosen": -2.585261821746826,
271
+ "logits/rejected": -2.602097988128662,
272
+ "logps/chosen": -192.22706604003906,
273
+ "logps/rejected": -224.95028686523438,
274
+ "loss": 0.6289,
275
+ "rewards/accuracies": 0.6187499761581421,
276
+ "rewards/chosen": -0.5405030250549316,
277
+ "rewards/margins": 0.21325179934501648,
278
+ "rewards/rejected": -0.7537548542022705,
279
  "step": 170
280
  },
281
  {
282
+ "epoch": 0.14328358208955225,
283
+ "grad_norm": 15.900669280547154,
284
+ "learning_rate": 4.971879403278432e-07,
285
+ "logits/chosen": -2.5604166984558105,
286
+ "logits/rejected": -2.5091071128845215,
287
+ "logps/chosen": -214.48095703125,
288
+ "logps/rejected": -222.5020751953125,
289
+ "loss": 0.6208,
290
+ "rewards/accuracies": 0.625,
291
+ "rewards/chosen": -0.6959569454193115,
292
+ "rewards/margins": 0.1984298974275589,
293
+ "rewards/rejected": -0.894386887550354,
294
  "step": 180
295
  },
296
  {
297
+ "epoch": 0.15124378109452735,
298
+ "grad_norm": 21.336526790653657,
299
+ "learning_rate": 4.960530036504941e-07,
300
+ "logits/chosen": -2.511948823928833,
301
+ "logits/rejected": -2.4782981872558594,
302
+ "logps/chosen": -247.1105499267578,
303
+ "logps/rejected": -259.957763671875,
304
+ "loss": 0.6225,
305
+ "rewards/accuracies": 0.59375,
306
+ "rewards/chosen": -0.9646499752998352,
307
+ "rewards/margins": 0.24105295538902283,
308
+ "rewards/rejected": -1.2057029008865356,
309
  "step": 190
310
  },
311
  {
312
+ "epoch": 0.15920398009950248,
313
+ "grad_norm": 14.558771925056739,
314
+ "learning_rate": 4.947278962947386e-07,
315
+ "logits/chosen": -2.4865562915802,
316
+ "logits/rejected": -2.4496474266052246,
317
+ "logps/chosen": -247.4215850830078,
318
+ "logps/rejected": -294.28839111328125,
319
+ "loss": 0.5863,
320
+ "rewards/accuracies": 0.7250000238418579,
321
+ "rewards/chosen": -1.000279188156128,
322
+ "rewards/margins": 0.5100765228271484,
323
+ "rewards/rejected": -1.5103557109832764,
324
  "step": 200
325
  },
326
  {
327
+ "epoch": 0.16716417910447762,
328
+ "grad_norm": 18.09768702444502,
329
+ "learning_rate": 4.932136424161899e-07,
330
+ "logits/chosen": -2.524082899093628,
331
+ "logits/rejected": -2.460231304168701,
332
+ "logps/chosen": -236.9945526123047,
333
+ "logps/rejected": -280.2134704589844,
334
+ "loss": 0.5892,
335
+ "rewards/accuracies": 0.6875,
336
+ "rewards/chosen": -0.9642118215560913,
337
+ "rewards/margins": 0.4870384633541107,
338
+ "rewards/rejected": -1.4512503147125244,
339
  "step": 210
340
  },
341
  {
342
+ "epoch": 0.17512437810945275,
343
+ "grad_norm": 15.582712307283431,
344
+ "learning_rate": 4.915114123589732e-07,
345
+ "logits/chosen": -2.4720115661621094,
346
+ "logits/rejected": -2.4532742500305176,
347
+ "logps/chosen": -277.3113708496094,
348
+ "logps/rejected": -346.95941162109375,
349
+ "loss": 0.6028,
350
+ "rewards/accuracies": 0.7124999761581421,
351
+ "rewards/chosen": -1.3337619304656982,
352
+ "rewards/margins": 0.5726526379585266,
353
+ "rewards/rejected": -1.9064146280288696,
354
  "step": 220
355
  },
356
  {
357
+ "epoch": 0.18308457711442785,
358
+ "grad_norm": 23.55648961804369,
359
+ "learning_rate": 4.896225217511849e-07,
360
+ "logits/chosen": -2.4816207885742188,
361
+ "logits/rejected": -2.427955389022827,
362
+ "logps/chosen": -294.205810546875,
363
+ "logps/rejected": -344.0794372558594,
364
+ "loss": 0.5888,
365
+ "rewards/accuracies": 0.675000011920929,
366
+ "rewards/chosen": -1.3327975273132324,
367
+ "rewards/margins": 0.5613763332366943,
368
+ "rewards/rejected": -1.8941738605499268,
369
  "step": 230
370
  },
371
  {
372
+ "epoch": 0.191044776119403,
373
+ "grad_norm": 13.953969214382605,
374
+ "learning_rate": 4.875484304880629e-07,
375
+ "logits/chosen": -2.4548521041870117,
376
+ "logits/rejected": -2.370633602142334,
377
+ "logps/chosen": -271.3977355957031,
378
+ "logps/rejected": -332.35137939453125,
379
+ "loss": 0.5803,
380
+ "rewards/accuracies": 0.737500011920929,
381
+ "rewards/chosen": -1.2848864793777466,
382
+ "rewards/margins": 0.5692149996757507,
383
+ "rewards/rejected": -1.854101538658142,
384
  "step": 240
385
  },
386
  {
387
+ "epoch": 0.19900497512437812,
388
+ "grad_norm": 18.518974677225064,
389
+ "learning_rate": 4.852907416036558e-07,
390
+ "logits/chosen": -2.3934974670410156,
391
+ "logits/rejected": -2.312051296234131,
392
+ "logps/chosen": -282.7490234375,
393
+ "logps/rejected": -325.5188293457031,
394
+ "loss": 0.5885,
395
+ "rewards/accuracies": 0.706250011920929,
396
+ "rewards/chosen": -1.3080084323883057,
397
+ "rewards/margins": 0.444119393825531,
398
+ "rewards/rejected": -1.7521278858184814,
399
  "step": 250
400
  },
401
  {
402
+ "epoch": 0.20696517412935322,
403
+ "grad_norm": 16.421871283301197,
404
+ "learning_rate": 4.828512000318616e-07,
405
+ "logits/chosen": -1.9985812902450562,
406
+ "logits/rejected": -1.7946439981460571,
407
+ "logps/chosen": -351.16864013671875,
408
+ "logps/rejected": -392.8179931640625,
409
+ "loss": 0.5685,
410
+ "rewards/accuracies": 0.65625,
411
+ "rewards/chosen": -2.019517660140991,
412
+ "rewards/margins": 0.4194945693016052,
413
+ "rewards/rejected": -2.439012050628662,
414
  "step": 260
415
  },
416
  {
417
+ "epoch": 0.21492537313432836,
418
+ "grad_norm": 14.628264234076205,
419
+ "learning_rate": 4.802316912577946e-07,
420
+ "logits/chosen": -2.0842502117156982,
421
+ "logits/rejected": -1.9886341094970703,
422
+ "logps/chosen": -346.19921875,
423
+ "logps/rejected": -366.7377624511719,
424
+ "loss": 0.5884,
425
+ "rewards/accuracies": 0.637499988079071,
426
+ "rewards/chosen": -2.0177090167999268,
427
+ "rewards/margins": 0.32493501901626587,
428
+ "rewards/rejected": -2.342643976211548,
429
  "step": 270
430
  },
431
  {
432
+ "epoch": 0.2228855721393035,
433
+ "grad_norm": 18.526048621847632,
434
+ "learning_rate": 4.774342398605221e-07,
435
+ "logits/chosen": -2.015749216079712,
436
+ "logits/rejected": -1.818052053451538,
437
+ "logps/chosen": -360.9788513183594,
438
+ "logps/rejected": -402.9951171875,
439
+ "loss": 0.5688,
440
+ "rewards/accuracies": 0.675000011920929,
441
+ "rewards/chosen": -2.1356658935546875,
442
+ "rewards/margins": 0.5189321637153625,
443
+ "rewards/rejected": -2.654597759246826,
444
  "step": 280
445
  },
446
  {
447
+ "epoch": 0.2308457711442786,
448
+ "grad_norm": 18.9693122793002,
449
+ "learning_rate": 4.744610079482978e-07,
450
+ "logits/chosen": -2.1531434059143066,
451
+ "logits/rejected": -1.9195207357406616,
452
+ "logps/chosen": -355.23626708984375,
453
+ "logps/rejected": -396.5318603515625,
454
+ "loss": 0.596,
455
+ "rewards/accuracies": 0.637499988079071,
456
+ "rewards/chosen": -2.032478094100952,
457
+ "rewards/margins": 0.41044849157333374,
458
+ "rewards/rejected": -2.4429266452789307,
459
  "step": 290
460
  },
461
  {
462
+ "epoch": 0.23880597014925373,
463
+ "grad_norm": 14.860387982197683,
464
+ "learning_rate": 4.713142934875005e-07,
465
+ "logits/chosen": -2.4048914909362793,
466
+ "logits/rejected": -2.098559856414795,
467
+ "logps/chosen": -306.0177917480469,
468
+ "logps/rejected": -335.77099609375,
469
+ "loss": 0.582,
470
+ "rewards/accuracies": 0.699999988079071,
471
+ "rewards/chosen": -1.319157361984253,
472
+ "rewards/margins": 0.5222987532615662,
473
+ "rewards/rejected": -1.841456413269043,
474
  "step": 300
475
  },
476
  {
477
+ "epoch": 0.24676616915422886,
478
+ "grad_norm": 17.94094060671055,
479
+ "learning_rate": 4.679965285265706e-07,
480
+ "logits/chosen": -2.0477898120880127,
481
+ "logits/rejected": -1.9392799139022827,
482
+ "logps/chosen": -276.44622802734375,
483
+ "logps/rejected": -320.6493835449219,
484
+ "loss": 0.5616,
485
+ "rewards/accuracies": 0.6187499761581421,
486
+ "rewards/chosen": -1.4027130603790283,
487
+ "rewards/margins": 0.39033347368240356,
488
+ "rewards/rejected": -1.7930463552474976,
489
  "step": 310
490
  },
491
  {
492
+ "epoch": 0.25472636815920396,
493
+ "grad_norm": 20.377060113211176,
494
+ "learning_rate": 4.64510277316316e-07,
495
+ "logits/chosen": -1.7731729745864868,
496
+ "logits/rejected": -1.263430118560791,
497
+ "logps/chosen": -373.1016845703125,
498
+ "logps/rejected": -431.9881896972656,
499
+ "loss": 0.5443,
500
+ "rewards/accuracies": 0.78125,
501
+ "rewards/chosen": -2.0920746326446533,
502
+ "rewards/margins": 0.7201870083808899,
503
+ "rewards/rejected": -2.8122615814208984,
504
  "step": 320
505
  },
506
  {
507
+ "epoch": 0.2626865671641791,
508
+ "grad_norm": 19.309940699402812,
509
+ "learning_rate": 4.6085823432804137e-07,
510
+ "logits/chosen": -1.9024795293807983,
511
+ "logits/rejected": -1.569469928741455,
512
+ "logps/chosen": -355.3124084472656,
513
+ "logps/rejected": -419.6775817871094,
514
+ "loss": 0.5793,
515
+ "rewards/accuracies": 0.699999988079071,
516
+ "rewards/chosen": -1.87880539894104,
517
+ "rewards/margins": 0.6133967638015747,
518
+ "rewards/rejected": -2.4922022819519043,
519
  "step": 330
520
  },
521
  {
522
+ "epoch": 0.27064676616915423,
523
+ "grad_norm": 17.446855339300225,
524
+ "learning_rate": 4.570432221710314e-07,
525
+ "logits/chosen": -2.1071066856384277,
526
+ "logits/rejected": -2.065721273422241,
527
+ "logps/chosen": -280.3807373046875,
528
+ "logps/rejected": -328.475830078125,
529
+ "loss": 0.5779,
530
+ "rewards/accuracies": 0.65625,
531
+ "rewards/chosen": -1.4469754695892334,
532
+ "rewards/margins": 0.44690531492233276,
533
+ "rewards/rejected": -1.893880844116211,
534
  "step": 340
535
  },
536
  {
537
+ "epoch": 0.27860696517412936,
538
+ "grad_norm": 15.474382457829194,
539
+ "learning_rate": 4.5306818941099866e-07,
540
+ "logits/chosen": -2.2009084224700928,
541
+ "logits/rejected": -2.050603151321411,
542
+ "logps/chosen": -296.90423583984375,
543
+ "logps/rejected": -336.07818603515625,
544
+ "loss": 0.597,
545
+ "rewards/accuracies": 0.6812499761581421,
546
+ "rewards/chosen": -1.5415115356445312,
547
+ "rewards/margins": 0.42944687604904175,
548
+ "rewards/rejected": -1.9709584712982178,
549
  "step": 350
550
  },
551
  {
552
+ "epoch": 0.2865671641791045,
553
+ "grad_norm": 16.92609001492546,
554
+ "learning_rate": 4.4893620829118124e-07,
555
+ "logits/chosen": -2.1950016021728516,
556
+ "logits/rejected": -2.0361106395721436,
557
+ "logps/chosen": -296.43865966796875,
558
+ "logps/rejected": -364.69696044921875,
559
+ "loss": 0.5517,
560
+ "rewards/accuracies": 0.7124999761581421,
561
+ "rewards/chosen": -1.6347688436508179,
562
+ "rewards/margins": 0.5980359315872192,
563
+ "rewards/rejected": -2.232804775238037,
564
  "step": 360
565
  },
566
  {
567
+ "epoch": 0.2945273631840796,
568
+ "grad_norm": 19.211553848388508,
569
+ "learning_rate": 4.4465047235785185e-07,
570
+ "logits/chosen": -1.9282668828964233,
571
+ "logits/rejected": -1.6367496252059937,
572
+ "logps/chosen": -351.134033203125,
573
+ "logps/rejected": -394.2359313964844,
574
+ "loss": 0.5659,
575
+ "rewards/accuracies": 0.65625,
576
+ "rewards/chosen": -1.9457461833953857,
577
+ "rewards/margins": 0.6128097772598267,
578
+ "rewards/rejected": -2.558556079864502,
579
  "step": 370
580
  },
581
  {
582
+ "epoch": 0.3024875621890547,
583
+ "grad_norm": 16.77173991467865,
584
+ "learning_rate": 4.40214293992074e-07,
585
+ "logits/chosen": -1.747044563293457,
586
+ "logits/rejected": -1.6618773937225342,
587
+ "logps/chosen": -350.59466552734375,
588
+ "logps/rejected": -409.42901611328125,
589
+ "loss": 0.5571,
590
+ "rewards/accuracies": 0.6499999761581421,
591
+ "rewards/chosen": -2.0240893363952637,
592
+ "rewards/margins": 0.44821667671203613,
593
+ "rewards/rejected": -2.4723057746887207,
594
  "step": 380
595
  },
596
  {
597
+ "epoch": 0.31044776119402984,
598
+ "grad_norm": 18.040794768493605,
599
+ "learning_rate": 4.3563110184961234e-07,
600
+ "logits/chosen": -1.9076989889144897,
601
+ "logits/rejected": -1.5887449979782104,
602
+ "logps/chosen": -332.4600524902344,
603
+ "logps/rejected": -381.29803466796875,
604
+ "loss": 0.5534,
605
+ "rewards/accuracies": 0.668749988079071,
606
+ "rewards/chosen": -1.8599789142608643,
607
+ "rewards/margins": 0.4636203348636627,
608
+ "rewards/rejected": -2.323599338531494,
609
  "step": 390
610
  },
611
  {
612
+ "epoch": 0.31840796019900497,
613
+ "grad_norm": 18.863620586743814,
614
+ "learning_rate": 4.3090443821097566e-07,
615
+ "logits/chosen": -1.803070306777954,
616
+ "logits/rejected": -1.5535482168197632,
617
+ "logps/chosen": -308.82965087890625,
618
+ "logps/rejected": -382.87445068359375,
619
+ "loss": 0.5601,
620
+ "rewards/accuracies": 0.6937500238418579,
621
+ "rewards/chosen": -1.603441596031189,
622
+ "rewards/margins": 0.7121911644935608,
623
+ "rewards/rejected": -2.3156330585479736,
624
  "step": 400
625
  },
626
  {
627
+ "epoch": 0.3263681592039801,
628
+ "grad_norm": 13.883572516302484,
629
+ "learning_rate": 4.2603795624364195e-07,
630
+ "logits/chosen": -1.9973773956298828,
631
+ "logits/rejected": -1.4498814344406128,
632
+ "logps/chosen": -357.13763427734375,
633
+ "logps/rejected": -393.8042907714844,
634
+ "loss": 0.5601,
635
+ "rewards/accuracies": 0.737500011920929,
636
+ "rewards/chosen": -2.0093302726745605,
637
+ "rewards/margins": 0.6177257299423218,
638
+ "rewards/rejected": -2.6270556449890137,
639
+ "step": 410
640
+ },
641
+ {
642
+ "epoch": 0.33432835820895523,
643
+ "grad_norm": 16.354917835389568,
644
+ "learning_rate": 4.210354171785795e-07,
645
+ "logits/chosen": -1.7048423290252686,
646
+ "logits/rejected": -1.520613670349121,
647
+ "logps/chosen": -350.70355224609375,
648
+ "logps/rejected": -406.6918029785156,
649
+ "loss": 0.5501,
650
+ "rewards/accuracies": 0.6499999761581421,
651
+ "rewards/chosen": -2.1793723106384277,
652
+ "rewards/margins": 0.4814273416996002,
653
+ "rewards/rejected": -2.660799741744995,
654
+ "step": 420
655
+ },
656
+ {
657
+ "epoch": 0.34228855721393037,
658
+ "grad_norm": 15.11823770980584,
659
+ "learning_rate": 4.15900687403248e-07,
660
+ "logits/chosen": -1.8165044784545898,
661
+ "logits/rejected": -1.512001872062683,
662
+ "logps/chosen": -351.67706298828125,
663
+ "logps/rejected": -398.6982116699219,
664
+ "loss": 0.5579,
665
+ "rewards/accuracies": 0.6812499761581421,
666
+ "rewards/chosen": -1.9194176197052002,
667
+ "rewards/margins": 0.5377460718154907,
668
+ "rewards/rejected": -2.4571633338928223,
669
+ "step": 430
670
+ },
671
+ {
672
+ "epoch": 0.3502487562189055,
673
+ "grad_norm": 14.824412259935942,
674
+ "learning_rate": 4.1063773547332584e-07,
675
+ "logits/chosen": -2.048107862472534,
676
+ "logits/rejected": -1.8060747385025024,
677
+ "logps/chosen": -311.2574462890625,
678
+ "logps/rejected": -372.2284240722656,
679
+ "loss": 0.5602,
680
+ "rewards/accuracies": 0.706250011920929,
681
+ "rewards/chosen": -1.580950379371643,
682
+ "rewards/margins": 0.6271029114723206,
683
+ "rewards/rejected": -2.2080533504486084,
684
+ "step": 440
685
+ },
686
+ {
687
+ "epoch": 0.3582089552238806,
688
+ "grad_norm": 17.525653455798285,
689
+ "learning_rate": 4.0525062904547276e-07,
690
+ "logits/chosen": -1.958742380142212,
691
+ "logits/rejected": -1.3642756938934326,
692
+ "logps/chosen": -334.4582824707031,
693
+ "logps/rejected": -390.8046875,
694
+ "loss": 0.5462,
695
+ "rewards/accuracies": 0.7562500238418579,
696
+ "rewards/chosen": -1.877528429031372,
697
+ "rewards/margins": 0.6928645372390747,
698
+ "rewards/rejected": -2.5703933238983154,
699
+ "step": 450
700
+ },
701
+ {
702
+ "epoch": 0.3661691542288557,
703
+ "grad_norm": 16.688451846975745,
704
+ "learning_rate": 3.997435317334988e-07,
705
+ "logits/chosen": -1.9058139324188232,
706
+ "logits/rejected": -1.5897517204284668,
707
+ "logps/chosen": -351.8033752441406,
708
+ "logps/rejected": -405.9495544433594,
709
+ "loss": 0.5562,
710
+ "rewards/accuracies": 0.7124999761581421,
711
+ "rewards/chosen": -1.860612154006958,
712
+ "rewards/margins": 0.639590859413147,
713
+ "rewards/rejected": -2.5002028942108154,
714
+ "step": 460
715
+ },
716
+ {
717
+ "epoch": 0.37412935323383084,
718
+ "grad_norm": 16.36630647240935,
719
+ "learning_rate": 3.941206998903701e-07,
720
+ "logits/chosen": -1.9422454833984375,
721
+ "logits/rejected": -1.6349403858184814,
722
+ "logps/chosen": -327.41119384765625,
723
+ "logps/rejected": -375.36102294921875,
724
+ "loss": 0.5597,
725
+ "rewards/accuracies": 0.7124999761581421,
726
+ "rewards/chosen": -1.7558162212371826,
727
+ "rewards/margins": 0.5030372738838196,
728
+ "rewards/rejected": -2.2588534355163574,
729
+ "step": 470
730
+ },
731
+ {
732
+ "epoch": 0.382089552238806,
733
+ "grad_norm": 18.336351088486346,
734
+ "learning_rate": 3.8838647931853684e-07,
735
+ "logits/chosen": -1.7614314556121826,
736
+ "logits/rejected": -1.5050185918807983,
737
+ "logps/chosen": -329.1768798828125,
738
+ "logps/rejected": -407.611328125,
739
+ "loss": 0.546,
740
+ "rewards/accuracies": 0.71875,
741
+ "rewards/chosen": -1.9475609064102173,
742
+ "rewards/margins": 0.7653089761734009,
743
+ "rewards/rejected": -2.712869882583618,
744
+ "step": 480
745
+ },
746
+ {
747
+ "epoch": 0.3900497512437811,
748
+ "grad_norm": 14.526089578492307,
749
+ "learning_rate": 3.825453019111281e-07,
750
+ "logits/chosen": -1.854347825050354,
751
+ "logits/rejected": -1.477128028869629,
752
+ "logps/chosen": -344.77587890625,
753
+ "logps/rejected": -422.0589294433594,
754
+ "loss": 0.5412,
755
+ "rewards/accuracies": 0.7749999761581421,
756
+ "rewards/chosen": -1.9149529933929443,
757
+ "rewards/margins": 0.7679746747016907,
758
+ "rewards/rejected": -2.6829276084899902,
759
+ "step": 490
760
+ },
761
+ {
762
+ "epoch": 0.39800995024875624,
763
+ "grad_norm": 21.279411516585405,
764
+ "learning_rate": 3.7660168222660824e-07,
765
+ "logits/chosen": -1.9854710102081299,
766
+ "logits/rejected": -1.706412672996521,
767
+ "logps/chosen": -314.8876953125,
768
+ "logps/rejected": -394.66925048828125,
769
+ "loss": 0.5304,
770
+ "rewards/accuracies": 0.7562500238418579,
771
+ "rewards/chosen": -1.6202218532562256,
772
+ "rewards/margins": 0.7936272621154785,
773
+ "rewards/rejected": -2.413848876953125,
774
+ "step": 500
775
+ },
776
+ {
777
+ "epoch": 0.4059701492537313,
778
+ "grad_norm": 18.371143169851667,
779
+ "learning_rate": 3.705602139995416e-07,
780
+ "logits/chosen": -2.0707178115844727,
781
+ "logits/rejected": -1.6669524908065796,
782
+ "logps/chosen": -296.0068359375,
783
+ "logps/rejected": -350.65704345703125,
784
+ "loss": 0.5757,
785
+ "rewards/accuracies": 0.737500011920929,
786
+ "rewards/chosen": -1.597426414489746,
787
+ "rewards/margins": 0.5844928026199341,
788
+ "rewards/rejected": -2.1819193363189697,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 0.41393034825870645,
793
+ "grad_norm": 13.765263633877712,
794
+ "learning_rate": 3.6442556659016475e-07,
795
+ "logits/chosen": -2.2126479148864746,
796
+ "logits/rejected": -1.9664087295532227,
797
+ "logps/chosen": -298.197509765625,
798
+ "logps/rejected": -359.9577331542969,
799
+ "loss": 0.5426,
800
+ "rewards/accuracies": 0.7124999761581421,
801
+ "rewards/chosen": -1.5804378986358643,
802
+ "rewards/margins": 0.6014927625656128,
803
+ "rewards/rejected": -2.1819307804107666,
804
+ "step": 520
805
+ },
806
+ {
807
+ "epoch": 0.4218905472636816,
808
+ "grad_norm": 18.995220185419754,
809
+ "learning_rate": 3.582024813755076e-07,
810
+ "logits/chosen": -1.9249904155731201,
811
+ "logits/rejected": -1.5814507007598877,
812
+ "logps/chosen": -343.10919189453125,
813
+ "logps/rejected": -379.06683349609375,
814
+ "loss": 0.5501,
815
+ "rewards/accuracies": 0.6812499761581421,
816
+ "rewards/chosen": -1.8100788593292236,
817
+ "rewards/margins": 0.5825816988945007,
818
+ "rewards/rejected": -2.392660617828369,
819
+ "step": 530
820
+ },
821
+ {
822
+ "epoch": 0.4298507462686567,
823
+ "grad_norm": 17.75911940444921,
824
+ "learning_rate": 3.5189576808485404e-07,
825
+ "logits/chosen": -1.8188612461090088,
826
+ "logits/rejected": -1.5995066165924072,
827
+ "logps/chosen": -329.63861083984375,
828
+ "logps/rejected": -399.28631591796875,
829
+ "loss": 0.5567,
830
+ "rewards/accuracies": 0.731249988079071,
831
+ "rewards/chosen": -1.9467417001724243,
832
+ "rewards/margins": 0.6499906778335571,
833
+ "rewards/rejected": -2.5967321395874023,
834
+ "step": 540
835
+ },
836
+ {
837
+ "epoch": 0.43781094527363185,
838
+ "grad_norm": 20.774910187785334,
839
+ "learning_rate": 3.4551030108237433e-07,
840
+ "logits/chosen": -2.104665994644165,
841
+ "logits/rejected": -1.6698726415634155,
842
+ "logps/chosen": -325.890869140625,
843
+ "logps/rejected": -385.27838134765625,
844
+ "loss": 0.5517,
845
+ "rewards/accuracies": 0.7250000238418579,
846
+ "rewards/chosen": -1.6647937297821045,
847
+ "rewards/margins": 0.7040282487869263,
848
+ "rewards/rejected": -2.3688220977783203,
849
+ "step": 550
850
+ },
851
+ {
852
+ "epoch": 0.445771144278607,
853
+ "grad_norm": 20.43210579616367,
854
+ "learning_rate": 3.390510155998023e-07,
855
+ "logits/chosen": -2.038442611694336,
856
+ "logits/rejected": -1.6438343524932861,
857
+ "logps/chosen": -325.3457946777344,
858
+ "logps/rejected": -397.16790771484375,
859
+ "loss": 0.5347,
860
+ "rewards/accuracies": 0.7124999761581421,
861
+ "rewards/chosen": -1.7963489294052124,
862
+ "rewards/margins": 0.6992012858390808,
863
+ "rewards/rejected": -2.4955506324768066,
864
+ "step": 560
865
+ },
866
+ {
867
+ "epoch": 0.4537313432835821,
868
+ "grad_norm": 18.83338554622648,
869
+ "learning_rate": 3.325229039220684e-07,
870
+ "logits/chosen": -2.120060682296753,
871
+ "logits/rejected": -1.6965103149414062,
872
+ "logps/chosen": -326.96197509765625,
873
+ "logps/rejected": -418.3592834472656,
874
+ "loss": 0.5631,
875
+ "rewards/accuracies": 0.78125,
876
+ "rewards/chosen": -1.686519980430603,
877
+ "rewards/margins": 0.8552719354629517,
878
+ "rewards/rejected": -2.541792154312134,
879
+ "step": 570
880
+ },
881
+ {
882
+ "epoch": 0.4616915422885572,
883
+ "grad_norm": 15.616564617549583,
884
+ "learning_rate": 3.2593101152883795e-07,
885
+ "logits/chosen": -2.0598907470703125,
886
+ "logits/rejected": -2.0200095176696777,
887
+ "logps/chosen": -321.26348876953125,
888
+ "logps/rejected": -392.21337890625,
889
+ "loss": 0.5639,
890
+ "rewards/accuracies": 0.668749988079071,
891
+ "rewards/chosen": -1.8532001972198486,
892
+ "rewards/margins": 0.5705356001853943,
893
+ "rewards/rejected": -2.4237358570098877,
894
+ "step": 580
895
+ },
896
+ {
897
+ "epoch": 0.4696517412935323,
898
+ "grad_norm": 15.28956399678925,
899
+ "learning_rate": 3.192804331949349e-07,
900
+ "logits/chosen": -2.078157901763916,
901
+ "logits/rejected": -1.7344363927841187,
902
+ "logps/chosen": -332.84100341796875,
903
+ "logps/rejected": -405.9479064941406,
904
+ "loss": 0.5267,
905
+ "rewards/accuracies": 0.762499988079071,
906
+ "rewards/chosen": -1.8422882556915283,
907
+ "rewards/margins": 0.780554473400116,
908
+ "rewards/rejected": -2.622842788696289,
909
+ "step": 590
910
+ },
911
+ {
912
+ "epoch": 0.47761194029850745,
913
+ "grad_norm": 20.287126580243257,
914
+ "learning_rate": 3.125763090526674e-07,
915
+ "logits/chosen": -1.8083003759384155,
916
+ "logits/rejected": -1.4841382503509521,
917
+ "logps/chosen": -308.1997985839844,
918
+ "logps/rejected": -384.129150390625,
919
+ "loss": 0.5666,
920
+ "rewards/accuracies": 0.668749988079071,
921
+ "rewards/chosen": -1.7298943996429443,
922
+ "rewards/margins": 0.6618615388870239,
923
+ "rewards/rejected": -2.3917558193206787,
924
+ "step": 600
925
+ },
926
+ {
927
+ "epoch": 0.4855721393034826,
928
+ "grad_norm": 15.063154404223198,
929
+ "learning_rate": 3.0582382061909623e-07,
930
+ "logits/chosen": -1.9274730682373047,
931
+ "logits/rejected": -1.7204185724258423,
932
+ "logps/chosen": -333.6025390625,
933
+ "logps/rejected": -400.88446044921875,
934
+ "loss": 0.5437,
935
+ "rewards/accuracies": 0.737500011920929,
936
+ "rewards/chosen": -1.8027238845825195,
937
+ "rewards/margins": 0.5840314626693726,
938
+ "rewards/rejected": -2.3867554664611816,
939
+ "step": 610
940
+ },
941
+ {
942
+ "epoch": 0.4935323383084577,
943
+ "grad_norm": 15.13264360466539,
944
+ "learning_rate": 2.9902818679131775e-07,
945
+ "logits/chosen": -1.9839204549789429,
946
+ "logits/rejected": -1.7049957513809204,
947
+ "logps/chosen": -320.3578796386719,
948
+ "logps/rejected": -402.8902587890625,
949
+ "loss": 0.5565,
950
+ "rewards/accuracies": 0.768750011920929,
951
+ "rewards/chosen": -1.8458640575408936,
952
+ "rewards/margins": 0.7623557448387146,
953
+ "rewards/rejected": -2.608219623565674,
954
+ "step": 620
955
+ },
956
+ {
957
+ "epoch": 0.5014925373134328,
958
+ "grad_norm": 18.241658852083088,
959
+ "learning_rate": 2.921946598128571e-07,
960
+ "logits/chosen": -1.8552188873291016,
961
+ "logits/rejected": -1.6487993001937866,
962
+ "logps/chosen": -320.94561767578125,
963
+ "logps/rejected": -407.82769775390625,
964
+ "loss": 0.5333,
965
+ "rewards/accuracies": 0.675000011920929,
966
+ "rewards/chosen": -1.7461166381835938,
967
+ "rewards/margins": 0.7682201266288757,
968
+ "rewards/rejected": -2.514336585998535,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 0.5094527363184079,
973
+ "grad_norm": 18.48691570398085,
974
+ "learning_rate": 2.8532852121428733e-07,
975
+ "logits/chosen": -1.7485021352767944,
976
+ "logits/rejected": -1.165112018585205,
977
+ "logps/chosen": -350.9917907714844,
978
+ "logps/rejected": -418.5997619628906,
979
+ "loss": 0.5373,
980
+ "rewards/accuracies": 0.7437499761581421,
981
+ "rewards/chosen": -2.05993914604187,
982
+ "rewards/margins": 0.7539520263671875,
983
+ "rewards/rejected": -2.8138911724090576,
984
+ "step": 640
985
+ },
986
+ {
987
+ "epoch": 0.5174129353233831,
988
+ "grad_norm": 16.255617080563933,
989
+ "learning_rate": 2.7843507773121414e-07,
990
+ "logits/chosen": -1.929992437362671,
991
+ "logits/rejected": -1.2393519878387451,
992
+ "logps/chosen": -344.0683898925781,
993
+ "logps/rejected": -392.7326354980469,
994
+ "loss": 0.5132,
995
+ "rewards/accuracies": 0.699999988079071,
996
+ "rewards/chosen": -1.9438108205795288,
997
+ "rewards/margins": 0.6626324653625488,
998
+ "rewards/rejected": -2.6064436435699463,
999
+ "step": 650
1000
+ },
1001
+ {
1002
+ "epoch": 0.5253731343283582,
1003
+ "grad_norm": 19.277456544122902,
1004
+ "learning_rate": 2.715196572027789e-07,
1005
+ "logits/chosen": -1.7599384784698486,
1006
+ "logits/rejected": -1.478522777557373,
1007
+ "logps/chosen": -312.7566833496094,
1008
+ "logps/rejected": -391.0369873046875,
1009
+ "loss": 0.5536,
1010
+ "rewards/accuracies": 0.75,
1011
+ "rewards/chosen": -1.6669034957885742,
1012
+ "rewards/margins": 0.7611022591590881,
1013
+ "rewards/rejected": -2.4280056953430176,
1014
+ "step": 660
1015
+ },
1016
+ {
1017
+ "epoch": 0.5333333333333333,
1018
+ "grad_norm": 17.410375794414502,
1019
+ "learning_rate": 2.645876044538521e-07,
1020
+ "logits/chosen": -1.991127371788025,
1021
+ "logits/rejected": -1.690598726272583,
1022
+ "logps/chosen": -306.1337890625,
1023
+ "logps/rejected": -391.0872802734375,
1024
+ "loss": 0.5555,
1025
+ "rewards/accuracies": 0.706250011920929,
1026
+ "rewards/chosen": -1.585716962814331,
1027
+ "rewards/margins": 0.7589733600616455,
1028
+ "rewards/rejected": -2.3446903228759766,
1029
+ "step": 670
1030
+ },
1031
+ {
1032
+ "epoch": 0.5412935323383085,
1033
+ "grad_norm": 15.825036910316848,
1034
+ "learning_rate": 2.5764427716409815e-07,
1035
+ "logits/chosen": -1.9365549087524414,
1036
+ "logits/rejected": -1.4879454374313354,
1037
+ "logps/chosen": -305.0138854980469,
1038
+ "logps/rejected": -357.05596923828125,
1039
+ "loss": 0.5446,
1040
+ "rewards/accuracies": 0.668749988079071,
1041
+ "rewards/chosen": -1.7255747318267822,
1042
+ "rewards/margins": 0.5652529001235962,
1043
+ "rewards/rejected": -2.2908272743225098,
1044
+ "step": 680
1045
+ },
1046
+ {
1047
+ "epoch": 0.5492537313432836,
1048
+ "grad_norm": 19.97205918760906,
1049
+ "learning_rate": 2.5069504172710494e-07,
1050
+ "logits/chosen": -1.8567272424697876,
1051
+ "logits/rejected": -1.7196296453475952,
1052
+ "logps/chosen": -317.20428466796875,
1053
+ "logps/rejected": -414.39404296875,
1054
+ "loss": 0.5509,
1055
+ "rewards/accuracies": 0.7437499761581421,
1056
+ "rewards/chosen": -1.7734630107879639,
1057
+ "rewards/margins": 0.7310962080955505,
1058
+ "rewards/rejected": -2.50455904006958,
1059
+ "step": 690
1060
+ },
1061
+ {
1062
+ "epoch": 0.5572139303482587,
1063
+ "grad_norm": 23.24991074411631,
1064
+ "learning_rate": 2.4374526910277886e-07,
1065
+ "logits/chosen": -1.976335883140564,
1066
+ "logits/rejected": -1.6772323846817017,
1067
+ "logps/chosen": -334.4820251464844,
1068
+ "logps/rejected": -401.43585205078125,
1069
+ "loss": 0.5486,
1070
+ "rewards/accuracies": 0.731249988079071,
1071
+ "rewards/chosen": -1.7821439504623413,
1072
+ "rewards/margins": 0.7127203345298767,
1073
+ "rewards/rejected": -2.4948642253875732,
1074
+ "step": 700
1075
+ },
1076
+ {
1077
+ "epoch": 0.5651741293532339,
1078
+ "grad_norm": 18.01899564630511,
1079
+ "learning_rate": 2.368003306662104e-07,
1080
+ "logits/chosen": -1.8737258911132812,
1081
+ "logits/rejected": -1.4357565641403198,
1082
+ "logps/chosen": -350.22772216796875,
1083
+ "logps/rejected": -403.4524841308594,
1084
+ "loss": 0.5471,
1085
+ "rewards/accuracies": 0.6812499761581421,
1086
+ "rewards/chosen": -1.8687664270401,
1087
+ "rewards/margins": 0.6295676231384277,
1088
+ "rewards/rejected": -2.4983339309692383,
1089
+ "step": 710
1090
+ },
1091
+ {
1092
+ "epoch": 0.573134328358209,
1093
+ "grad_norm": 24.167230989729408,
1094
+ "learning_rate": 2.2986559405621886e-07,
1095
+ "logits/chosen": -1.8662601709365845,
1096
+ "logits/rejected": -1.612449288368225,
1097
+ "logps/chosen": -298.7684326171875,
1098
+ "logps/rejected": -387.6333312988281,
1099
+ "loss": 0.5546,
1100
+ "rewards/accuracies": 0.706250011920929,
1101
+ "rewards/chosen": -1.6841051578521729,
1102
+ "rewards/margins": 0.712591290473938,
1103
+ "rewards/rejected": -2.3966965675354004,
1104
+ "step": 720
1105
+ },
1106
+ {
1107
+ "epoch": 0.5810945273631841,
1108
+ "grad_norm": 17.64113619581508,
1109
+ "learning_rate": 2.2294641902678443e-07,
1110
+ "logits/chosen": -1.850368857383728,
1111
+ "logits/rejected": -1.5335843563079834,
1112
+ "logps/chosen": -342.439208984375,
1113
+ "logps/rejected": -392.17120361328125,
1114
+ "loss": 0.5178,
1115
+ "rewards/accuracies": 0.6812499761581421,
1116
+ "rewards/chosen": -1.8309608697891235,
1117
+ "rewards/margins": 0.5988297462463379,
1118
+ "rewards/rejected": -2.4297900199890137,
1119
+ "step": 730
1120
+ },
1121
+ {
1122
+ "epoch": 0.5890547263681593,
1123
+ "grad_norm": 18.74126586937816,
1124
+ "learning_rate": 2.160481533045751e-07,
1125
+ "logits/chosen": -1.860264778137207,
1126
+ "logits/rejected": -1.3350017070770264,
1127
+ "logps/chosen": -327.9742126464844,
1128
+ "logps/rejected": -388.90789794921875,
1129
+ "loss": 0.5582,
1130
+ "rewards/accuracies": 0.6875,
1131
+ "rewards/chosen": -1.7639926671981812,
1132
+ "rewards/margins": 0.679111659526825,
1133
+ "rewards/rejected": -2.4431042671203613,
1134
+ "step": 740
1135
+ },
1136
+ {
1137
+ "epoch": 0.5970149253731343,
1138
+ "grad_norm": 17.924098734097832,
1139
+ "learning_rate": 2.0917612845576882e-07,
1140
+ "logits/chosen": -1.8485305309295654,
1141
+ "logits/rejected": -1.4422708749771118,
1142
+ "logps/chosen": -310.1546325683594,
1143
+ "logps/rejected": -376.4947204589844,
1144
+ "loss": 0.5258,
1145
+ "rewards/accuracies": 0.6937500238418579,
1146
+ "rewards/chosen": -1.6890716552734375,
1147
+ "rewards/margins": 0.6663106679916382,
1148
+ "rewards/rejected": -2.355382204055786,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 0.6049751243781094,
1153
+ "grad_norm": 16.32315826074932,
1154
+ "learning_rate": 2.0233565576536564e-07,
1155
+ "logits/chosen": -1.8351771831512451,
1156
+ "logits/rejected": -1.5401179790496826,
1157
+ "logps/chosen": -319.2179870605469,
1158
+ "logps/rejected": -388.4412536621094,
1159
+ "loss": 0.5637,
1160
+ "rewards/accuracies": 0.7124999761581421,
1161
+ "rewards/chosen": -1.7108579874038696,
1162
+ "rewards/margins": 0.6797488927841187,
1163
+ "rewards/rejected": -2.3906068801879883,
1164
+ "step": 760
1165
+ },
1166
+ {
1167
+ "epoch": 0.6129353233830845,
1168
+ "grad_norm": 15.45028984513974,
1169
+ "learning_rate": 1.9553202213217537e-07,
1170
+ "logits/chosen": -2.024503469467163,
1171
+ "logits/rejected": -1.4444875717163086,
1172
+ "logps/chosen": -284.2763366699219,
1173
+ "logps/rejected": -360.1900939941406,
1174
+ "loss": 0.5266,
1175
+ "rewards/accuracies": 0.768750011920929,
1176
+ "rewards/chosen": -1.384358286857605,
1177
+ "rewards/margins": 0.8107110857963562,
1178
+ "rewards/rejected": -2.1950695514678955,
1179
+ "step": 770
1180
+ },
1181
+ {
1182
+ "epoch": 0.6208955223880597,
1183
+ "grad_norm": 20.649664170903453,
1184
+ "learning_rate": 1.887704859826528e-07,
1185
+ "logits/chosen": -1.937726616859436,
1186
+ "logits/rejected": -1.501091718673706,
1187
+ "logps/chosen": -324.5860900878906,
1188
+ "logps/rejected": -393.491943359375,
1189
+ "loss": 0.5384,
1190
+ "rewards/accuracies": 0.6875,
1191
+ "rewards/chosen": -1.7339260578155518,
1192
+ "rewards/margins": 0.6434067487716675,
1193
+ "rewards/rejected": -2.3773326873779297,
1194
+ "step": 780
1195
+ },
1196
+ {
1197
+ "epoch": 0.6288557213930348,
1198
+ "grad_norm": 21.08511180186782,
1199
+ "learning_rate": 1.8205627320673836e-07,
1200
+ "logits/chosen": -1.6596505641937256,
1201
+ "logits/rejected": -1.2972899675369263,
1202
+ "logps/chosen": -337.11920166015625,
1203
+ "logps/rejected": -406.6131286621094,
1204
+ "loss": 0.5298,
1205
+ "rewards/accuracies": 0.7124999761581421,
1206
+ "rewards/chosen": -1.9775470495224,
1207
+ "rewards/margins": 0.7096458673477173,
1208
+ "rewards/rejected": -2.6871931552886963,
1209
+ "step": 790
1210
+ },
1211
+ {
1212
+ "epoch": 0.6368159203980099,
1213
+ "grad_norm": 17.361400963292194,
1214
+ "learning_rate": 1.7539457311884675e-07,
1215
+ "logits/chosen": -1.6588348150253296,
1216
+ "logits/rejected": -1.1941492557525635,
1217
+ "logps/chosen": -328.57879638671875,
1218
+ "logps/rejected": -394.7569274902344,
1219
+ "loss": 0.5249,
1220
+ "rewards/accuracies": 0.731249988079071,
1221
+ "rewards/chosen": -1.87433660030365,
1222
+ "rewards/margins": 0.719002366065979,
1223
+ "rewards/rejected": -2.593339204788208,
1224
+ "step": 800
1225
+ },
1226
+ {
1227
+ "epoch": 0.6447761194029851,
1228
+ "grad_norm": 16.182542903518556,
1229
+ "learning_rate": 1.687905344471226e-07,
1230
+ "logits/chosen": -1.7270431518554688,
1231
+ "logits/rejected": -1.6882022619247437,
1232
+ "logps/chosen": -318.8824462890625,
1233
+ "logps/rejected": -387.10113525390625,
1234
+ "loss": 0.5637,
1235
+ "rewards/accuracies": 0.668749988079071,
1236
+ "rewards/chosen": -1.8991286754608154,
1237
+ "rewards/margins": 0.5004085302352905,
1238
+ "rewards/rejected": -2.3995373249053955,
1239
+ "step": 810
1240
+ },
1241
+ {
1242
+ "epoch": 0.6527363184079602,
1243
+ "grad_norm": 16.172059932517488,
1244
+ "learning_rate": 1.6224926135406693e-07,
1245
+ "logits/chosen": -1.9119154214859009,
1246
+ "logits/rejected": -1.4078561067581177,
1247
+ "logps/chosen": -336.28851318359375,
1248
+ "logps/rejected": -399.19146728515625,
1249
+ "loss": 0.5432,
1250
+ "rewards/accuracies": 0.6937500238418579,
1251
+ "rewards/chosen": -1.8701921701431274,
1252
+ "rewards/margins": 0.729258120059967,
1253
+ "rewards/rejected": -2.5994503498077393,
1254
+ "step": 820
1255
+ },
1256
+ {
1257
+ "epoch": 0.6606965174129353,
1258
+ "grad_norm": 15.778737810564293,
1259
+ "learning_rate": 1.557758094916053e-07,
1260
+ "logits/chosen": -1.7709137201309204,
1261
+ "logits/rejected": -1.497018814086914,
1262
+ "logps/chosen": -352.27264404296875,
1263
+ "logps/rejected": -433.1822204589844,
1264
+ "loss": 0.5354,
1265
+ "rewards/accuracies": 0.731249988079071,
1266
+ "rewards/chosen": -2.045311450958252,
1267
+ "rewards/margins": 0.6497110724449158,
1268
+ "rewards/rejected": -2.6950223445892334,
1269
+ "step": 830
1270
+ },
1271
+ {
1272
+ "epoch": 0.6686567164179105,
1273
+ "grad_norm": 16.405494774284087,
1274
+ "learning_rate": 1.4937518209365108e-07,
1275
+ "logits/chosen": -1.7025296688079834,
1276
+ "logits/rejected": -1.1163098812103271,
1277
+ "logps/chosen": -354.352294921875,
1278
+ "logps/rejected": -409.0311279296875,
1279
+ "loss": 0.5527,
1280
+ "rewards/accuracies": 0.7250000238418579,
1281
+ "rewards/chosen": -1.9278078079223633,
1282
+ "rewards/margins": 0.6557299494743347,
1283
+ "rewards/rejected": -2.5835378170013428,
1284
+ "step": 840
1285
+ },
1286
+ {
1287
+ "epoch": 0.6766169154228856,
1288
+ "grad_norm": 22.75535436647274,
1289
+ "learning_rate": 1.4305232610918045e-07,
1290
+ "logits/chosen": -1.7835566997528076,
1291
+ "logits/rejected": -1.3081390857696533,
1292
+ "logps/chosen": -334.17626953125,
1293
+ "logps/rejected": -396.04974365234375,
1294
+ "loss": 0.5598,
1295
+ "rewards/accuracies": 0.6875,
1296
+ "rewards/chosen": -1.9084104299545288,
1297
+ "rewards/margins": 0.6380718350410461,
1298
+ "rewards/rejected": -2.5464820861816406,
1299
+ "step": 850
1300
+ },
1301
+ {
1302
+ "epoch": 0.6845771144278607,
1303
+ "grad_norm": 16.28637474078457,
1304
+ "learning_rate": 1.3681212837880977e-07,
1305
+ "logits/chosen": -1.7336877584457397,
1306
+ "logits/rejected": -1.3859245777130127,
1307
+ "logps/chosen": -351.1092834472656,
1308
+ "logps/rejected": -402.8192138671875,
1309
+ "loss": 0.5382,
1310
+ "rewards/accuracies": 0.6937500238418579,
1311
+ "rewards/chosen": -1.9633920192718506,
1312
+ "rewards/margins": 0.6326353549957275,
1313
+ "rewards/rejected": -2.5960276126861572,
1314
+ "step": 860
1315
+ },
1316
+ {
1317
+ "epoch": 0.6925373134328359,
1318
+ "grad_norm": 14.992190553347786,
1319
+ "learning_rate": 1.3065941185782977e-07,
1320
+ "logits/chosen": -1.6035921573638916,
1321
+ "logits/rejected": -1.390312910079956,
1322
+ "logps/chosen": -338.3635559082031,
1323
+ "logps/rejected": -407.48101806640625,
1324
+ "loss": 0.5465,
1325
+ "rewards/accuracies": 0.699999988079071,
1326
+ "rewards/chosen": -2.1271004676818848,
1327
+ "rewards/margins": 0.5808795690536499,
1328
+ "rewards/rejected": -2.707979679107666,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 0.700497512437811,
1333
+ "grad_norm": 18.882701792962603,
1334
+ "learning_rate": 1.2459893188861613e-07,
1335
+ "logits/chosen": -1.7753925323486328,
1336
+ "logits/rejected": -1.313396692276001,
1337
+ "logps/chosen": -328.06280517578125,
1338
+ "logps/rejected": -409.3533020019531,
1339
+ "loss": 0.5378,
1340
+ "rewards/accuracies": 0.8125,
1341
+ "rewards/chosen": -1.7554057836532593,
1342
+ "rewards/margins": 0.8705118894577026,
1343
+ "rewards/rejected": -2.625917434692383,
1344
+ "step": 880
1345
+ },
1346
+ {
1347
+ "epoch": 0.708457711442786,
1348
+ "grad_norm": 15.585282528531224,
1349
+ "learning_rate": 1.1863537252529548e-07,
1350
+ "logits/chosen": -1.6346286535263062,
1351
+ "logits/rejected": -1.0885813236236572,
1352
+ "logps/chosen": -314.69415283203125,
1353
+ "logps/rejected": -406.14593505859375,
1354
+ "loss": 0.5274,
1355
+ "rewards/accuracies": 0.7124999761581421,
1356
+ "rewards/chosen": -1.7842576503753662,
1357
+ "rewards/margins": 0.8567419052124023,
1358
+ "rewards/rejected": -2.6409995555877686,
1359
+ "step": 890
1360
+ },
1361
+ {
1362
+ "epoch": 0.7164179104477612,
1363
+ "grad_norm": 17.240684758370243,
1364
+ "learning_rate": 1.1277334291351145e-07,
1365
+ "logits/chosen": -1.559463381767273,
1366
+ "logits/rejected": -0.979491114616394,
1367
+ "logps/chosen": -314.4317321777344,
1368
+ "logps/rejected": -395.7962341308594,
1369
+ "loss": 0.5356,
1370
+ "rewards/accuracies": 0.6937500238418579,
1371
+ "rewards/chosen": -1.7382938861846924,
1372
+ "rewards/margins": 0.8430318832397461,
1373
+ "rewards/rejected": -2.5813255310058594,
1374
+ "step": 900
1375
+ },
1376
+ {
1377
+ "epoch": 0.7243781094527363,
1378
+ "grad_norm": 17.81287348840895,
1379
+ "learning_rate": 1.0701737372808431e-07,
1380
+ "logits/chosen": -1.356980562210083,
1381
+ "logits/rejected": -1.0851752758026123,
1382
+ "logps/chosen": -321.2406005859375,
1383
+ "logps/rejected": -439.5826110839844,
1384
+ "loss": 0.5266,
1385
+ "rewards/accuracies": 0.8187500238418579,
1386
+ "rewards/chosen": -1.7592580318450928,
1387
+ "rewards/margins": 1.1267178058624268,
1388
+ "rewards/rejected": -2.8859758377075195,
1389
+ "step": 910
1390
+ },
1391
+ {
1392
+ "epoch": 0.7323383084577114,
1393
+ "grad_norm": 16.146564583575124,
1394
+ "learning_rate": 1.0137191367132078e-07,
1395
+ "logits/chosen": -1.5627696514129639,
1396
+ "logits/rejected": -0.8526192903518677,
1397
+ "logps/chosen": -343.89093017578125,
1398
+ "logps/rejected": -410.4419860839844,
1399
+ "loss": 0.5202,
1400
+ "rewards/accuracies": 0.71875,
1401
+ "rewards/chosen": -2.032959461212158,
1402
+ "rewards/margins": 0.7283061146736145,
1403
+ "rewards/rejected": -2.761265754699707,
1404
+ "step": 920
1405
+ },
1406
+ {
1407
+ "epoch": 0.7402985074626866,
1408
+ "grad_norm": 16.30781388479682,
1409
+ "learning_rate": 9.584132603467827e-08,
1410
+ "logits/chosen": -1.472614049911499,
1411
+ "logits/rejected": -1.0152631998062134,
1412
+ "logps/chosen": -369.9658203125,
1413
+ "logps/rejected": -457.43280029296875,
1414
+ "loss": 0.5391,
1415
+ "rewards/accuracies": 0.8125,
1416
+ "rewards/chosen": -2.0644240379333496,
1417
+ "rewards/margins": 0.929151177406311,
1418
+ "rewards/rejected": -2.99357533454895,
1419
+ "step": 930
1420
+ },
1421
+ {
1422
+ "epoch": 0.7482587064676617,
1423
+ "grad_norm": 16.90834018386895,
1424
+ "learning_rate": 9.042988532644249e-08,
1425
+ "logits/chosen": -1.397674798965454,
1426
+ "logits/rejected": -1.057903528213501,
1427
+ "logps/chosen": -375.06805419921875,
1428
+ "logps/rejected": -459.8721618652344,
1429
+ "loss": 0.5326,
1430
+ "rewards/accuracies": 0.699999988079071,
1431
+ "rewards/chosen": -2.2227516174316406,
1432
+ "rewards/margins": 0.7576023936271667,
1433
+ "rewards/rejected": -2.980353832244873,
1434
+ "step": 940
1435
+ },
1436
+ {
1437
+ "epoch": 0.7562189054726368,
1438
+ "grad_norm": 19.44454583035634,
1439
+ "learning_rate": 8.514177396802428e-08,
1440
+ "logits/chosen": -1.3311452865600586,
1441
+ "logits/rejected": -0.988301157951355,
1442
+ "logps/chosen": -356.1918640136719,
1443
+ "logps/rejected": -429.5113220214844,
1444
+ "loss": 0.5311,
1445
+ "rewards/accuracies": 0.731249988079071,
1446
+ "rewards/chosen": -2.1742124557495117,
1447
+ "rewards/margins": 0.7231701612472534,
1448
+ "rewards/rejected": -2.8973822593688965,
1449
+ "step": 950
1450
+ },
1451
+ {
1452
+ "epoch": 0.764179104477612,
1453
+ "grad_norm": 18.789667512348625,
1454
+ "learning_rate": 7.998107906142839e-08,
1455
+ "logits/chosen": -1.2201939821243286,
1456
+ "logits/rejected": -0.8874862790107727,
1457
+ "logps/chosen": -364.1236267089844,
1458
+ "logps/rejected": -449.46466064453125,
1459
+ "loss": 0.5436,
1460
+ "rewards/accuracies": 0.7250000238418579,
1461
+ "rewards/chosen": -2.1308789253234863,
1462
+ "rewards/margins": 0.786114513874054,
1463
+ "rewards/rejected": -2.9169933795928955,
1464
+ "step": 960
1465
+ },
1466
+ {
1467
+ "epoch": 0.7721393034825871,
1468
+ "grad_norm": 18.028047538401793,
1469
+ "learning_rate": 7.495178923039396e-08,
1470
+ "logits/chosen": -1.3335978984832764,
1471
+ "logits/rejected": -0.8373553156852722,
1472
+ "logps/chosen": -352.852294921875,
1473
+ "logps/rejected": -425.99853515625,
1474
+ "loss": 0.5323,
1475
+ "rewards/accuracies": 0.75,
1476
+ "rewards/chosen": -2.04585862159729,
1477
+ "rewards/margins": 0.8074928522109985,
1478
+ "rewards/rejected": -2.853351354598999,
1479
+ "step": 970
1480
+ },
1481
+ {
1482
+ "epoch": 0.7800995024875622,
1483
+ "grad_norm": 19.107873765643053,
1484
+ "learning_rate": 7.005779153764682e-08,
1485
+ "logits/chosen": -1.2632755041122437,
1486
+ "logits/rejected": -1.1477338075637817,
1487
+ "logps/chosen": -362.19293212890625,
1488
+ "logps/rejected": -445.4449157714844,
1489
+ "loss": 0.5212,
1490
+ "rewards/accuracies": 0.699999988079071,
1491
+ "rewards/chosen": -2.2130560874938965,
1492
+ "rewards/margins": 0.6761744618415833,
1493
+ "rewards/rejected": -2.889230728149414,
1494
+ "step": 980
1495
+ },
1496
+ {
1497
+ "epoch": 0.7880597014925373,
1498
+ "grad_norm": 20.371049931041078,
1499
+ "learning_rate": 6.530286848064698e-08,
1500
+ "logits/chosen": -1.2885363101959229,
1501
+ "logits/rejected": -0.6255804896354675,
1502
+ "logps/chosen": -338.9552001953125,
1503
+ "logps/rejected": -428.79931640625,
1504
+ "loss": 0.5195,
1505
+ "rewards/accuracies": 0.7562500238418579,
1506
+ "rewards/chosen": -2.0429329872131348,
1507
+ "rewards/margins": 0.9163187146186829,
1508
+ "rewards/rejected": -2.959251880645752,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 0.7960199004975125,
1513
+ "grad_norm": 19.72789599453256,
1514
+ "learning_rate": 6.069069506815325e-08,
1515
+ "logits/chosen": -1.360527515411377,
1516
+ "logits/rejected": -0.8227861523628235,
1517
+ "logps/chosen": -373.851318359375,
1518
+ "logps/rejected": -440.9930114746094,
1519
+ "loss": 0.5563,
1520
+ "rewards/accuracies": 0.706250011920929,
1521
+ "rewards/chosen": -2.2749392986297607,
1522
+ "rewards/margins": 0.752488911151886,
1523
+ "rewards/rejected": -3.027428388595581,
1524
+ "step": 1000
1525
+ },
1526
+ {
1527
+ "epoch": 0.8039800995024876,
1528
+ "grad_norm": 17.338627129120006,
1529
+ "learning_rate": 5.6224835979863714e-08,
1530
+ "logits/chosen": -1.377640724182129,
1531
+ "logits/rejected": -0.8697378039360046,
1532
+ "logps/chosen": -388.3387756347656,
1533
+ "logps/rejected": -466.47174072265625,
1534
+ "loss": 0.5397,
1535
+ "rewards/accuracies": 0.7250000238418579,
1536
+ "rewards/chosen": -2.309962034225464,
1537
+ "rewards/margins": 0.7413021922111511,
1538
+ "rewards/rejected": -3.0512640476226807,
1539
+ "step": 1010
1540
+ },
1541
+ {
1542
+ "epoch": 0.8119402985074626,
1543
+ "grad_norm": 18.9063759035064,
1544
+ "learning_rate": 5.190874281132851e-08,
1545
+ "logits/chosen": -1.4672131538391113,
1546
+ "logits/rejected": -0.6555231809616089,
1547
+ "logps/chosen": -377.4454650878906,
1548
+ "logps/rejected": -419.4335021972656,
1549
+ "loss": 0.5269,
1550
+ "rewards/accuracies": 0.6499999761581421,
1551
+ "rewards/chosen": -2.285759449005127,
1552
+ "rewards/margins": 0.5871948003768921,
1553
+ "rewards/rejected": -2.8729541301727295,
1554
+ "step": 1020
1555
+ },
1556
+ {
1557
+ "epoch": 0.8199004975124378,
1558
+ "grad_norm": 17.93974901514964,
1559
+ "learning_rate": 4.774575140626316e-08,
1560
+ "logits/chosen": -1.227505087852478,
1561
+ "logits/rejected": -0.7521185278892517,
1562
+ "logps/chosen": -379.4794616699219,
1563
+ "logps/rejected": -443.300537109375,
1564
+ "loss": 0.5371,
1565
+ "rewards/accuracies": 0.71875,
1566
+ "rewards/chosen": -2.316551685333252,
1567
+ "rewards/margins": 0.7855070233345032,
1568
+ "rewards/rejected": -3.1020588874816895,
1569
+ "step": 1030
1570
+ },
1571
+ {
1572
+ "epoch": 0.8278606965174129,
1573
+ "grad_norm": 19.765840382966818,
1574
+ "learning_rate": 4.373907927832513e-08,
1575
+ "logits/chosen": -1.2585428953170776,
1576
+ "logits/rejected": -0.6001216173171997,
1577
+ "logps/chosen": -360.7466125488281,
1578
+ "logps/rejected": -432.0596618652344,
1579
+ "loss": 0.5324,
1580
+ "rewards/accuracies": 0.7250000238418579,
1581
+ "rewards/chosen": -2.2043449878692627,
1582
+ "rewards/margins": 0.739472508430481,
1583
+ "rewards/rejected": -2.943817615509033,
1584
+ "step": 1040
1585
+ },
1586
+ {
1587
+ "epoch": 0.835820895522388,
1588
+ "grad_norm": 16.791397180911105,
1589
+ "learning_rate": 3.9891823124345665e-08,
1590
+ "logits/chosen": -1.595428466796875,
1591
+ "logits/rejected": -0.7433096170425415,
1592
+ "logps/chosen": -372.77667236328125,
1593
+ "logps/rejected": -427.80633544921875,
1594
+ "loss": 0.5419,
1595
+ "rewards/accuracies": 0.706250011920929,
1596
+ "rewards/chosen": -2.052586078643799,
1597
+ "rewards/margins": 0.7817685008049011,
1598
+ "rewards/rejected": -2.834354877471924,
1599
+ "step": 1050
1600
+ },
1601
+ {
1602
+ "epoch": 0.8437810945273632,
1603
+ "grad_norm": 17.099354050242567,
1604
+ "learning_rate": 3.620695643093924e-08,
1605
+ "logits/chosen": -1.0566952228546143,
1606
+ "logits/rejected": -1.0658143758773804,
1607
+ "logps/chosen": -353.5264587402344,
1608
+ "logps/rejected": -453.6097717285156,
1609
+ "loss": 0.5355,
1610
+ "rewards/accuracies": 0.7562500238418579,
1611
+ "rewards/chosen": -2.2306525707244873,
1612
+ "rewards/margins": 0.7557250261306763,
1613
+ "rewards/rejected": -2.986377716064453,
1614
+ "step": 1060
1615
+ },
1616
+ {
1617
+ "epoch": 0.8517412935323383,
1618
+ "grad_norm": 17.626047522616386,
1619
+ "learning_rate": 3.268732717634032e-08,
1620
+ "logits/chosen": -1.082467794418335,
1621
+ "logits/rejected": -0.6993826031684875,
1622
+ "logps/chosen": -356.1838073730469,
1623
+ "logps/rejected": -432.0118713378906,
1624
+ "loss": 0.5382,
1625
+ "rewards/accuracies": 0.731249988079071,
1626
+ "rewards/chosen": -2.2437031269073486,
1627
+ "rewards/margins": 0.703085720539093,
1628
+ "rewards/rejected": -2.946788787841797,
1629
+ "step": 1070
1630
+ },
1631
+ {
1632
+ "epoch": 0.8597014925373134,
1633
+ "grad_norm": 18.59186735479471,
1634
+ "learning_rate": 2.9335655629243645e-08,
1635
+ "logits/chosen": -1.294574499130249,
1636
+ "logits/rejected": -0.9409192800521851,
1637
+ "logps/chosen": -375.03857421875,
1638
+ "logps/rejected": -447.16845703125,
1639
+ "loss": 0.5504,
1640
+ "rewards/accuracies": 0.7437499761581421,
1641
+ "rewards/chosen": -2.2595221996307373,
1642
+ "rewards/margins": 0.7225955724716187,
1643
+ "rewards/rejected": -2.9821178913116455,
1644
+ "step": 1080
1645
+ },
1646
+ {
1647
+ "epoch": 0.8676616915422886,
1648
+ "grad_norm": 21.397708720247653,
1649
+ "learning_rate": 2.6154532246349476e-08,
1650
+ "logits/chosen": -1.1999056339263916,
1651
+ "logits/rejected": -1.0267088413238525,
1652
+ "logps/chosen": -363.5381164550781,
1653
+ "logps/rejected": -422.05511474609375,
1654
+ "loss": 0.5754,
1655
+ "rewards/accuracies": 0.6625000238418579,
1656
+ "rewards/chosen": -2.3281185626983643,
1657
+ "rewards/margins": 0.50066077709198,
1658
+ "rewards/rejected": -2.8287787437438965,
1659
+ "step": 1090
1660
+ },
1661
+ {
1662
+ "epoch": 0.8756218905472637,
1663
+ "grad_norm": 20.052028882501826,
1664
+ "learning_rate": 2.31464156702382e-08,
1665
+ "logits/chosen": -1.3676979541778564,
1666
+ "logits/rejected": -0.5716175436973572,
1667
+ "logps/chosen": -375.810302734375,
1668
+ "logps/rejected": -450.2315979003906,
1669
+ "loss": 0.5262,
1670
+ "rewards/accuracies": 0.7124999761581421,
1671
+ "rewards/chosen": -2.2596614360809326,
1672
+ "rewards/margins": 0.835864245891571,
1673
+ "rewards/rejected": -3.0955255031585693,
1674
+ "step": 1100
1675
+ },
1676
+ {
1677
+ "epoch": 0.8835820895522388,
1678
+ "grad_norm": 22.37302428318916,
1679
+ "learning_rate": 2.031363082912252e-08,
1680
+ "logits/chosen": -1.2721402645111084,
1681
+ "logits/rejected": -0.6073659658432007,
1682
+ "logps/chosen": -368.27984619140625,
1683
+ "logps/rejected": -448.68157958984375,
1684
+ "loss": 0.5231,
1685
+ "rewards/accuracies": 0.7250000238418579,
1686
+ "rewards/chosen": -2.095541477203369,
1687
+ "rewards/margins": 0.8900821805000305,
1688
+ "rewards/rejected": -2.985623598098755,
1689
+ "step": 1110
1690
+ },
1691
+ {
1692
+ "epoch": 0.891542288557214,
1693
+ "grad_norm": 25.266800229701747,
1694
+ "learning_rate": 1.7658367139945228e-08,
1695
+ "logits/chosen": -1.5140697956085205,
1696
+ "logits/rejected": -0.7945183515548706,
1697
+ "logps/chosen": -384.32269287109375,
1698
+ "logps/rejected": -433.766845703125,
1699
+ "loss": 0.5287,
1700
+ "rewards/accuracies": 0.7124999761581421,
1701
+ "rewards/chosen": -2.154731273651123,
1702
+ "rewards/margins": 0.6882571578025818,
1703
+ "rewards/rejected": -2.8429884910583496,
1704
+ "step": 1120
1705
+ },
1706
+ {
1707
+ "epoch": 0.8995024875621891,
1708
+ "grad_norm": 20.62304308306268,
1709
+ "learning_rate": 1.5182676816211632e-08,
1710
+ "logits/chosen": -1.3684704303741455,
1711
+ "logits/rejected": -0.8318718075752258,
1712
+ "logps/chosen": -365.13079833984375,
1713
+ "logps/rejected": -472.0492248535156,
1714
+ "loss": 0.5163,
1715
+ "rewards/accuracies": 0.7562500238418579,
1716
+ "rewards/chosen": -2.154531478881836,
1717
+ "rewards/margins": 1.0084809064865112,
1718
+ "rewards/rejected": -3.1630122661590576,
1719
+ "step": 1130
1720
+ },
1721
+ {
1722
+ "epoch": 0.9074626865671642,
1723
+ "grad_norm": 16.335373316417314,
1724
+ "learning_rate": 1.2888473281864597e-08,
1725
+ "logits/chosen": -1.4910027980804443,
1726
+ "logits/rejected": -0.7867361903190613,
1727
+ "logps/chosen": -357.47393798828125,
1728
+ "logps/rejected": -418.6597595214844,
1729
+ "loss": 0.5239,
1730
+ "rewards/accuracies": 0.6625000238418579,
1731
+ "rewards/chosen": -2.1304430961608887,
1732
+ "rewards/margins": 0.6699010133743286,
1733
+ "rewards/rejected": -2.8003439903259277,
1734
+ "step": 1140
1735
+ },
1736
+ {
1737
+ "epoch": 0.9154228855721394,
1738
+ "grad_norm": 19.127473957890142,
1739
+ "learning_rate": 1.0777529692427679e-08,
1740
+ "logits/chosen": -1.3886005878448486,
1741
+ "logits/rejected": -1.1176588535308838,
1742
+ "logps/chosen": -361.29595947265625,
1743
+ "logps/rejected": -450.3651428222656,
1744
+ "loss": 0.5321,
1745
+ "rewards/accuracies": 0.7562500238418579,
1746
+ "rewards/chosen": -2.1988742351531982,
1747
+ "rewards/margins": 0.7343587279319763,
1748
+ "rewards/rejected": -2.9332327842712402,
1749
+ "step": 1150
1750
+ },
1751
+ {
1752
+ "epoch": 0.9233830845771144,
1753
+ "grad_norm": 18.113861239231703,
1754
+ "learning_rate": 8.851477564560061e-09,
1755
+ "logits/chosen": -1.4955084323883057,
1756
+ "logits/rejected": -1.0137813091278076,
1757
+ "logps/chosen": -344.0810852050781,
1758
+ "logps/rejected": -438.7982482910156,
1759
+ "loss": 0.5534,
1760
+ "rewards/accuracies": 0.768750011920929,
1761
+ "rewards/chosen": -2.007995128631592,
1762
+ "rewards/margins": 0.8483503460884094,
1763
+ "rewards/rejected": -2.8563454151153564,
1764
+ "step": 1160
1765
+ },
1766
+ {
1767
+ "epoch": 0.9313432835820895,
1768
+ "grad_norm": 18.419748638148626,
1769
+ "learning_rate": 7.111805515081531e-09,
1770
+ "logits/chosen": -1.1694109439849854,
1771
+ "logits/rejected": -0.9867475628852844,
1772
+ "logps/chosen": -344.5587158203125,
1773
+ "logps/rejected": -453.9609375,
1774
+ "loss": 0.5216,
1775
+ "rewards/accuracies": 0.7749999761581421,
1776
+ "rewards/chosen": -2.174985408782959,
1777
+ "rewards/margins": 0.8908305168151855,
1778
+ "rewards/rejected": -3.0658156871795654,
1779
+ "step": 1170
1780
+ },
1781
+ {
1782
+ "epoch": 0.9393034825870646,
1783
+ "grad_norm": 16.31657411408144,
1784
+ "learning_rate": 5.559858110443016e-09,
1785
+ "logits/chosen": -1.391318440437317,
1786
+ "logits/rejected": -0.968989372253418,
1787
+ "logps/chosen": -338.8396301269531,
1788
+ "logps/rejected": -450.4630432128906,
1789
+ "loss": 0.5101,
1790
+ "rewards/accuracies": 0.7562500238418579,
1791
+ "rewards/chosen": -2.04402232170105,
1792
+ "rewards/margins": 0.873876690864563,
1793
+ "rewards/rejected": -2.9178991317749023,
1794
+ "step": 1180
1795
+ },
1796
+ {
1797
+ "epoch": 0.9472636815920398,
1798
+ "grad_norm": 16.213421308211853,
1799
+ "learning_rate": 4.196834827531276e-09,
1800
+ "logits/chosen": -1.3122155666351318,
1801
+ "logits/rejected": -0.727799117565155,
1802
+ "logps/chosen": -363.20123291015625,
1803
+ "logps/rejected": -421.6800231933594,
1804
+ "loss": 0.5223,
1805
+ "rewards/accuracies": 0.6625000238418579,
1806
+ "rewards/chosen": -2.240814685821533,
1807
+ "rewards/margins": 0.7033532857894897,
1808
+ "rewards/rejected": -2.9441676139831543,
1809
+ "step": 1190
1810
+ },
1811
+ {
1812
+ "epoch": 0.9552238805970149,
1813
+ "grad_norm": 16.61159163977494,
1814
+ "learning_rate": 3.023789126611137e-09,
1815
+ "logits/chosen": -1.3651138544082642,
1816
+ "logits/rejected": -1.0306357145309448,
1817
+ "logps/chosen": -346.49127197265625,
1818
+ "logps/rejected": -433.2818298339844,
1819
+ "loss": 0.5514,
1820
+ "rewards/accuracies": 0.6937500238418579,
1821
+ "rewards/chosen": -2.017871856689453,
1822
+ "rewards/margins": 0.7361677289009094,
1823
+ "rewards/rejected": -2.7540395259857178,
1824
+ "step": 1200
1825
+ },
1826
+ {
1827
+ "epoch": 0.96318407960199,
1828
+ "grad_norm": 19.71446411178462,
1829
+ "learning_rate": 2.041627637121929e-09,
1830
+ "logits/chosen": -1.2996383905410767,
1831
+ "logits/rejected": -0.6256915330886841,
1832
+ "logps/chosen": -358.1094665527344,
1833
+ "logps/rejected": -430.37811279296875,
1834
+ "loss": 0.5558,
1835
+ "rewards/accuracies": 0.7562500238418579,
1836
+ "rewards/chosen": -2.246715784072876,
1837
+ "rewards/margins": 0.7623183727264404,
1838
+ "rewards/rejected": -3.0090343952178955,
1839
+ "step": 1210
1840
+ },
1841
+ {
1842
+ "epoch": 0.9711442786069652,
1843
+ "grad_norm": 19.712848729427638,
1844
+ "learning_rate": 1.2511094569571668e-09,
1845
+ "logits/chosen": -1.328294277191162,
1846
+ "logits/rejected": -0.7868885397911072,
1847
+ "logps/chosen": -368.758056640625,
1848
+ "logps/rejected": -458.2462463378906,
1849
+ "loss": 0.5449,
1850
+ "rewards/accuracies": 0.7749999761581421,
1851
+ "rewards/chosen": -2.1683990955352783,
1852
+ "rewards/margins": 0.9658330082893372,
1853
+ "rewards/rejected": -3.1342320442199707,
1854
+ "step": 1220
1855
+ },
1856
+ {
1857
+ "epoch": 0.9791044776119403,
1858
+ "grad_norm": 16.725128383172244,
1859
+ "learning_rate": 6.528455657691112e-10,
1860
+ "logits/chosen": -1.339804768562317,
1861
+ "logits/rejected": -0.7188054323196411,
1862
+ "logps/chosen": -367.33935546875,
1863
+ "logps/rejected": -451.83984375,
1864
+ "loss": 0.5116,
1865
+ "rewards/accuracies": 0.7250000238418579,
1866
+ "rewards/chosen": -2.153168201446533,
1867
+ "rewards/margins": 0.8999155759811401,
1868
+ "rewards/rejected": -3.053083896636963,
1869
+ "step": 1230
1870
+ },
1871
+ {
1872
+ "epoch": 0.9870646766169154,
1873
+ "grad_norm": 19.596666231596082,
1874
+ "learning_rate": 2.4729835275189016e-10,
1875
+ "logits/chosen": -1.3673975467681885,
1876
+ "logits/rejected": -0.6154208183288574,
1877
+ "logps/chosen": -359.135009765625,
1878
+ "logps/rejected": -439.5702209472656,
1879
+ "loss": 0.543,
1880
+ "rewards/accuracies": 0.7437499761581421,
1881
+ "rewards/chosen": -2.121037721633911,
1882
+ "rewards/margins": 0.8869087100028992,
1883
+ "rewards/rejected": -3.007946491241455,
1884
+ "step": 1240
1885
+ },
1886
+ {
1887
+ "epoch": 0.9950248756218906,
1888
+ "grad_norm": 19.08758915029115,
1889
+ "learning_rate": 3.478125926756337e-11,
1890
+ "logits/chosen": -1.2848167419433594,
1891
+ "logits/rejected": -0.7639325261116028,
1892
+ "logps/chosen": -373.97735595703125,
1893
+ "logps/rejected": -479.5267639160156,
1894
+ "loss": 0.5345,
1895
+ "rewards/accuracies": 0.78125,
1896
+ "rewards/chosen": -2.261409044265747,
1897
+ "rewards/margins": 0.9971047639846802,
1898
+ "rewards/rejected": -3.2585136890411377,
1899
+ "step": 1250
1900
+ },
1901
+ {
1902
+ "epoch": 0.9998009950248756,
1903
+ "step": 1256,
1904
  "total_flos": 0.0,
1905
+ "train_loss": 0.56497666932595,
1906
+ "train_runtime": 62670.8183,
1907
+ "train_samples_per_second": 2.566,
1908
+ "train_steps_per_second": 0.02
1909
  }
1910
  ],
1911
  "logging_steps": 10,
1912
+ "max_steps": 1256,
1913
+ "num_input_tokens_seen": 0,
1914
  "num_train_epochs": 1,
1915
+ "save_steps": 100,
1916
+ "stateful_callbacks": {
1917
+ "TrainerControl": {
1918
+ "args": {
1919
+ "should_epoch_stop": false,
1920
+ "should_evaluate": false,
1921
+ "should_log": false,
1922
+ "should_save": true,
1923
+ "should_training_stop": false
1924
+ },
1925
+ "attributes": {}
1926
+ }
1927
+ },
1928
  "total_flos": 0.0,
1929
+ "train_batch_size": 1,
1930
  "trial_name": null,
1931
  "trial_params": null
1932
  }
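
Aside: the log objects above appear to follow the Hugging Face Trainer's trainer_state.json layout — one entry per `logging_steps` interval plus a final summary, with the DPO-style keys shown (loss, rewards/chosen, rewards/rejected, rewards/margins, rewards/accuracies). A minimal sketch for inspecting such a file after downloading a checkpoint; the local filename and path are assumptions, not part of this commit:

```python
# Minimal sketch (assumes the diff above is the Trainer's trainer_state.json and
# that the file has been downloaded locally from a checkpoint directory).
# It walks the logged history and prints loss / reward-margin trends per step.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry mirrors the objects above: step, loss, learning_rate,
# rewards/chosen, rewards/rejected, rewards/margins, rewards/accuracies, ...
for entry in state["log_history"]:
    if "loss" in entry:  # skip summary entries that carry no training loss
        print(
            f"step {entry['step']:>4}  "
            f"loss {entry['loss']:.4f}  "
            f"margin {entry.get('rewards/margins', float('nan')):.3f}  "
            f"acc {entry.get('rewards/accuracies', float('nan')):.3f}"
        )
```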
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b277b1feb57ed04289fa7a5826a75f7c9798133781b7de28bcc1e3953a6201f
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e19cc2efde9c9407a03928a56d8f9f4a67e235bc530069812edfde062436b88
3
+ size 6456
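
Note: training_args.bin is a Git LFS pointer to a pickled training-arguments object rather than human-readable text. If you need to inspect it, a hedged sketch is below — it assumes the file has been downloaded locally, that `transformers` is installed so the pickled class can be resolved, and that your torch version allows loading pickled objects (recent releases require passing `weights_only=False` explicitly):

```python
# Sketch only: inspect the pickled training arguments stored in training_args.bin.
# Assumes a local copy of the file and an environment with transformers installed.
import torch

args = torch.load("training_args.bin", weights_only=False)  # pickled object, not tensors
print(type(args).__name__)  # typically TrainingArguments or a subclass of it
print(args.learning_rate, args.per_device_train_batch_size, args.lr_scheduler_type)
```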