RikkiXu commited on
Commit
358fc85
1 Parent(s): 3ee6e58

Model save

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # zephyr-7b-dpo-full
16
 
17
- This model is a fine-tuned version of [princeton-nlp/Mistral-7B-Base-SFT-DPO](https://huggingface.co/princeton-nlp/Mistral-7B-Base-SFT-DPO) on the None dataset.
18
 
19
  ## Model description
20
 
@@ -55,5 +55,5 @@ The following hyperparameters were used during training:
55
 
56
  - Transformers 4.39.3
57
  - Pytorch 2.1.2+cu118
58
- - Datasets 2.16.1
59
  - Tokenizers 0.15.2
 
14
 
15
  # zephyr-7b-dpo-full
16
 
17
+ This model is a fine-tuned version of [princeton-nlp/Mistral-7B-Base-SFT-DPO](https://huggingface.co/princeton-nlp/Mistral-7B-Base-SFT-DPO) on an unknown dataset.
18
 
19
  ## Model description
20
 
 
55
 
56
  - Transformers 4.39.3
57
  - Pytorch 2.1.2+cu118
58
+ - Datasets 2.19.1
59
  - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.37812867454580357,
4
- "train_runtime": 5319.5814,
5
- "train_samples": 47302,
6
- "train_samples_per_second": 8.892,
7
- "train_steps_per_second": 0.035
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.2785977178812027,
4
+ "train_runtime": 11929.9898,
5
+ "train_samples": 102360,
6
+ "train_samples_per_second": 8.58,
7
+ "train_steps_per_second": 0.034
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b0d91deaf39d5239312d2ee38ee390acaa4b4fa405cd4b896c533d8687b80b9
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b405c6b39fce54d81023410da5b0175bc34f1b707551cc87bb57315a19139d
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b806d3480fd4992109c5d61a5d48cf88cca223fbec8796f7b25067bfe4956722
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4fcfc77bc0cce12435a691bee318c376a963ab3c60e50f0201871ef7f9f1899
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a766c0fba83e2189c5e7cdab4f60fcf4245ea877bf5d1a20cb88a89e9fbb76e
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec4cc12a5a582b9be0861f8da52397528a0b5094e4e53a00c5c10ad9fdc740da
3
  size 4540516344
runs/Jun21_05-09-37_n136-112-146/events.out.tfevents.1718919313.n136-112-146.2797891.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3fddba7721247414cf78b00df1f9f9286ace834b37d429f6f52162097a4fd91
3
- size 32986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec7cfced69671d65a2c2d087a11f64906f935b37201b11175a3442ce3a673df9
3
+ size 33340
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.37812867454580357,
4
- "train_runtime": 5319.5814,
5
- "train_samples": 47302,
6
- "train_samples_per_second": 8.892,
7
- "train_steps_per_second": 0.035
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.2785977178812027,
4
+ "train_runtime": 11929.9898,
5
+ "train_samples": 102360,
6
+ "train_samples_per_second": 8.58,
7
+ "train_steps_per_second": 0.034
8
  }
trainer_state.json CHANGED
@@ -3,20 +3,20 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 185,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "grad_norm": 518.0837836920463,
14
- "learning_rate": 2.6315789473684208e-08,
15
- "logits/chosen": -0.1266070306301117,
16
- "logits/rejected": 0.7204304933547974,
17
- "logps/chosen": -319.01666259765625,
18
- "logps/rejected": -252.47039794921875,
19
- "loss": 0.6957,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
@@ -24,287 +24,617 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.05,
28
- "grad_norm": 455.8868449558538,
29
- "learning_rate": 2.631578947368421e-07,
30
- "logits/chosen": -0.38653168082237244,
31
- "logits/rejected": 0.3361072242259979,
32
- "logps/chosen": -266.4560546875,
33
- "logps/rejected": -224.02757263183594,
34
- "loss": 0.6557,
35
- "rewards/accuracies": 0.53125,
36
- "rewards/chosen": -0.08313964307308197,
37
- "rewards/margins": 0.08996326476335526,
38
- "rewards/rejected": -0.17310291528701782,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.11,
43
- "grad_norm": 261.73889640243516,
44
- "learning_rate": 4.999552306674344e-07,
45
- "logits/chosen": -0.3945041298866272,
46
- "logits/rejected": 0.5968992114067078,
47
- "logps/chosen": -283.4828796386719,
48
- "logps/rejected": -241.41641235351562,
49
- "loss": 0.4425,
50
- "rewards/accuracies": 0.8218749761581421,
51
- "rewards/chosen": 0.35899922251701355,
52
- "rewards/margins": 2.6830153465270996,
53
- "rewards/rejected": -2.3240160942077637,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.16,
58
- "grad_norm": 261.2394624512226,
59
- "learning_rate": 4.946022852363932e-07,
60
- "logits/chosen": -0.47767549753189087,
61
- "logits/rejected": 0.3392488360404968,
62
- "logps/chosen": -265.13568115234375,
63
- "logps/rejected": -243.9243927001953,
64
- "loss": 0.462,
65
- "rewards/accuracies": 0.815625011920929,
66
- "rewards/chosen": -0.15065696835517883,
67
- "rewards/margins": 5.174683094024658,
68
- "rewards/rejected": -5.3253397941589355,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.22,
73
- "grad_norm": 310.52802941610554,
74
- "learning_rate": 4.805146507594034e-07,
75
- "logits/chosen": -0.5098148584365845,
76
- "logits/rejected": 0.3751198649406433,
77
- "logps/chosen": -261.0537109375,
78
- "logps/rejected": -233.42532348632812,
79
- "loss": 0.4377,
80
- "rewards/accuracies": 0.862500011920929,
81
- "rewards/chosen": -0.6695741415023804,
82
- "rewards/margins": 6.071677207946777,
83
- "rewards/rejected": -6.7412519454956055,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.27,
88
- "grad_norm": 313.59929050730216,
89
- "learning_rate": 4.581953932909403e-07,
90
- "logits/chosen": -0.5670371055603027,
91
- "logits/rejected": 0.24361911416053772,
92
- "logps/chosen": -281.9254150390625,
93
- "logps/rejected": -255.1023712158203,
94
- "loss": 0.3616,
95
- "rewards/accuracies": 0.871874988079071,
96
- "rewards/chosen": 0.5183612108230591,
97
- "rewards/margins": 6.197042465209961,
98
- "rewards/rejected": -5.678681373596191,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.32,
103
- "grad_norm": 266.5544579622121,
104
- "learning_rate": 4.284415281717847e-07,
105
- "logits/chosen": -0.47226476669311523,
106
- "logits/rejected": 0.4281126856803894,
107
- "logps/chosen": -278.83251953125,
108
- "logps/rejected": -258.22943115234375,
109
- "loss": 0.3576,
110
- "rewards/accuracies": 0.878125011920929,
111
- "rewards/chosen": -0.19791364669799805,
112
- "rewards/margins": 6.015913486480713,
113
- "rewards/rejected": -6.213827133178711,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.38,
118
- "grad_norm": 302.58453941602346,
119
- "learning_rate": 3.923155588020165e-07,
120
- "logits/chosen": -0.5101458430290222,
121
- "logits/rejected": 0.430727481842041,
122
- "logps/chosen": -249.14736938476562,
123
- "logps/rejected": -227.1038055419922,
124
- "loss": 0.3598,
125
- "rewards/accuracies": 0.871874988079071,
126
- "rewards/chosen": 0.11757852882146835,
127
- "rewards/margins": 5.637847900390625,
128
- "rewards/rejected": -5.520269870758057,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.43,
133
- "grad_norm": 307.7393549252261,
134
- "learning_rate": 3.511075348989692e-07,
135
- "logits/chosen": -0.28342846035957336,
136
- "logits/rejected": 0.524147629737854,
137
- "logps/chosen": -275.883544921875,
138
- "logps/rejected": -249.2301788330078,
139
- "loss": 0.4166,
140
- "rewards/accuracies": 0.8500000238418579,
141
- "rewards/chosen": -1.1018508672714233,
142
- "rewards/margins": 5.499217510223389,
143
- "rewards/rejected": -6.601068019866943,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.49,
148
- "grad_norm": 307.7617928649449,
149
- "learning_rate": 3.062889851306735e-07,
150
- "logits/chosen": -0.23101525008678436,
151
- "logits/rejected": 0.594648003578186,
152
- "logps/chosen": -263.6214294433594,
153
- "logps/rejected": -237.5282440185547,
154
- "loss": 0.3926,
155
- "rewards/accuracies": 0.8687499761581421,
156
- "rewards/chosen": -0.46620288491249084,
157
- "rewards/margins": 5.611274719238281,
158
- "rewards/rejected": -6.077476978302002,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.54,
163
- "grad_norm": 263.0128253752098,
164
- "learning_rate": 2.594603691794176e-07,
165
- "logits/chosen": -0.35276657342910767,
166
- "logits/rejected": 0.5542042851448059,
167
- "logps/chosen": -268.01763916015625,
168
- "logps/rejected": -240.2915496826172,
169
- "loss": 0.3427,
170
- "rewards/accuracies": 0.831250011920929,
171
- "rewards/chosen": -0.9488876461982727,
172
- "rewards/margins": 5.2973761558532715,
173
- "rewards/rejected": -6.246264457702637,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.59,
178
- "grad_norm": 321.2698886355005,
179
- "learning_rate": 2.1229392570965654e-07,
180
- "logits/chosen": 0.03777565434575081,
181
- "logits/rejected": 0.7461130023002625,
182
- "logps/chosen": -262.41558837890625,
183
- "logps/rejected": -246.7896270751953,
184
- "loss": 0.3086,
185
- "rewards/accuracies": 0.871874988079071,
186
- "rewards/chosen": -1.0481889247894287,
187
- "rewards/margins": 5.13235330581665,
188
- "rewards/rejected": -6.180542945861816,
189
  "step": 110
190
  },
191
  {
192
- "epoch": 0.65,
193
- "grad_norm": 257.6145945098329,
194
- "learning_rate": 1.6647395712565254e-07,
195
- "logits/chosen": -0.36657968163490295,
196
- "logits/rejected": 0.6054766774177551,
197
- "logps/chosen": -276.4320373535156,
198
- "logps/rejected": -247.306640625,
199
- "loss": 0.3606,
200
- "rewards/accuracies": 0.8687499761581421,
201
- "rewards/chosen": -0.45442095398902893,
202
- "rewards/margins": 5.263146877288818,
203
- "rewards/rejected": -5.717567443847656,
204
  "step": 120
205
  },
206
  {
207
- "epoch": 0.7,
208
- "grad_norm": 224.99155866870328,
209
- "learning_rate": 1.2363668353585485e-07,
210
- "logits/chosen": -0.39301735162734985,
211
- "logits/rejected": 0.5232549905776978,
212
- "logps/chosen": -264.6087951660156,
213
- "logps/rejected": -236.9209442138672,
214
- "loss": 0.3158,
215
- "rewards/accuracies": 0.903124988079071,
216
- "rewards/chosen": -0.47178196907043457,
217
- "rewards/margins": 5.685537815093994,
218
- "rewards/rejected": -6.157320499420166,
219
  "step": 130
220
  },
221
  {
222
- "epoch": 0.76,
223
- "grad_norm": 234.77001103810687,
224
- "learning_rate": 8.53118137245516e-08,
225
- "logits/chosen": -0.21741196513175964,
226
- "logits/rejected": 0.6465774178504944,
227
- "logps/chosen": -270.5892028808594,
228
- "logps/rejected": -251.4233856201172,
229
- "loss": 0.2942,
230
- "rewards/accuracies": 0.8687499761581421,
231
- "rewards/chosen": -0.42987537384033203,
232
- "rewards/margins": 5.9034929275512695,
233
- "rewards/rejected": -6.33336877822876,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.81,
238
- "grad_norm": 289.2759977765359,
239
- "learning_rate": 5.2867919617408553e-08,
240
- "logits/chosen": -0.30728083848953247,
241
- "logits/rejected": 0.5963491201400757,
242
- "logps/chosen": -267.1575927734375,
243
- "logps/rejected": -238.6244659423828,
244
- "loss": 0.3253,
245
- "rewards/accuracies": 0.8812500238418579,
246
- "rewards/chosen": 0.4330620765686035,
247
- "rewards/margins": 5.231642723083496,
248
- "rewards/rejected": -4.798580646514893,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.86,
253
- "grad_norm": 230.9320948308424,
254
- "learning_rate": 2.7463564905650853e-08,
255
- "logits/chosen": -0.37829893827438354,
256
- "logits/rejected": 0.43174609541893005,
257
- "logps/chosen": -270.28094482421875,
258
- "logps/rejected": -246.8420867919922,
259
- "loss": 0.3039,
260
  "rewards/accuracies": 0.893750011920929,
261
- "rewards/chosen": 0.014125394634902477,
262
- "rewards/margins": 5.368089199066162,
263
- "rewards/rejected": -5.353963375091553,
264
  "step": 160
265
  },
266
  {
267
- "epoch": 0.92,
268
- "grad_norm": 253.43499472230928,
269
- "learning_rate": 1.0005933014019307e-08,
270
- "logits/chosen": -0.3070162534713745,
271
- "logits/rejected": 0.6451749205589294,
272
- "logps/chosen": -269.88385009765625,
273
- "logps/rejected": -247.8417205810547,
274
- "loss": 0.314,
275
- "rewards/accuracies": 0.8812500238418579,
276
- "rewards/chosen": -0.7552144527435303,
277
- "rewards/margins": 5.324892044067383,
278
- "rewards/rejected": -6.080106258392334,
279
  "step": 170
280
  },
281
  {
282
- "epoch": 0.97,
283
- "grad_norm": 264.72644139971993,
284
- "learning_rate": 1.1184317978602808e-09,
285
- "logits/chosen": -0.4602000117301941,
286
- "logits/rejected": 0.40653783082962036,
287
- "logps/chosen": -261.251220703125,
288
- "logps/rejected": -241.342529296875,
289
- "loss": 0.404,
290
- "rewards/accuracies": 0.9281250238418579,
291
- "rewards/chosen": -0.2922298312187195,
292
- "rewards/margins": 5.988170623779297,
293
- "rewards/rejected": -6.28040075302124,
294
  "step": 180
295
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  {
297
  "epoch": 1.0,
298
- "step": 185,
299
  "total_flos": 0.0,
300
- "train_loss": 0.37812867454580357,
301
- "train_runtime": 5319.5814,
302
- "train_samples_per_second": 8.892,
303
- "train_steps_per_second": 0.035
304
  }
305
  ],
306
  "logging_steps": 10,
307
- "max_steps": 185,
308
  "num_input_tokens_seen": 0,
309
  "num_train_epochs": 1,
310
  "save_steps": 100,
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "grad_norm": 24.862652137264853,
14
+ "learning_rate": 1.25e-08,
15
+ "logits/chosen": -0.5811702013015747,
16
+ "logits/rejected": -0.11655431985855103,
17
+ "logps/chosen": -351.5902099609375,
18
+ "logps/rejected": -240.969970703125,
19
+ "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.03,
28
+ "grad_norm": 23.69292682023629,
29
+ "learning_rate": 1.25e-07,
30
+ "logits/chosen": 0.26120826601982117,
31
+ "logits/rejected": 0.23706814646720886,
32
+ "logps/chosen": -333.1805419921875,
33
+ "logps/rejected": -244.67898559570312,
34
+ "loss": 0.6922,
35
+ "rewards/accuracies": 0.5173611044883728,
36
+ "rewards/chosen": 0.0021614907309412956,
37
+ "rewards/margins": 0.0021554920822381973,
38
+ "rewards/rejected": 5.998538654239383e-06,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.05,
43
+ "grad_norm": 18.203526649945516,
44
+ "learning_rate": 2.5e-07,
45
+ "logits/chosen": -0.017204787582159042,
46
+ "logits/rejected": 0.1991611272096634,
47
+ "logps/chosen": -320.430908203125,
48
+ "logps/rejected": -234.376220703125,
49
+ "loss": 0.669,
50
+ "rewards/accuracies": 0.737500011920929,
51
+ "rewards/chosen": 0.033605434000492096,
52
+ "rewards/margins": 0.04716432839632034,
53
+ "rewards/rejected": -0.01355889905244112,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.07,
58
+ "grad_norm": 10.096989474079606,
59
+ "learning_rate": 3.75e-07,
60
+ "logits/chosen": -0.2575300931930542,
61
+ "logits/rejected": -0.4580558240413666,
62
+ "logps/chosen": -300.87896728515625,
63
+ "logps/rejected": -255.5655517578125,
64
+ "loss": 0.5805,
65
+ "rewards/accuracies": 0.7749999761581421,
66
+ "rewards/chosen": 0.14600001275539398,
67
+ "rewards/margins": 0.2884979844093323,
68
+ "rewards/rejected": -0.14249801635742188,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.1,
73
+ "grad_norm": 9.68944337059453,
74
+ "learning_rate": 5e-07,
75
+ "logits/chosen": -0.6759181022644043,
76
+ "logits/rejected": -0.6345951557159424,
77
+ "logps/chosen": -317.50872802734375,
78
+ "logps/rejected": -302.39630126953125,
79
+ "loss": 0.4819,
80
+ "rewards/accuracies": 0.840624988079071,
81
+ "rewards/chosen": 0.07102981209754944,
82
+ "rewards/margins": 0.6418195366859436,
83
+ "rewards/rejected": -0.5707896947860718,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.12,
88
+ "grad_norm": 13.329379682299182,
89
+ "learning_rate": 4.990486745229364e-07,
90
+ "logits/chosen": -0.12263472378253937,
91
+ "logits/rejected": 0.44540151953697205,
92
+ "logps/chosen": -374.64556884765625,
93
+ "logps/rejected": -388.1717224121094,
94
+ "loss": 0.3966,
95
+ "rewards/accuracies": 0.800000011920929,
96
+ "rewards/chosen": -0.4549541473388672,
97
+ "rewards/margins": 1.0250240564346313,
98
+ "rewards/rejected": -1.4799782037734985,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.15,
103
+ "grad_norm": 17.333516248641253,
104
+ "learning_rate": 4.96201938253052e-07,
105
+ "logits/chosen": -0.30300790071487427,
106
+ "logits/rejected": 0.3122316002845764,
107
+ "logps/chosen": -394.78106689453125,
108
+ "logps/rejected": -432.4813537597656,
109
+ "loss": 0.3861,
110
+ "rewards/accuracies": 0.8187500238418579,
111
+ "rewards/chosen": -0.7015730142593384,
112
+ "rewards/margins": 1.1719900369644165,
113
+ "rewards/rejected": -1.8735630512237549,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.17,
118
+ "grad_norm": 15.677534908750197,
119
+ "learning_rate": 4.91481456572267e-07,
120
+ "logits/chosen": 0.7395630478858948,
121
+ "logits/rejected": 1.5376254320144653,
122
+ "logps/chosen": -425.17236328125,
123
+ "logps/rejected": -448.2694396972656,
124
+ "loss": 0.3474,
125
+ "rewards/accuracies": 0.831250011920929,
126
+ "rewards/chosen": -0.8609533309936523,
127
+ "rewards/margins": 1.3486477136611938,
128
+ "rewards/rejected": -2.2096011638641357,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.2,
133
+ "grad_norm": 17.182808543364636,
134
+ "learning_rate": 4.849231551964771e-07,
135
+ "logits/chosen": 2.598942995071411,
136
+ "logits/rejected": 3.4538092613220215,
137
+ "logps/chosen": -448.8929748535156,
138
+ "logps/rejected": -540.0630493164062,
139
+ "loss": 0.3215,
140
+ "rewards/accuracies": 0.890625,
141
+ "rewards/chosen": -1.3736767768859863,
142
+ "rewards/margins": 1.7528272867202759,
143
+ "rewards/rejected": -3.126504421234131,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.23,
148
+ "grad_norm": 16.648755569621386,
149
+ "learning_rate": 4.7657694675916247e-07,
150
+ "logits/chosen": 2.8463895320892334,
151
+ "logits/rejected": 3.732513427734375,
152
+ "logps/chosen": -496.74005126953125,
153
+ "logps/rejected": -623.58984375,
154
+ "loss": 0.3048,
155
+ "rewards/accuracies": 0.878125011920929,
156
+ "rewards/chosen": -1.7601783275604248,
157
+ "rewards/margins": 1.9939384460449219,
158
+ "rewards/rejected": -3.7541167736053467,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.25,
163
+ "grad_norm": 15.972608527062494,
164
+ "learning_rate": 4.6650635094610966e-07,
165
+ "logits/chosen": 2.0133347511291504,
166
+ "logits/rejected": 3.3279690742492676,
167
+ "logps/chosen": -554.5970458984375,
168
+ "logps/rejected": -683.0777587890625,
169
+ "loss": 0.2797,
170
+ "rewards/accuracies": 0.859375,
171
+ "rewards/chosen": -2.21871018409729,
172
+ "rewards/margins": 2.000453233718872,
173
+ "rewards/rejected": -4.219162940979004,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.28,
178
+ "grad_norm": 16.95927334748175,
179
+ "learning_rate": 4.5478801107224794e-07,
180
+ "logits/chosen": 2.1293346881866455,
181
+ "logits/rejected": 3.9433817863464355,
182
+ "logps/chosen": -545.55078125,
183
+ "logps/rejected": -698.3030395507812,
184
+ "loss": 0.2718,
185
+ "rewards/accuracies": 0.887499988079071,
186
+ "rewards/chosen": -2.248697280883789,
187
+ "rewards/margins": 2.459144353866577,
188
+ "rewards/rejected": -4.707841873168945,
189
  "step": 110
190
  },
191
  {
192
+ "epoch": 0.3,
193
+ "grad_norm": 15.769838259410646,
194
+ "learning_rate": 4.415111107797445e-07,
195
+ "logits/chosen": 2.2328364849090576,
196
+ "logits/rejected": 3.943868637084961,
197
+ "logps/chosen": -547.4822998046875,
198
+ "logps/rejected": -709.2218017578125,
199
+ "loss": 0.2597,
200
+ "rewards/accuracies": 0.859375,
201
+ "rewards/chosen": -2.386432409286499,
202
+ "rewards/margins": 2.306048631668091,
203
+ "rewards/rejected": -4.692481517791748,
204
  "step": 120
205
  },
206
  {
207
+ "epoch": 0.33,
208
+ "grad_norm": 16.240997635455848,
209
+ "learning_rate": 4.2677669529663686e-07,
210
+ "logits/chosen": 3.3713316917419434,
211
+ "logits/rejected": 4.970644950866699,
212
+ "logps/chosen": -669.5197143554688,
213
+ "logps/rejected": -839.8416748046875,
214
+ "loss": 0.2523,
215
+ "rewards/accuracies": 0.8687499761581421,
216
+ "rewards/chosen": -3.3710944652557373,
217
+ "rewards/margins": 2.5790421962738037,
218
+ "rewards/rejected": -5.950136661529541,
219
  "step": 130
220
  },
221
  {
222
+ "epoch": 0.35,
223
+ "grad_norm": 16.664869807154886,
224
+ "learning_rate": 4.106969024216348e-07,
225
+ "logits/chosen": 3.0220611095428467,
226
+ "logits/rejected": 4.610594749450684,
227
+ "logps/chosen": -647.0032958984375,
228
+ "logps/rejected": -834.1439208984375,
229
+ "loss": 0.2514,
230
+ "rewards/accuracies": 0.90625,
231
+ "rewards/chosen": -3.284292221069336,
232
+ "rewards/margins": 2.7818052768707275,
233
+ "rewards/rejected": -6.066097259521484,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.38,
238
+ "grad_norm": 17.103959159416473,
239
+ "learning_rate": 3.933941090877615e-07,
240
+ "logits/chosen": 1.9788957834243774,
241
+ "logits/rejected": 3.797266721725464,
242
+ "logps/chosen": -657.1544799804688,
243
+ "logps/rejected": -866.92236328125,
244
+ "loss": 0.2465,
245
+ "rewards/accuracies": 0.887499988079071,
246
+ "rewards/chosen": -3.3205840587615967,
247
+ "rewards/margins": 2.8848683834075928,
248
+ "rewards/rejected": -6.205452919006348,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.4,
253
+ "grad_norm": 22.71759647433438,
254
+ "learning_rate": 3.75e-07,
255
+ "logits/chosen": 2.598877429962158,
256
+ "logits/rejected": 3.922821044921875,
257
+ "logps/chosen": -650.6119995117188,
258
+ "logps/rejected": -860.2496337890625,
259
+ "loss": 0.2424,
260
  "rewards/accuracies": 0.893750011920929,
261
+ "rewards/chosen": -3.585509777069092,
262
+ "rewards/margins": 2.7439045906066895,
263
+ "rewards/rejected": -6.3294148445129395,
264
  "step": 160
265
  },
266
  {
267
+ "epoch": 0.42,
268
+ "grad_norm": 17.154680074008297,
269
+ "learning_rate": 3.5565456543517485e-07,
270
+ "logits/chosen": 1.2129310369491577,
271
+ "logits/rejected": 3.644993543624878,
272
+ "logps/chosen": -650.40576171875,
273
+ "logps/rejected": -869.5897216796875,
274
+ "loss": 0.245,
275
+ "rewards/accuracies": 0.8656250238418579,
276
+ "rewards/chosen": -3.348802089691162,
277
+ "rewards/margins": 3.0448169708251953,
278
+ "rewards/rejected": -6.393619537353516,
279
  "step": 170
280
  },
281
  {
282
+ "epoch": 0.45,
283
+ "grad_norm": 15.288786440112402,
284
+ "learning_rate": 3.355050358314172e-07,
285
+ "logits/chosen": 2.0979018211364746,
286
+ "logits/rejected": 3.6165339946746826,
287
+ "logps/chosen": -733.31298828125,
288
+ "logps/rejected": -946.0720825195312,
289
+ "loss": 0.225,
290
+ "rewards/accuracies": 0.909375011920929,
291
+ "rewards/chosen": -3.9488494396209717,
292
+ "rewards/margins": 3.014504909515381,
293
+ "rewards/rejected": -6.963354587554932,
294
  "step": 180
295
  },
296
+ {
297
+ "epoch": 0.47,
298
+ "grad_norm": 15.20089211524797,
299
+ "learning_rate": 3.147047612756302e-07,
300
+ "logits/chosen": 1.049578309059143,
301
+ "logits/rejected": 3.2230868339538574,
302
+ "logps/chosen": -655.8287963867188,
303
+ "logps/rejected": -909.56787109375,
304
+ "loss": 0.2177,
305
+ "rewards/accuracies": 0.8999999761581421,
306
+ "rewards/chosen": -3.187329053878784,
307
+ "rewards/margins": 3.4261791706085205,
308
+ "rewards/rejected": -6.613508701324463,
309
+ "step": 190
310
+ },
311
+ {
312
+ "epoch": 0.5,
313
+ "grad_norm": 19.21517389497067,
314
+ "learning_rate": 2.934120444167326e-07,
315
+ "logits/chosen": 2.0917961597442627,
316
+ "logits/rejected": 4.381856918334961,
317
+ "logps/chosen": -707.9210205078125,
318
+ "logps/rejected": -967.8511962890625,
319
+ "loss": 0.2291,
320
+ "rewards/accuracies": 0.875,
321
+ "rewards/chosen": -4.029782772064209,
322
+ "rewards/margins": 3.380286455154419,
323
+ "rewards/rejected": -7.410069465637207,
324
+ "step": 200
325
+ },
326
+ {
327
+ "epoch": 0.53,
328
+ "grad_norm": 17.876619392703006,
329
+ "learning_rate": 2.717889356869146e-07,
330
+ "logits/chosen": 2.075894832611084,
331
+ "logits/rejected": 3.812873363494873,
332
+ "logps/chosen": -664.9110717773438,
333
+ "logps/rejected": -898.7711791992188,
334
+ "loss": 0.2335,
335
+ "rewards/accuracies": 0.90625,
336
+ "rewards/chosen": -3.6399245262145996,
337
+ "rewards/margins": 3.0123069286346436,
338
+ "rewards/rejected": -6.652230739593506,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.55,
343
+ "grad_norm": 16.42311250323521,
344
+ "learning_rate": 2.5e-07,
345
+ "logits/chosen": 2.077141523361206,
346
+ "logits/rejected": 4.0336527824401855,
347
+ "logps/chosen": -714.0510864257812,
348
+ "logps/rejected": -951.3372802734375,
349
+ "loss": 0.2163,
350
+ "rewards/accuracies": 0.893750011920929,
351
+ "rewards/chosen": -4.000287055969238,
352
+ "rewards/margins": 3.160945415496826,
353
+ "rewards/rejected": -7.161231994628906,
354
+ "step": 220
355
+ },
356
+ {
357
+ "epoch": 0.57,
358
+ "grad_norm": 21.86260854020408,
359
+ "learning_rate": 2.2821106431308543e-07,
360
+ "logits/chosen": 1.8970081806182861,
361
+ "logits/rejected": 3.8517441749572754,
362
+ "logps/chosen": -711.104248046875,
363
+ "logps/rejected": -952.9786987304688,
364
+ "loss": 0.2307,
365
+ "rewards/accuracies": 0.875,
366
+ "rewards/chosen": -4.102777481079102,
367
+ "rewards/margins": 3.2044379711151123,
368
+ "rewards/rejected": -7.307215213775635,
369
+ "step": 230
370
+ },
371
+ {
372
+ "epoch": 0.6,
373
+ "grad_norm": 17.415535830140726,
374
+ "learning_rate": 2.065879555832674e-07,
375
+ "logits/chosen": 1.93063485622406,
376
+ "logits/rejected": 3.716691255569458,
377
+ "logps/chosen": -733.0238037109375,
378
+ "logps/rejected": -995.0330200195312,
379
+ "loss": 0.2135,
380
+ "rewards/accuracies": 0.8968750238418579,
381
+ "rewards/chosen": -4.205197811126709,
382
+ "rewards/margins": 3.2796833515167236,
383
+ "rewards/rejected": -7.4848809242248535,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.62,
388
+ "grad_norm": 23.48694643420195,
389
+ "learning_rate": 1.8529523872436977e-07,
390
+ "logits/chosen": 1.754500150680542,
391
+ "logits/rejected": 3.7942306995391846,
392
+ "logps/chosen": -756.137939453125,
393
+ "logps/rejected": -1002.9494018554688,
394
+ "loss": 0.2284,
395
+ "rewards/accuracies": 0.887499988079071,
396
+ "rewards/chosen": -4.309305667877197,
397
+ "rewards/margins": 3.231421947479248,
398
+ "rewards/rejected": -7.5407280921936035,
399
+ "step": 250
400
+ },
401
+ {
402
+ "epoch": 0.65,
403
+ "grad_norm": 16.97115932824073,
404
+ "learning_rate": 1.6449496416858282e-07,
405
+ "logits/chosen": 2.1520519256591797,
406
+ "logits/rejected": 4.258932590484619,
407
+ "logps/chosen": -750.8233642578125,
408
+ "logps/rejected": -1022.2374267578125,
409
+ "loss": 0.2136,
410
+ "rewards/accuracies": 0.8999999761581421,
411
+ "rewards/chosen": -4.413332939147949,
412
+ "rewards/margins": 3.5410499572753906,
413
+ "rewards/rejected": -7.954381465911865,
414
+ "step": 260
415
+ },
416
+ {
417
+ "epoch": 0.68,
418
+ "grad_norm": 20.50346815073402,
419
+ "learning_rate": 1.4434543456482518e-07,
420
+ "logits/chosen": 1.4445512294769287,
421
+ "logits/rejected": 3.574235439300537,
422
+ "logps/chosen": -682.8297729492188,
423
+ "logps/rejected": -979.3341674804688,
424
+ "loss": 0.1978,
425
+ "rewards/accuracies": 0.9156249761581421,
426
+ "rewards/chosen": -3.6296730041503906,
427
+ "rewards/margins": 3.7298316955566406,
428
+ "rewards/rejected": -7.359505653381348,
429
+ "step": 270
430
+ },
431
+ {
432
+ "epoch": 0.7,
433
+ "grad_norm": 16.82073433691609,
434
+ "learning_rate": 1.2500000000000005e-07,
435
+ "logits/chosen": 1.172753930091858,
436
+ "logits/rejected": 3.4942619800567627,
437
+ "logps/chosen": -675.3418579101562,
438
+ "logps/rejected": -974.0270385742188,
439
+ "loss": 0.2189,
440
+ "rewards/accuracies": 0.934374988079071,
441
+ "rewards/chosen": -3.6095943450927734,
442
+ "rewards/margins": 3.8261497020721436,
443
+ "rewards/rejected": -7.435744285583496,
444
+ "step": 280
445
+ },
446
+ {
447
+ "epoch": 0.72,
448
+ "grad_norm": 27.148115499609514,
449
+ "learning_rate": 1.0660589091223854e-07,
450
+ "logits/chosen": 1.3045436143875122,
451
+ "logits/rejected": 3.874147891998291,
452
+ "logps/chosen": -724.1644287109375,
453
+ "logps/rejected": -1020.5606689453125,
454
+ "loss": 0.2159,
455
+ "rewards/accuracies": 0.918749988079071,
456
+ "rewards/chosen": -3.9626336097717285,
457
+ "rewards/margins": 3.8746466636657715,
458
+ "rewards/rejected": -7.837281227111816,
459
+ "step": 290
460
+ },
461
+ {
462
+ "epoch": 0.75,
463
+ "grad_norm": 18.896563678409045,
464
+ "learning_rate": 8.930309757836516e-08,
465
+ "logits/chosen": 1.5070204734802246,
466
+ "logits/rejected": 3.8179619312286377,
467
+ "logps/chosen": -756.2379150390625,
468
+ "logps/rejected": -1010.8900146484375,
469
+ "loss": 0.2118,
470
+ "rewards/accuracies": 0.90625,
471
+ "rewards/chosen": -4.342662811279297,
472
+ "rewards/margins": 3.381243944168091,
473
+ "rewards/rejected": -7.72390604019165,
474
+ "step": 300
475
+ },
476
+ {
477
+ "epoch": 0.78,
478
+ "grad_norm": 18.302278636631012,
479
+ "learning_rate": 7.322330470336313e-08,
480
+ "logits/chosen": 2.013995885848999,
481
+ "logits/rejected": 4.006863117218018,
482
+ "logps/chosen": -755.84326171875,
483
+ "logps/rejected": -1039.4290771484375,
484
+ "loss": 0.2007,
485
+ "rewards/accuracies": 0.925000011920929,
486
+ "rewards/chosen": -4.390773296356201,
487
+ "rewards/margins": 3.617499589920044,
488
+ "rewards/rejected": -8.008273124694824,
489
+ "step": 310
490
+ },
491
+ {
492
+ "epoch": 0.8,
493
+ "grad_norm": 19.269057964941258,
494
+ "learning_rate": 5.848888922025552e-08,
495
+ "logits/chosen": 1.4602447748184204,
496
+ "logits/rejected": 3.709857940673828,
497
+ "logps/chosen": -771.56640625,
498
+ "logps/rejected": -1046.9013671875,
499
+ "loss": 0.2211,
500
+ "rewards/accuracies": 0.903124988079071,
501
+ "rewards/chosen": -4.392203330993652,
502
+ "rewards/margins": 3.6757659912109375,
503
+ "rewards/rejected": -8.067970275878906,
504
+ "step": 320
505
+ },
506
+ {
507
+ "epoch": 0.82,
508
+ "grad_norm": 16.212116381856944,
509
+ "learning_rate": 4.521198892775202e-08,
510
+ "logits/chosen": 1.5877026319503784,
511
+ "logits/rejected": 3.5275306701660156,
512
+ "logps/chosen": -748.9736328125,
513
+ "logps/rejected": -1030.6241455078125,
514
+ "loss": 0.1902,
515
+ "rewards/accuracies": 0.909375011920929,
516
+ "rewards/chosen": -4.198099613189697,
517
+ "rewards/margins": 3.724585771560669,
518
+ "rewards/rejected": -7.922685146331787,
519
+ "step": 330
520
+ },
521
+ {
522
+ "epoch": 0.85,
523
+ "grad_norm": 23.048514547738275,
524
+ "learning_rate": 3.349364905389032e-08,
525
+ "logits/chosen": 1.2227389812469482,
526
+ "logits/rejected": 3.192277193069458,
527
+ "logps/chosen": -744.6568603515625,
528
+ "logps/rejected": -1009.3792724609375,
529
+ "loss": 0.2061,
530
+ "rewards/accuracies": 0.893750011920929,
531
+ "rewards/chosen": -4.285494804382324,
532
+ "rewards/margins": 3.405372142791748,
533
+ "rewards/rejected": -7.690866947174072,
534
+ "step": 340
535
+ },
536
+ {
537
+ "epoch": 0.88,
538
+ "grad_norm": 18.4900810885199,
539
+ "learning_rate": 2.3423053240837514e-08,
540
+ "logits/chosen": 1.2598426342010498,
541
+ "logits/rejected": 3.358072280883789,
542
+ "logps/chosen": -737.2872314453125,
543
+ "logps/rejected": -1013.8024291992188,
544
+ "loss": 0.2239,
545
+ "rewards/accuracies": 0.871874988079071,
546
+ "rewards/chosen": -4.232865333557129,
547
+ "rewards/margins": 3.510840892791748,
548
+ "rewards/rejected": -7.743706703186035,
549
+ "step": 350
550
+ },
551
+ {
552
+ "epoch": 0.9,
553
+ "grad_norm": 16.48305540217272,
554
+ "learning_rate": 1.507684480352292e-08,
555
+ "logits/chosen": 1.188299298286438,
556
+ "logits/rejected": 3.3616530895233154,
557
+ "logps/chosen": -726.2432861328125,
558
+ "logps/rejected": -1018.1263427734375,
559
+ "loss": 0.1958,
560
+ "rewards/accuracies": 0.90625,
561
+ "rewards/chosen": -4.075113773345947,
562
+ "rewards/margins": 3.6948330402374268,
563
+ "rewards/rejected": -7.769946098327637,
564
+ "step": 360
565
+ },
566
+ {
567
+ "epoch": 0.93,
568
+ "grad_norm": 20.27854050236199,
569
+ "learning_rate": 8.518543427732949e-09,
570
+ "logits/chosen": 1.1218559741973877,
571
+ "logits/rejected": 3.376429319381714,
572
+ "logps/chosen": -706.4921875,
573
+ "logps/rejected": -1009.96533203125,
574
+ "loss": 0.199,
575
+ "rewards/accuracies": 0.9156249761581421,
576
+ "rewards/chosen": -4.078927516937256,
577
+ "rewards/margins": 3.7843894958496094,
578
+ "rewards/rejected": -7.863317966461182,
579
+ "step": 370
580
+ },
581
+ {
582
+ "epoch": 0.95,
583
+ "grad_norm": 21.44365501017452,
584
+ "learning_rate": 3.798061746947995e-09,
585
+ "logits/chosen": 1.2741193771362305,
586
+ "logits/rejected": 3.5388190746307373,
587
+ "logps/chosen": -727.7400512695312,
588
+ "logps/rejected": -996.4246215820312,
589
+ "loss": 0.2143,
590
+ "rewards/accuracies": 0.9281250238418579,
591
+ "rewards/chosen": -4.139595985412598,
592
+ "rewards/margins": 3.5642218589782715,
593
+ "rewards/rejected": -7.703817844390869,
594
+ "step": 380
595
+ },
596
+ {
597
+ "epoch": 0.97,
598
+ "grad_norm": 19.980658149789488,
599
+ "learning_rate": 9.513254770636137e-10,
600
+ "logits/chosen": 1.3589586019515991,
601
+ "logits/rejected": 3.3931171894073486,
602
+ "logps/chosen": -751.2428588867188,
603
+ "logps/rejected": -1019.2142333984375,
604
+ "loss": 0.2117,
605
+ "rewards/accuracies": 0.893750011920929,
606
+ "rewards/chosen": -4.217888832092285,
607
+ "rewards/margins": 3.668452739715576,
608
+ "rewards/rejected": -7.8863420486450195,
609
+ "step": 390
610
+ },
611
+ {
612
+ "epoch": 1.0,
613
+ "grad_norm": 19.223012927780225,
614
+ "learning_rate": 0.0,
615
+ "logits/chosen": 1.3647325038909912,
616
+ "logits/rejected": 3.651308536529541,
617
+ "logps/chosen": -756.8760986328125,
618
+ "logps/rejected": -1033.67333984375,
619
+ "loss": 0.2003,
620
+ "rewards/accuracies": 0.9125000238418579,
621
+ "rewards/chosen": -4.446074485778809,
622
+ "rewards/margins": 3.6355972290039062,
623
+ "rewards/rejected": -8.081671714782715,
624
+ "step": 400
625
+ },
626
  {
627
  "epoch": 1.0,
628
+ "step": 400,
629
  "total_flos": 0.0,
630
+ "train_loss": 0.2785977178812027,
631
+ "train_runtime": 11929.9898,
632
+ "train_samples_per_second": 8.58,
633
+ "train_steps_per_second": 0.034
634
  }
635
  ],
636
  "logging_steps": 10,
637
+ "max_steps": 400,
638
  "num_input_tokens_seen": 0,
639
  "num_train_epochs": 1,
640
  "save_steps": 100,