sfulay committed on
Commit
2932011
1 Parent(s): 8d28128

Model save

Browse files
README.md CHANGED
@@ -2,16 +2,10 @@
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - dpo
8
- - generated_from_trainer
9
  - trl
10
  - dpo
11
  - alignment-handbook
12
  - generated_from_trainer
13
- datasets:
14
- - HuggingFaceH4/ultrafeedback_binarized
15
  model-index:
16
  - name: zephyr-7b-dpo-full-ultrabin-high-margin
17
  results: []
@@ -22,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # zephyr-7b-dpo-full-ultrabin-high-margin
24
 
25
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.5565
28
- - Rewards/chosen: -0.6538
29
- - Rewards/rejected: -1.5442
30
- - Rewards/accuracies: 0.7578
31
- - Rewards/margins: 0.8904
32
- - Logps/rejected: -417.0811
33
- - Logps/chosen: -328.0004
34
- - Logits/rejected: 0.3127
35
- - Logits/chosen: -0.1625
36
 
37
  ## Model description
38
 
@@ -69,7 +63,8 @@ The following hyperparameters were used during training:
69
 
70
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
71
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
72
- | 0.3228 | 0.6969 | 100 | 0.5641 | -0.9601 | -1.8247 | 0.7539 | 0.8645 | -445.1261 | -358.6317 | 0.5872 | 0.2254 |
 
73
 
74
 
75
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
 
 
 
 
5
  - trl
6
  - dpo
7
  - alignment-handbook
8
  - generated_from_trainer
 
 
9
  model-index:
10
  - name: zephyr-7b-dpo-full-ultrabin-high-margin
11
  results: []
 
16
 
17
  # zephyr-7b-dpo-full-ultrabin-high-margin
18
 
19
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.5598
22
+ - Rewards/chosen: -0.6746
23
+ - Rewards/rejected: -1.5654
24
+ - Rewards/accuracies: 0.75
25
+ - Rewards/margins: 0.8907
26
+ - Logps/rejected: -419.1961
27
+ - Logps/chosen: -330.0835
28
+ - Logits/rejected: 0.2134
29
+ - Logits/chosen: -0.2417
30
 
31
  ## Model description
32
 
 
63
 
64
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 0.4719 | 0.3484 | 50 | 0.5899 | -0.3895 | -0.8981 | 0.7070 | 0.5086 | -352.4708 | -301.5678 | -1.9397 | -1.9963 |
67
+ | 0.3224 | 0.6969 | 100 | 0.5598 | -0.6746 | -1.5654 | 0.75 | 0.8907 | -419.1961 | -330.0835 | 0.2134 | -0.2417 |
68
 
69
 
70
  ### Framework versions
all_results.json CHANGED
@@ -14,9 +14,9 @@
14
  "eval_samples_per_second": 19.581,
15
  "eval_steps_per_second": 0.313,
16
  "total_flos": 0.0,
17
- "train_loss": 0.4296882511018873,
18
- "train_runtime": 3529.992,
19
  "train_samples": 18339,
20
- "train_samples_per_second": 5.195,
21
- "train_steps_per_second": 0.041
22
  }
 
14
  "eval_samples_per_second": 19.581,
15
  "eval_steps_per_second": 0.313,
16
  "total_flos": 0.0,
17
+ "train_loss": 0.42797933008287337,
18
+ "train_runtime": 3631.7108,
19
  "train_samples": 18339,
20
+ "train_samples_per_second": 5.05,
21
+ "train_steps_per_second": 0.039
22
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b55076072d3d71f9edcf9b1d358562348023a796ae519c1f0ce9da2573c74ceb
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2850ffba269c9beea7a27e656c559c5f876b38967c7a6b87d210c4d1b66e9185
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:954546cecc8cfffe662fdb29010a136c024abfbf677bfe49a3cc9eada0aae98f
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f82482a5b53c718a34153ce2321f5c28b587d7da9ba733d4b9e3b74bff5feb2
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83e63cfe024227217839bcad4c55250906141175fe511bc646f943e2c1c4fd98
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88875900874228283080cbb0b5f0f9444fc6aa58ee90c727879470f81eae1681
3
  size 4540516344
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9965156794425087,
3
  "total_flos": 0.0,
4
- "train_loss": 0.4296882511018873,
5
- "train_runtime": 3529.992,
6
  "train_samples": 18339,
7
- "train_samples_per_second": 5.195,
8
- "train_steps_per_second": 0.041
9
  }
 
1
  {
2
  "epoch": 0.9965156794425087,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.42797933008287337,
5
+ "train_runtime": 3631.7108,
6
  "train_samples": 18339,
7
+ "train_samples_per_second": 5.05,
8
+ "train_steps_per_second": 0.039
9
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.9965156794425087,
5
- "eval_steps": 100,
6
  "global_step": 143,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,238 +10,254 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.06968641114982578,
13
- "grad_norm": 8.870989685926396,
14
  "learning_rate": 3.333333333333333e-07,
15
- "logits/chosen": -2.5107316970825195,
16
- "logits/rejected": -2.459897994995117,
17
- "logps/chosen": -224.4440460205078,
18
- "logps/rejected": -205.32931518554688,
19
  "loss": 0.6925,
20
- "rewards/accuracies": 0.4937500059604645,
21
- "rewards/chosen": 0.0013574643526226282,
22
- "rewards/margins": 0.001965512754395604,
23
- "rewards/rejected": -0.0006080485763959587,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.13937282229965156,
28
- "grad_norm": 8.120676621682783,
29
  "learning_rate": 4.981198836496775e-07,
30
- "logits/chosen": -2.539044141769409,
31
- "logits/rejected": -2.4031107425689697,
32
- "logps/chosen": -231.701416015625,
33
- "logps/rejected": -203.5116729736328,
34
  "loss": 0.6753,
35
- "rewards/accuracies": 0.768750011920929,
36
- "rewards/chosen": 0.00874270312488079,
37
- "rewards/margins": 0.029630497097969055,
38
- "rewards/rejected": -0.020887792110443115,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.20905923344947736,
43
- "grad_norm": 10.53786145514599,
44
  "learning_rate": 4.832481997086846e-07,
45
- "logits/chosen": -2.5250794887542725,
46
- "logits/rejected": -2.4301116466522217,
47
- "logps/chosen": -225.29916381835938,
48
- "logps/rejected": -253.11544799804688,
49
  "loss": 0.6114,
50
  "rewards/accuracies": 0.856249988079071,
51
- "rewards/chosen": 0.030692869797348976,
52
- "rewards/margins": 0.19038814306259155,
53
- "rewards/rejected": -0.15969529747962952,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.2787456445993031,
58
- "grad_norm": 19.29362558041151,
59
  "learning_rate": 4.543962032878959e-07,
60
- "logits/chosen": -2.405980110168457,
61
- "logits/rejected": -2.3023486137390137,
62
- "logps/chosen": -241.8335418701172,
63
- "logps/rejected": -256.7768249511719,
64
- "loss": 0.5446,
65
  "rewards/accuracies": 0.8500000238418579,
66
- "rewards/chosen": -0.04428701475262642,
67
- "rewards/margins": 0.44735702872276306,
68
- "rewards/rejected": -0.4916439950466156,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.34843205574912894,
73
- "grad_norm": 21.034007556263575,
74
  "learning_rate": 4.1329321073844413e-07,
75
- "logits/chosen": -2.3216395378112793,
76
- "logits/rejected": -2.171525239944458,
77
- "logps/chosen": -253.8367919921875,
78
- "logps/rejected": -303.62744140625,
79
- "loss": 0.4742,
80
- "rewards/accuracies": 0.8062499761581421,
81
- "rewards/chosen": -0.29281625151634216,
82
- "rewards/margins": 0.6403177380561829,
83
- "rewards/rejected": -0.9331340789794922,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  "step": 50
85
  },
86
  {
87
  "epoch": 0.4181184668989547,
88
- "grad_norm": 28.77756194136436,
89
  "learning_rate": 3.624028324136517e-07,
90
- "logits/chosen": -1.4179085493087769,
91
- "logits/rejected": -1.13853919506073,
92
- "logps/chosen": -299.9506530761719,
93
- "logps/rejected": -362.7635803222656,
94
- "loss": 0.4053,
95
- "rewards/accuracies": 0.84375,
96
- "rewards/chosen": -0.5958151817321777,
97
- "rewards/margins": 0.9765597581863403,
98
- "rewards/rejected": -1.572374939918518,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 0.4878048780487805,
103
- "grad_norm": 24.40968960837773,
104
  "learning_rate": 3.047753100392174e-07,
105
- "logits/chosen": -0.7467012405395508,
106
- "logits/rejected": -0.3752829432487488,
107
- "logps/chosen": -266.78118896484375,
108
- "logps/rejected": -392.96392822265625,
109
- "loss": 0.364,
110
  "rewards/accuracies": 0.862500011920929,
111
- "rewards/chosen": -0.5415125489234924,
112
- "rewards/margins": 1.2262623310089111,
113
- "rewards/rejected": -1.7677749395370483,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5574912891986062,
118
- "grad_norm": 35.97929804339313,
119
  "learning_rate": 2.4386469286927194e-07,
120
- "logits/chosen": -0.13451911509037018,
121
- "logits/rejected": 0.4377119541168213,
122
- "logps/chosen": -296.5347595214844,
123
- "logps/rejected": -490.88177490234375,
124
- "loss": 0.3454,
125
- "rewards/accuracies": 0.8687499761581421,
126
- "rewards/chosen": -0.7803608179092407,
127
- "rewards/margins": 1.4910542964935303,
128
- "rewards/rejected": -2.2714149951934814,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 0.627177700348432,
133
- "grad_norm": 33.964744689691656,
134
  "learning_rate": 1.8332181063127542e-07,
135
- "logits/chosen": -0.3771997094154358,
136
- "logits/rejected": 0.5029958486557007,
137
- "logps/chosen": -298.05755615234375,
138
- "logps/rejected": -438.8995056152344,
139
  "loss": 0.3319,
140
- "rewards/accuracies": 0.90625,
141
- "rewards/chosen": -0.5154351592063904,
142
- "rewards/margins": 1.5300105810165405,
143
- "rewards/rejected": -2.0454459190368652,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 0.6968641114982579,
148
- "grad_norm": 28.73220407106371,
149
  "learning_rate": 1.26775451942554e-07,
150
- "logits/chosen": 0.3942970931529999,
151
- "logits/rejected": 1.039069414138794,
152
- "logps/chosen": -326.0721435546875,
153
- "logps/rejected": -470.18853759765625,
154
- "loss": 0.3228,
155
- "rewards/accuracies": 0.862500011920929,
156
- "rewards/chosen": -0.9078343510627747,
157
- "rewards/margins": 1.4802639484405518,
158
- "rewards/rejected": -2.3880982398986816,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 0.6968641114982579,
163
- "eval_logits/chosen": 0.2254416048526764,
164
- "eval_logits/rejected": 0.5872303247451782,
165
- "eval_logps/chosen": -358.6317443847656,
166
- "eval_logps/rejected": -445.1260681152344,
167
- "eval_loss": 0.5641274452209473,
168
- "eval_rewards/accuracies": 0.75390625,
169
- "eval_rewards/chosen": -0.960127592086792,
170
- "eval_rewards/margins": 0.8645257353782654,
171
- "eval_rewards/rejected": -1.824653148651123,
172
- "eval_runtime": 102.0937,
173
- "eval_samples_per_second": 19.59,
174
- "eval_steps_per_second": 0.313,
175
  "step": 100
176
  },
177
  {
178
  "epoch": 0.7665505226480837,
179
- "grad_norm": 35.093388243905714,
180
  "learning_rate": 7.761486381573326e-08,
181
- "logits/chosen": 0.3774252235889435,
182
- "logits/rejected": 1.4682767391204834,
183
- "logps/chosen": -346.6317443847656,
184
- "logps/rejected": -460.89837646484375,
185
- "loss": 0.3193,
186
  "rewards/accuracies": 0.8687499761581421,
187
- "rewards/chosen": -0.9842090606689453,
188
- "rewards/margins": 1.5870163440704346,
189
- "rewards/rejected": -2.57122540473938,
190
  "step": 110
191
  },
192
  {
193
  "epoch": 0.8362369337979094,
194
- "grad_norm": 33.95663558990993,
195
  "learning_rate": 3.878660868757322e-08,
196
- "logits/chosen": 0.2716614603996277,
197
- "logits/rejected": 1.619855284690857,
198
- "logps/chosen": -330.23992919921875,
199
- "logps/rejected": -440.49334716796875,
200
- "loss": 0.309,
201
  "rewards/accuracies": 0.8812500238418579,
202
- "rewards/chosen": -0.8578445315361023,
203
- "rewards/margins": 1.694758415222168,
204
- "rewards/rejected": -2.552602529525757,
205
  "step": 120
206
  },
207
  {
208
  "epoch": 0.9059233449477352,
209
- "grad_norm": 29.086065504687443,
210
  "learning_rate": 1.261795485174083e-08,
211
- "logits/chosen": 0.13383683562278748,
212
- "logits/rejected": 1.3482104539871216,
213
- "logps/chosen": -283.5831604003906,
214
- "logps/rejected": -445.7456970214844,
215
- "loss": 0.3327,
216
  "rewards/accuracies": 0.918749988079071,
217
- "rewards/chosen": -0.6794359683990479,
218
- "rewards/margins": 1.8727607727050781,
219
- "rewards/rejected": -2.552196741104126,
220
  "step": 130
221
  },
222
  {
223
  "epoch": 0.975609756097561,
224
- "grad_norm": 32.68843173564522,
225
  "learning_rate": 6.773858303274482e-10,
226
- "logits/chosen": 0.3474978804588318,
227
- "logits/rejected": 1.376219630241394,
228
- "logps/chosen": -289.83441162109375,
229
- "logps/rejected": -441.0291442871094,
230
- "loss": 0.3297,
231
- "rewards/accuracies": 0.90625,
232
- "rewards/chosen": -0.7964991331100464,
233
- "rewards/margins": 1.6330715417861938,
234
- "rewards/rejected": -2.4295706748962402,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.9965156794425087,
239
  "step": 143,
240
  "total_flos": 0.0,
241
- "train_loss": 0.4296882511018873,
242
- "train_runtime": 3529.992,
243
- "train_samples_per_second": 5.195,
244
- "train_steps_per_second": 0.041
245
  }
246
  ],
247
  "logging_steps": 10,
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.9965156794425087,
5
+ "eval_steps": 50,
6
  "global_step": 143,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.06968641114982578,
13
+ "grad_norm": 8.879917985004713,
14
  "learning_rate": 3.333333333333333e-07,
15
+ "logits/chosen": -2.5109400749206543,
16
+ "logits/rejected": -2.4602229595184326,
17
+ "logps/chosen": -224.4826202392578,
18
+ "logps/rejected": -205.3321075439453,
19
  "loss": 0.6925,
20
+ "rewards/accuracies": 0.512499988079071,
21
+ "rewards/chosen": 0.0009716759668663144,
22
+ "rewards/margins": 0.001607558922842145,
23
+ "rewards/rejected": -0.0006358829559758306,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.13937282229965156,
28
+ "grad_norm": 8.119644042984005,
29
  "learning_rate": 4.981198836496775e-07,
30
+ "logits/chosen": -2.538778305053711,
31
+ "logits/rejected": -2.402923583984375,
32
+ "logps/chosen": -231.76394653320312,
33
+ "logps/rejected": -203.54684448242188,
34
  "loss": 0.6753,
35
+ "rewards/accuracies": 0.7875000238418579,
36
+ "rewards/chosen": 0.008117455057799816,
37
+ "rewards/margins": 0.029356980696320534,
38
+ "rewards/rejected": -0.02123952843248844,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.20905923344947736,
43
+ "grad_norm": 10.589687412392658,
44
  "learning_rate": 4.832481997086846e-07,
45
+ "logits/chosen": -2.5253872871398926,
46
+ "logits/rejected": -2.430739402770996,
47
+ "logps/chosen": -225.355712890625,
48
+ "logps/rejected": -253.0909881591797,
49
  "loss": 0.6114,
50
  "rewards/accuracies": 0.856249988079071,
51
+ "rewards/chosen": 0.030127260833978653,
52
+ "rewards/margins": 0.18957777321338654,
53
+ "rewards/rejected": -0.15945051610469818,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.2787456445993031,
58
+ "grad_norm": 19.230572297481732,
59
  "learning_rate": 4.543962032878959e-07,
60
+ "logits/chosen": -2.4045207500457764,
61
+ "logits/rejected": -2.3013105392456055,
62
+ "logps/chosen": -241.72854614257812,
63
+ "logps/rejected": -256.7030944824219,
64
+ "loss": 0.5447,
65
  "rewards/accuracies": 0.8500000238418579,
66
+ "rewards/chosen": -0.04323701187968254,
67
+ "rewards/margins": 0.447670042514801,
68
+ "rewards/rejected": -0.4909070134162903,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.34843205574912894,
73
+ "grad_norm": 23.777968472263748,
74
  "learning_rate": 4.1329321073844413e-07,
75
+ "logits/chosen": -2.2776694297790527,
76
+ "logits/rejected": -2.1192574501037598,
77
+ "logps/chosen": -255.7515411376953,
78
+ "logps/rejected": -307.02410888671875,
79
+ "loss": 0.4719,
80
+ "rewards/accuracies": 0.793749988079071,
81
+ "rewards/chosen": -0.31196385622024536,
82
+ "rewards/margins": 0.6551374197006226,
83
+ "rewards/rejected": -0.9671012163162231,
84
+ "step": 50
85
+ },
86
+ {
87
+ "epoch": 0.34843205574912894,
88
+ "eval_logits/chosen": -1.996337890625,
89
+ "eval_logits/rejected": -1.9396870136260986,
90
+ "eval_logps/chosen": -301.5677795410156,
91
+ "eval_logps/rejected": -352.47076416015625,
92
+ "eval_loss": 0.5898596048355103,
93
+ "eval_rewards/accuracies": 0.70703125,
94
+ "eval_rewards/chosen": -0.3894880414009094,
95
+ "eval_rewards/margins": 0.5086125135421753,
96
+ "eval_rewards/rejected": -0.8981005549430847,
97
+ "eval_runtime": 102.5575,
98
+ "eval_samples_per_second": 19.501,
99
+ "eval_steps_per_second": 0.312,
100
  "step": 50
101
  },
102
  {
103
  "epoch": 0.4181184668989547,
104
+ "grad_norm": 33.164641300897365,
105
  "learning_rate": 3.624028324136517e-07,
106
+ "logits/chosen": -1.2023751735687256,
107
+ "logits/rejected": -0.8880468606948853,
108
+ "logps/chosen": -297.7267150878906,
109
+ "logps/rejected": -362.5870056152344,
110
+ "loss": 0.4024,
111
+ "rewards/accuracies": 0.8500000238418579,
112
+ "rewards/chosen": -0.5735751986503601,
113
+ "rewards/margins": 0.9970341920852661,
114
+ "rewards/rejected": -1.570609211921692,
115
  "step": 60
116
  },
117
  {
118
  "epoch": 0.4878048780487805,
119
+ "grad_norm": 26.00066903060501,
120
  "learning_rate": 3.047753100392174e-07,
121
+ "logits/chosen": -0.6797115802764893,
122
+ "logits/rejected": -0.30923840403556824,
123
+ "logps/chosen": -267.3857116699219,
124
+ "logps/rejected": -395.60101318359375,
125
+ "loss": 0.362,
126
  "rewards/accuracies": 0.862500011920929,
127
+ "rewards/chosen": -0.5475583076477051,
128
+ "rewards/margins": 1.2465879917144775,
129
+ "rewards/rejected": -1.7941462993621826,
130
  "step": 70
131
  },
132
  {
133
  "epoch": 0.5574912891986062,
134
+ "grad_norm": 36.007825212896435,
135
  "learning_rate": 2.4386469286927194e-07,
136
+ "logits/chosen": -0.32210594415664673,
137
+ "logits/rejected": 0.29763275384902954,
138
+ "logps/chosen": -286.4994812011719,
139
+ "logps/rejected": -484.10357666015625,
140
+ "loss": 0.3448,
141
+ "rewards/accuracies": 0.8812500238418579,
142
+ "rewards/chosen": -0.6800082921981812,
143
+ "rewards/margins": 1.5236244201660156,
144
+ "rewards/rejected": -2.2036328315734863,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 0.627177700348432,
149
+ "grad_norm": 28.429320380976726,
150
  "learning_rate": 1.8332181063127542e-07,
151
+ "logits/chosen": -0.29179516434669495,
152
+ "logits/rejected": 0.5656725168228149,
153
+ "logps/chosen": -315.7966613769531,
154
+ "logps/rejected": -458.8309631347656,
155
  "loss": 0.3319,
156
+ "rewards/accuracies": 0.893750011920929,
157
+ "rewards/chosen": -0.6928261518478394,
158
+ "rewards/margins": 1.5519336462020874,
159
+ "rewards/rejected": -2.244760036468506,
160
  "step": 90
161
  },
162
  {
163
  "epoch": 0.6968641114982579,
164
+ "grad_norm": 29.97250123159769,
165
  "learning_rate": 1.26775451942554e-07,
166
+ "logits/chosen": -0.13269878923892975,
167
+ "logits/rejected": 0.6727190017700195,
168
+ "logps/chosen": -300.4376220703125,
169
+ "logps/rejected": -451.68609619140625,
170
+ "loss": 0.3224,
171
+ "rewards/accuracies": 0.8687499761581421,
172
+ "rewards/chosen": -0.6514891982078552,
173
+ "rewards/margins": 1.551584243774414,
174
+ "rewards/rejected": -2.203073501586914,
175
  "step": 100
176
  },
177
  {
178
  "epoch": 0.6968641114982579,
179
+ "eval_logits/chosen": -0.2416563630104065,
180
+ "eval_logits/rejected": 0.21337364614009857,
181
+ "eval_logps/chosen": -330.0835266113281,
182
+ "eval_logps/rejected": -419.1961364746094,
183
+ "eval_loss": 0.5598118305206299,
184
+ "eval_rewards/accuracies": 0.75,
185
+ "eval_rewards/chosen": -0.6746450662612915,
186
+ "eval_rewards/margins": 0.8907086849212646,
187
+ "eval_rewards/rejected": -1.5653537511825562,
188
+ "eval_runtime": 100.9616,
189
+ "eval_samples_per_second": 19.81,
190
+ "eval_steps_per_second": 0.317,
191
  "step": 100
192
  },
193
  {
194
  "epoch": 0.7665505226480837,
195
+ "grad_norm": 26.411152004320307,
196
  "learning_rate": 7.761486381573326e-08,
197
+ "logits/chosen": 0.1635294407606125,
198
+ "logits/rejected": 1.3954848051071167,
199
+ "logps/chosen": -325.93487548828125,
200
+ "logps/rejected": -446.07916259765625,
201
+ "loss": 0.3142,
202
  "rewards/accuracies": 0.8687499761581421,
203
+ "rewards/chosen": -0.7772396206855774,
204
+ "rewards/margins": 1.6457939147949219,
205
+ "rewards/rejected": -2.4230334758758545,
206
  "step": 110
207
  },
208
  {
209
  "epoch": 0.8362369337979094,
210
+ "grad_norm": 32.75502406597345,
211
  "learning_rate": 3.878660868757322e-08,
212
+ "logits/chosen": 0.5776845216751099,
213
+ "logits/rejected": 1.9672679901123047,
214
+ "logps/chosen": -338.0449523925781,
215
+ "logps/rejected": -449.5287170410156,
216
+ "loss": 0.3042,
217
  "rewards/accuracies": 0.8812500238418579,
218
+ "rewards/chosen": -0.9358948469161987,
219
+ "rewards/margins": 1.7070610523223877,
220
+ "rewards/rejected": -2.642955780029297,
221
  "step": 120
222
  },
223
  {
224
  "epoch": 0.9059233449477352,
225
+ "grad_norm": 34.08869226634673,
226
  "learning_rate": 1.261795485174083e-08,
227
+ "logits/chosen": 0.4366391599178314,
228
+ "logits/rejected": 1.6738389730453491,
229
+ "logps/chosen": -295.5234375,
230
+ "logps/rejected": -462.7455139160156,
231
+ "loss": 0.3275,
232
  "rewards/accuracies": 0.918749988079071,
233
+ "rewards/chosen": -0.7988389730453491,
234
+ "rewards/margins": 1.923356294631958,
235
+ "rewards/rejected": -2.7221951484680176,
236
  "step": 130
237
  },
238
  {
239
  "epoch": 0.975609756097561,
240
+ "grad_norm": 33.23571195007149,
241
  "learning_rate": 6.773858303274482e-10,
242
+ "logits/chosen": 0.5510319471359253,
243
+ "logits/rejected": 1.6277908086776733,
244
+ "logps/chosen": -300.76043701171875,
245
+ "logps/rejected": -455.19793701171875,
246
+ "loss": 0.3272,
247
+ "rewards/accuracies": 0.887499988079071,
248
+ "rewards/chosen": -0.9057596325874329,
249
+ "rewards/margins": 1.6654990911483765,
250
+ "rewards/rejected": -2.571258544921875,
251
  "step": 140
252
  },
253
  {
254
  "epoch": 0.9965156794425087,
255
  "step": 143,
256
  "total_flos": 0.0,
257
+ "train_loss": 0.42797933008287337,
258
+ "train_runtime": 3631.7108,
259
+ "train_samples_per_second": 5.05,
260
+ "train_steps_per_second": 0.039
261
  }
262
  ],
263
  "logging_steps": 10,