btherien committed on
Commit
22385c4
1 Parent(s): b236d16

Model save

Browse files
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: btherien/410M_it-132366_Tr-slim-pajama-300B-replay5_finetune_sft-full
3
+ tags:
4
+ - trl
5
+ - dpo
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: 410M_it-132366_Tr-slim-pajama-300B-replay5_finetune_dpo-full
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # 410M_it-132366_Tr-slim-pajama-300B-replay5_finetune_dpo-full
16
+
17
+ This model is a fine-tuned version of [btherien/410M_it-132366_Tr-slim-pajama-300B-replay5_finetune_sft-full](https://huggingface.co/btherien/410M_it-132366_Tr-slim-pajama-300B-replay5_finetune_sft-full) on an unspecified dataset (the dataset name was not recorded by the Trainer).
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.6500
20
+ - Rewards/chosen: -0.4023
21
+ - Rewards/rejected: -0.5547
22
+ - Rewards/accuracies: 0.6171
23
+ - Rewards/margins: 0.1514
24
+ - Logps/rejected: -420.0
25
+ - Logps/chosen: -464.0
26
+ - Logits/rejected: -1.3828
27
+ - Logits/chosen: -1.4297
28
+
29
+ ## Model description
30
+
31
+ More information needed
32
+
33
+ ## Intended uses & limitations
34
+
35
+ More information needed
36
+
37
+ ## Training and evaluation data
38
+
39
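+ More information needed. The dataset is not named in this card; the training run reports 61,135 training samples and 2,000 evaluation samples.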
40
+
41
+ ## Training procedure
42
+
43
+ ### Training hyperparameters
44
+
45
+ The following hyperparameters were used during training:
46
+ - learning_rate: 5e-07
47
+ - train_batch_size: 8
48
+ - eval_batch_size: 8
49
+ - seed: 42
50
+ - distributed_type: multi-GPU
51
+ - num_devices: 4
52
+ - gradient_accumulation_steps: 4
53
+ - total_train_batch_size: 128
54
+ - total_eval_batch_size: 32
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: cosine
57
+ - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 1
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.6867 | 0.21 | 100 | 0.6853 | -0.0056 | -0.0271 | 0.5556 | 0.0216 | -368.0 | -424.0 | -1.1953 | -1.2422 |
65
+ | 0.6632 | 0.42 | 200 | 0.6625 | -0.2285 | -0.3203 | 0.625 | 0.0913 | -396.0 | -446.0 | -1.3203 | -1.3672 |
66
+ | 0.6493 | 0.63 | 300 | 0.6532 | -0.3555 | -0.4902 | 0.625 | 0.1348 | -412.0 | -460.0 | -1.3516 | -1.3984 |
67
+ | 0.6462 | 0.84 | 400 | 0.6495 | -0.4004 | -0.5547 | 0.6389 | 0.1553 | -420.0 | -464.0 | -1.375 | -1.4219 |
68
+
69
+
70
+ ### Framework versions
71
+
72
+ - Transformers 4.36.2
73
+ - Pytorch 2.0.1+cu117
74
+ - Datasets 2.14.6
75
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -1.4296875,
4
+ "eval_logits/rejected": -1.3828125,
5
+ "eval_logps/chosen": -464.0,
6
+ "eval_logps/rejected": -420.0,
7
+ "eval_loss": 0.6500468850135803,
8
+ "eval_rewards/accuracies": 0.6170634627342224,
9
+ "eval_rewards/chosen": -0.40234375,
10
+ "eval_rewards/margins": 0.1513671875,
11
+ "eval_rewards/rejected": -0.5546875,
12
+ "eval_runtime": 52.2116,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 38.306,
15
+ "eval_steps_per_second": 1.207,
16
+ "train_loss": 0.6625334221861897,
17
+ "train_runtime": 3684.7873,
18
+ "train_samples": 61135,
19
+ "train_samples_per_second": 16.591,
20
+ "train_steps_per_second": 0.129
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -1.4296875,
4
+ "eval_logits/rejected": -1.3828125,
5
+ "eval_logps/chosen": -464.0,
6
+ "eval_logps/rejected": -420.0,
7
+ "eval_loss": 0.6500468850135803,
8
+ "eval_rewards/accuracies": 0.6170634627342224,
9
+ "eval_rewards/chosen": -0.40234375,
10
+ "eval_rewards/margins": 0.1513671875,
11
+ "eval_rewards/rejected": -0.5546875,
12
+ "eval_runtime": 52.2116,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 38.306,
15
+ "eval_steps_per_second": 1.207
16
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "transformers_version": "4.36.2"
6
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c010cc216d639363d360459699055dced12162ba441f99f28981bda95eca768
3
  size 810702192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c950b37cfaf9ca6200ddf854aaadf4209162d3815229ce993a50bfe535102b63
3
  size 810702192
runs/Mar05_13-08-21_u124757/events.out.tfevents.1709673156.u124757.331914.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cff8fe006454004d8eb3f74bb37b4121a1de00e408b2bf4a738a619d7c02c34
3
- size 33586
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9036f83ba7cb8b28ec6e7a2139264780af9b8f6fd70d8e3055b4e83ff5458862
3
+ size 38378
runs/Mar05_13-08-21_u124757/events.out.tfevents.1709676893.u124757.331914.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b31053bf0be366a280df5e4b42edf2bf15645a51a1043978676362698508e821
3
+ size 828
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.6625334221861897,
4
+ "train_runtime": 3684.7873,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 16.591,
7
+ "train_steps_per_second": 0.129
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9984301412872841,
5
+ "eval_steps": 100,
6
+ "global_step": 477,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 1.0416666666666666e-08,
14
+ "logits/chosen": -1.375,
15
+ "logits/rejected": -1.21875,
16
+ "logps/chosen": -404.0,
17
+ "logps/rejected": -374.0,
18
+ "loss": 0.6914,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.02,
27
+ "learning_rate": 1.0416666666666667e-07,
28
+ "logits/chosen": -1.1875,
29
+ "logits/rejected": -1.15625,
30
+ "logps/chosen": -394.0,
31
+ "logps/rejected": -334.0,
32
+ "loss": 0.6944,
33
+ "rewards/accuracies": 0.34375,
34
+ "rewards/chosen": -0.0024261474609375,
35
+ "rewards/margins": -0.004364013671875,
36
+ "rewards/rejected": 0.001922607421875,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.04,
41
+ "learning_rate": 2.0833333333333333e-07,
42
+ "logits/chosen": -1.2109375,
43
+ "logits/rejected": -1.171875,
44
+ "logps/chosen": -412.0,
45
+ "logps/rejected": -342.0,
46
+ "loss": 0.6931,
47
+ "rewards/accuracies": 0.47187501192092896,
48
+ "rewards/chosen": 0.000743865966796875,
49
+ "rewards/margins": 0.003692626953125,
50
+ "rewards/rejected": -0.0029449462890625,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.06,
55
+ "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": -1.2109375,
57
+ "logits/rejected": -1.2265625,
58
+ "logps/chosen": -432.0,
59
+ "logps/rejected": -372.0,
60
+ "loss": 0.6923,
61
+ "rewards/accuracies": 0.45625001192092896,
62
+ "rewards/chosen": 0.002197265625,
63
+ "rewards/margins": 0.003204345703125,
64
+ "rewards/rejected": -0.0009918212890625,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.08,
69
+ "learning_rate": 4.1666666666666667e-07,
70
+ "logits/chosen": -1.2421875,
71
+ "logits/rejected": -1.21875,
72
+ "logps/chosen": -444.0,
73
+ "logps/rejected": -368.0,
74
+ "loss": 0.6933,
75
+ "rewards/accuracies": 0.46875,
76
+ "rewards/chosen": -0.0019073486328125,
77
+ "rewards/margins": 9.357929229736328e-06,
78
+ "rewards/rejected": -0.00191497802734375,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.1,
83
+ "learning_rate": 4.999731868769026e-07,
84
+ "logits/chosen": -1.1953125,
85
+ "logits/rejected": -1.15625,
86
+ "logps/chosen": -432.0,
87
+ "logps/rejected": -384.0,
88
+ "loss": 0.6918,
89
+ "rewards/accuracies": 0.4906249940395355,
90
+ "rewards/chosen": 0.000812530517578125,
91
+ "rewards/margins": 0.00439453125,
92
+ "rewards/rejected": -0.0035858154296875,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.13,
97
+ "learning_rate": 4.990353313429303e-07,
98
+ "logits/chosen": -1.1484375,
99
+ "logits/rejected": -1.078125,
100
+ "logps/chosen": -400.0,
101
+ "logps/rejected": -308.0,
102
+ "loss": 0.6915,
103
+ "rewards/accuracies": 0.484375,
104
+ "rewards/chosen": 0.0038299560546875,
105
+ "rewards/margins": 0.00457763671875,
106
+ "rewards/rejected": -0.000759124755859375,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.15,
111
+ "learning_rate": 4.967625656594781e-07,
112
+ "logits/chosen": -1.203125,
113
+ "logits/rejected": -1.203125,
114
+ "logps/chosen": -406.0,
115
+ "logps/rejected": -336.0,
116
+ "loss": 0.6907,
117
+ "rewards/accuracies": 0.4625000059604645,
118
+ "rewards/chosen": -0.00010776519775390625,
119
+ "rewards/margins": 0.002593994140625,
120
+ "rewards/rejected": -0.002685546875,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.17,
125
+ "learning_rate": 4.93167072587771e-07,
126
+ "logits/chosen": -1.2265625,
127
+ "logits/rejected": -1.2265625,
128
+ "logps/chosen": -416.0,
129
+ "logps/rejected": -394.0,
130
+ "loss": 0.6907,
131
+ "rewards/accuracies": 0.45625001192092896,
132
+ "rewards/chosen": -0.000720977783203125,
133
+ "rewards/margins": 0.00144195556640625,
134
+ "rewards/rejected": -0.002166748046875,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.19,
139
+ "learning_rate": 4.882681251368548e-07,
140
+ "logits/chosen": -1.171875,
141
+ "logits/rejected": -1.1796875,
142
+ "logps/chosen": -382.0,
143
+ "logps/rejected": -324.0,
144
+ "loss": 0.6871,
145
+ "rewards/accuracies": 0.546875,
146
+ "rewards/chosen": 0.0035858154296875,
147
+ "rewards/margins": 0.01318359375,
148
+ "rewards/rejected": -0.0096435546875,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.21,
153
+ "learning_rate": 4.820919832540181e-07,
154
+ "logits/chosen": -1.28125,
155
+ "logits/rejected": -1.171875,
156
+ "logps/chosen": -420.0,
157
+ "logps/rejected": -372.0,
158
+ "loss": 0.6867,
159
+ "rewards/accuracies": 0.5625,
160
+ "rewards/chosen": -0.00103759765625,
161
+ "rewards/margins": 0.015625,
162
+ "rewards/rejected": -0.0167236328125,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.21,
167
+ "eval_logits/chosen": -1.2421875,
168
+ "eval_logits/rejected": -1.1953125,
169
+ "eval_logps/chosen": -424.0,
170
+ "eval_logps/rejected": -368.0,
171
+ "eval_loss": 0.6852812767028809,
172
+ "eval_rewards/accuracies": 0.5555555820465088,
173
+ "eval_rewards/chosen": -0.005645751953125,
174
+ "eval_rewards/margins": 0.0216064453125,
175
+ "eval_rewards/rejected": -0.027099609375,
176
+ "eval_runtime": 51.3914,
177
+ "eval_samples_per_second": 38.917,
178
+ "eval_steps_per_second": 1.226,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.23,
183
+ "learning_rate": 4.7467175306295647e-07,
184
+ "logits/chosen": -1.234375,
185
+ "logits/rejected": -1.21875,
186
+ "logps/chosen": -448.0,
187
+ "logps/rejected": -366.0,
188
+ "loss": 0.6836,
189
+ "rewards/accuracies": 0.5625,
190
+ "rewards/chosen": -0.0045166015625,
191
+ "rewards/margins": 0.0247802734375,
192
+ "rewards/rejected": -0.029296875,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.25,
197
+ "learning_rate": 4.6604720940421207e-07,
198
+ "logits/chosen": -1.203125,
199
+ "logits/rejected": -1.140625,
200
+ "logps/chosen": -410.0,
201
+ "logps/rejected": -332.0,
202
+ "loss": 0.6802,
203
+ "rewards/accuracies": 0.578125,
204
+ "rewards/chosen": -0.0155029296875,
205
+ "rewards/margins": 0.025390625,
206
+ "rewards/rejected": -0.040771484375,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.27,
211
+ "learning_rate": 4.5626458262912735e-07,
212
+ "logits/chosen": -1.265625,
213
+ "logits/rejected": -1.2265625,
214
+ "logps/chosen": -404.0,
215
+ "logps/rejected": -380.0,
216
+ "loss": 0.6806,
217
+ "rewards/accuracies": 0.5718749761581421,
218
+ "rewards/chosen": -0.04638671875,
219
+ "rewards/margins": 0.0257568359375,
220
+ "rewards/rejected": -0.072265625,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.29,
225
+ "learning_rate": 4.453763107901675e-07,
226
+ "logits/chosen": -1.265625,
227
+ "logits/rejected": -1.2578125,
228
+ "logps/chosen": -454.0,
229
+ "logps/rejected": -388.0,
230
+ "loss": 0.6776,
231
+ "rewards/accuracies": 0.574999988079071,
232
+ "rewards/chosen": -0.0654296875,
233
+ "rewards/margins": 0.037841796875,
234
+ "rewards/rejected": -0.10302734375,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.31,
239
+ "learning_rate": 4.3344075855595097e-07,
240
+ "logits/chosen": -1.2421875,
241
+ "logits/rejected": -1.234375,
242
+ "logps/chosen": -420.0,
243
+ "logps/rejected": -356.0,
244
+ "loss": 0.6716,
245
+ "rewards/accuracies": 0.609375,
246
+ "rewards/chosen": -0.09423828125,
247
+ "rewards/margins": 0.049560546875,
248
+ "rewards/rejected": -0.1435546875,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.33,
253
+ "learning_rate": 4.2052190435769554e-07,
254
+ "logits/chosen": -1.28125,
255
+ "logits/rejected": -1.234375,
256
+ "logps/chosen": -422.0,
257
+ "logps/rejected": -388.0,
258
+ "loss": 0.6672,
259
+ "rewards/accuracies": 0.6000000238418579,
260
+ "rewards/chosen": -0.12353515625,
261
+ "rewards/margins": 0.061767578125,
262
+ "rewards/rejected": -0.185546875,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.36,
267
+ "learning_rate": 4.0668899744407567e-07,
268
+ "logits/chosen": -1.28125,
269
+ "logits/rejected": -1.2265625,
270
+ "logps/chosen": -390.0,
271
+ "logps/rejected": -336.0,
272
+ "loss": 0.674,
273
+ "rewards/accuracies": 0.59375,
274
+ "rewards/chosen": -0.1630859375,
275
+ "rewards/margins": 0.037353515625,
276
+ "rewards/rejected": -0.2001953125,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.38,
281
+ "learning_rate": 3.920161866827889e-07,
282
+ "logits/chosen": -1.3125,
283
+ "logits/rejected": -1.296875,
284
+ "logps/chosen": -426.0,
285
+ "logps/rejected": -382.0,
286
+ "loss": 0.6659,
287
+ "rewards/accuracies": 0.574999988079071,
288
+ "rewards/chosen": -0.181640625,
289
+ "rewards/margins": 0.06298828125,
290
+ "rewards/rejected": -0.244140625,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.4,
295
+ "learning_rate": 3.765821230985757e-07,
296
+ "logits/chosen": -1.3203125,
297
+ "logits/rejected": -1.265625,
298
+ "logps/chosen": -410.0,
299
+ "logps/rejected": -380.0,
300
+ "loss": 0.6594,
301
+ "rewards/accuracies": 0.659375011920929,
302
+ "rewards/chosen": -0.19140625,
303
+ "rewards/margins": 0.09814453125,
304
+ "rewards/rejected": -0.2890625,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.42,
309
+ "learning_rate": 3.604695382782159e-07,
310
+ "logits/chosen": -1.3671875,
311
+ "logits/rejected": -1.265625,
312
+ "logps/chosen": -438.0,
313
+ "logps/rejected": -374.0,
314
+ "loss": 0.6632,
315
+ "rewards/accuracies": 0.621874988079071,
316
+ "rewards/chosen": -0.21875,
317
+ "rewards/margins": 0.07421875,
318
+ "rewards/rejected": -0.29296875,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.42,
323
+ "eval_logits/chosen": -1.3671875,
324
+ "eval_logits/rejected": -1.3203125,
325
+ "eval_logps/chosen": -446.0,
326
+ "eval_logps/rejected": -396.0,
327
+ "eval_loss": 0.6625468730926514,
328
+ "eval_rewards/accuracies": 0.625,
329
+ "eval_rewards/chosen": -0.228515625,
330
+ "eval_rewards/margins": 0.09130859375,
331
+ "eval_rewards/rejected": -0.3203125,
332
+ "eval_runtime": 52.3612,
333
+ "eval_samples_per_second": 38.196,
334
+ "eval_steps_per_second": 1.203,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.44,
339
+ "learning_rate": 3.4376480090239047e-07,
340
+ "logits/chosen": -1.4296875,
341
+ "logits/rejected": -1.359375,
342
+ "logps/chosen": -442.0,
343
+ "logps/rejected": -388.0,
344
+ "loss": 0.6604,
345
+ "rewards/accuracies": 0.609375,
346
+ "rewards/chosen": -0.2421875,
347
+ "rewards/margins": 0.0888671875,
348
+ "rewards/rejected": -0.33203125,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.46,
353
+ "learning_rate": 3.265574537815398e-07,
354
+ "logits/chosen": -1.296875,
355
+ "logits/rejected": -1.28125,
356
+ "logps/chosen": -406.0,
357
+ "logps/rejected": -374.0,
358
+ "loss": 0.6588,
359
+ "rewards/accuracies": 0.590624988079071,
360
+ "rewards/chosen": -0.271484375,
361
+ "rewards/margins": 0.0673828125,
362
+ "rewards/rejected": -0.33984375,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.48,
367
+ "learning_rate": 3.0893973387735683e-07,
368
+ "logits/chosen": -1.34375,
369
+ "logits/rejected": -1.296875,
370
+ "logps/chosen": -406.0,
371
+ "logps/rejected": -372.0,
372
+ "loss": 0.6516,
373
+ "rewards/accuracies": 0.6000000238418579,
374
+ "rewards/chosen": -0.26171875,
375
+ "rewards/margins": 0.1064453125,
376
+ "rewards/rejected": -0.369140625,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.5,
381
+ "learning_rate": 2.910060778827554e-07,
382
+ "logits/chosen": -1.3515625,
383
+ "logits/rejected": -1.328125,
384
+ "logps/chosen": -420.0,
385
+ "logps/rejected": -368.0,
386
+ "loss": 0.6635,
387
+ "rewards/accuracies": 0.609375,
388
+ "rewards/chosen": -0.302734375,
389
+ "rewards/margins": 0.09619140625,
390
+ "rewards/rejected": -0.400390625,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.52,
395
+ "learning_rate": 2.7285261601056697e-07,
396
+ "logits/chosen": -1.3828125,
397
+ "logits/rejected": -1.328125,
398
+ "logps/chosen": -426.0,
399
+ "logps/rejected": -384.0,
400
+ "loss": 0.6546,
401
+ "rewards/accuracies": 0.550000011920929,
402
+ "rewards/chosen": -0.30859375,
403
+ "rewards/margins": 0.10693359375,
404
+ "rewards/rejected": -0.416015625,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.54,
409
+ "learning_rate": 2.5457665670441937e-07,
410
+ "logits/chosen": -1.3515625,
411
+ "logits/rejected": -1.2890625,
412
+ "logps/chosen": -432.0,
413
+ "logps/rejected": -384.0,
414
+ "loss": 0.6521,
415
+ "rewards/accuracies": 0.637499988079071,
416
+ "rewards/chosen": -0.337890625,
417
+ "rewards/margins": 0.142578125,
418
+ "rewards/rejected": -0.48046875,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.57,
423
+ "learning_rate": 2.3627616503391812e-07,
424
+ "logits/chosen": -1.3828125,
425
+ "logits/rejected": -1.390625,
426
+ "logps/chosen": -454.0,
427
+ "logps/rejected": -426.0,
428
+ "loss": 0.6522,
429
+ "rewards/accuracies": 0.609375,
430
+ "rewards/chosen": -0.34375,
431
+ "rewards/margins": 0.11181640625,
432
+ "rewards/rejected": -0.45703125,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.59,
437
+ "learning_rate": 2.1804923757009882e-07,
438
+ "logits/chosen": -1.296875,
439
+ "logits/rejected": -1.3046875,
440
+ "logps/chosen": -426.0,
441
+ "logps/rejected": -372.0,
442
+ "loss": 0.6545,
443
+ "rewards/accuracies": 0.606249988079071,
444
+ "rewards/chosen": -0.35546875,
445
+ "rewards/margins": 0.134765625,
446
+ "rewards/rejected": -0.48828125,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.61,
451
+ "learning_rate": 1.9999357655598891e-07,
452
+ "logits/chosen": -1.3515625,
453
+ "logits/rejected": -1.3671875,
454
+ "logps/chosen": -436.0,
455
+ "logps/rejected": -388.0,
456
+ "loss": 0.6546,
457
+ "rewards/accuracies": 0.6000000238418579,
458
+ "rewards/chosen": -0.37890625,
459
+ "rewards/margins": 0.109375,
460
+ "rewards/rejected": -0.48828125,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.63,
465
+ "learning_rate": 1.8220596619089573e-07,
466
+ "logits/chosen": -1.375,
467
+ "logits/rejected": -1.3046875,
468
+ "logps/chosen": -474.0,
469
+ "logps/rejected": -442.0,
470
+ "loss": 0.6493,
471
+ "rewards/accuracies": 0.6312500238418579,
472
+ "rewards/chosen": -0.33984375,
473
+ "rewards/margins": 0.12109375,
474
+ "rewards/rejected": -0.458984375,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.63,
479
+ "eval_logits/chosen": -1.3984375,
480
+ "eval_logits/rejected": -1.3515625,
481
+ "eval_logps/chosen": -460.0,
482
+ "eval_logps/rejected": -412.0,
483
+ "eval_loss": 0.653249979019165,
484
+ "eval_rewards/accuracies": 0.625,
485
+ "eval_rewards/chosen": -0.35546875,
486
+ "eval_rewards/margins": 0.134765625,
487
+ "eval_rewards/rejected": -0.490234375,
488
+ "eval_runtime": 52.3802,
489
+ "eval_samples_per_second": 38.182,
490
+ "eval_steps_per_second": 1.203,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.65,
495
+ "learning_rate": 1.647817538357072e-07,
496
+ "logits/chosen": -1.390625,
497
+ "logits/rejected": -1.3203125,
498
+ "logps/chosen": -432.0,
499
+ "logps/rejected": -366.0,
500
+ "loss": 0.6467,
501
+ "rewards/accuracies": 0.637499988079071,
502
+ "rewards/chosen": -0.353515625,
503
+ "rewards/margins": 0.125,
504
+ "rewards/rejected": -0.478515625,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.67,
509
+ "learning_rate": 1.478143389201113e-07,
510
+ "logits/chosen": -1.296875,
511
+ "logits/rejected": -1.328125,
512
+ "logps/chosen": -416.0,
513
+ "logps/rejected": -386.0,
514
+ "loss": 0.6499,
515
+ "rewards/accuracies": 0.6343749761581421,
516
+ "rewards/chosen": -0.3515625,
517
+ "rewards/margins": 0.11865234375,
518
+ "rewards/rejected": -0.47265625,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.69,
523
+ "learning_rate": 1.3139467229135998e-07,
524
+ "logits/chosen": -1.4296875,
525
+ "logits/rejected": -1.390625,
526
+ "logps/chosen": -476.0,
527
+ "logps/rejected": -422.0,
528
+ "loss": 0.6409,
529
+ "rewards/accuracies": 0.6343749761581421,
530
+ "rewards/chosen": -0.373046875,
531
+ "rewards/margins": 0.1669921875,
532
+ "rewards/rejected": -0.5390625,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.71,
537
+ "learning_rate": 1.1561076868822755e-07,
538
+ "logits/chosen": -1.3671875,
539
+ "logits/rejected": -1.328125,
540
+ "logps/chosen": -428.0,
541
+ "logps/rejected": -404.0,
542
+ "loss": 0.6505,
543
+ "rewards/accuracies": 0.59375,
544
+ "rewards/chosen": -0.384765625,
545
+ "rewards/margins": 0.10400390625,
546
+ "rewards/rejected": -0.48828125,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.73,
551
+ "learning_rate": 1.0054723495346482e-07,
552
+ "logits/chosen": -1.359375,
553
+ "logits/rejected": -1.2890625,
554
+ "logps/chosen": -420.0,
555
+ "logps/rejected": -388.0,
556
+ "loss": 0.6417,
557
+ "rewards/accuracies": 0.6312500238418579,
558
+ "rewards/chosen": -0.37890625,
559
+ "rewards/margins": 0.1455078125,
560
+ "rewards/rejected": -0.5234375,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.75,
565
+ "learning_rate": 8.628481651367875e-08,
566
+ "logits/chosen": -1.3828125,
567
+ "logits/rejected": -1.3515625,
568
+ "logps/chosen": -476.0,
569
+ "logps/rejected": -400.0,
570
+ "loss": 0.6398,
571
+ "rewards/accuracies": 0.6312500238418579,
572
+ "rewards/chosen": -0.37890625,
573
+ "rewards/margins": 0.1396484375,
574
+ "rewards/rejected": -0.51953125,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.77,
579
+ "learning_rate": 7.289996455765748e-08,
580
+ "logits/chosen": -1.421875,
581
+ "logits/rejected": -1.3984375,
582
+ "logps/chosen": -492.0,
583
+ "logps/rejected": -404.0,
584
+ "loss": 0.6472,
585
+ "rewards/accuracies": 0.625,
586
+ "rewards/chosen": -0.376953125,
587
+ "rewards/margins": 0.138671875,
588
+ "rewards/rejected": -0.515625,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.8,
593
+ "learning_rate": 6.046442623320145e-08,
594
+ "logits/chosen": -1.4140625,
595
+ "logits/rejected": -1.3984375,
596
+ "logps/chosen": -448.0,
597
+ "logps/rejected": -416.0,
598
+ "loss": 0.6386,
599
+ "rewards/accuracies": 0.640625,
600
+ "rewards/chosen": -0.39453125,
601
+ "rewards/margins": 0.126953125,
602
+ "rewards/rejected": -0.51953125,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.82,
607
+ "learning_rate": 4.904486005914027e-08,
608
+ "logits/chosen": -1.3515625,
609
+ "logits/rejected": -1.3515625,
610
+ "logps/chosen": -442.0,
611
+ "logps/rejected": -416.0,
612
+ "loss": 0.6437,
613
+ "rewards/accuracies": 0.6312500238418579,
614
+ "rewards/chosen": -0.39453125,
615
+ "rewards/margins": 0.126953125,
616
+ "rewards/rejected": -0.5234375,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.84,
621
+ "learning_rate": 3.8702478614051345e-08,
622
+ "logits/chosen": -1.375,
623
+ "logits/rejected": -1.34375,
624
+ "logps/chosen": -492.0,
625
+ "logps/rejected": -426.0,
626
+ "loss": 0.6462,
627
+ "rewards/accuracies": 0.5625,
628
+ "rewards/chosen": -0.392578125,
629
+ "rewards/margins": 0.1318359375,
630
+ "rewards/rejected": -0.5234375,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.84,
635
+ "eval_logits/chosen": -1.421875,
636
+ "eval_logits/rejected": -1.375,
637
+ "eval_logps/chosen": -464.0,
638
+ "eval_logps/rejected": -420.0,
639
+ "eval_loss": 0.6495468616485596,
640
+ "eval_rewards/accuracies": 0.6388888955116272,
641
+ "eval_rewards/chosen": -0.400390625,
642
+ "eval_rewards/margins": 0.1552734375,
643
+ "eval_rewards/rejected": -0.5546875,
644
+ "eval_runtime": 51.9691,
645
+ "eval_samples_per_second": 38.484,
646
+ "eval_steps_per_second": 1.212,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.86,
651
+ "learning_rate": 2.9492720416985e-08,
652
+ "logits/chosen": -1.328125,
653
+ "logits/rejected": -1.3046875,
654
+ "logps/chosen": -428.0,
655
+ "logps/rejected": -374.0,
656
+ "loss": 0.651,
657
+ "rewards/accuracies": 0.6343749761581421,
658
+ "rewards/chosen": -0.400390625,
659
+ "rewards/margins": 0.130859375,
660
+ "rewards/rejected": -0.53125,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.88,
665
+ "learning_rate": 2.1464952759020856e-08,
666
+ "logits/chosen": -1.40625,
667
+ "logits/rejected": -1.375,
668
+ "logps/chosen": -432.0,
669
+ "logps/rejected": -408.0,
670
+ "loss": 0.6437,
671
+ "rewards/accuracies": 0.6187499761581421,
672
+ "rewards/chosen": -0.396484375,
673
+ "rewards/margins": 0.12109375,
674
+ "rewards/rejected": -0.51953125,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.9,
679
+ "learning_rate": 1.4662207078575684e-08,
680
+ "logits/chosen": -1.375,
681
+ "logits/rejected": -1.359375,
682
+ "logps/chosen": -458.0,
683
+ "logps/rejected": -404.0,
684
+ "loss": 0.6478,
685
+ "rewards/accuracies": 0.643750011920929,
686
+ "rewards/chosen": -0.380859375,
687
+ "rewards/margins": 0.181640625,
688
+ "rewards/rejected": -0.5625,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.92,
693
+ "learning_rate": 9.12094829893642e-09,
694
+ "logits/chosen": -1.421875,
695
+ "logits/rejected": -1.390625,
696
+ "logps/chosen": -420.0,
697
+ "logps/rejected": -392.0,
698
+ "loss": 0.6468,
699
+ "rewards/accuracies": 0.625,
700
+ "rewards/chosen": -0.390625,
701
+ "rewards/margins": 0.138671875,
702
+ "rewards/rejected": -0.52734375,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.94,
707
+ "learning_rate": 4.8708793644441086e-09,
708
+ "logits/chosen": -1.4296875,
709
+ "logits/rejected": -1.359375,
710
+ "logps/chosen": -452.0,
711
+ "logps/rejected": -426.0,
712
+ "loss": 0.6587,
713
+ "rewards/accuracies": 0.5843750238418579,
714
+ "rewards/chosen": -0.40234375,
715
+ "rewards/margins": 0.08447265625,
716
+ "rewards/rejected": -0.48828125,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.96,
721
+ "learning_rate": 1.9347820230782295e-09,
722
+ "logits/chosen": -1.359375,
723
+ "logits/rejected": -1.3359375,
724
+ "logps/chosen": -426.0,
725
+ "logps/rejected": -390.0,
726
+ "loss": 0.6394,
727
+ "rewards/accuracies": 0.65625,
728
+ "rewards/chosen": -0.40234375,
729
+ "rewards/margins": 0.140625,
730
+ "rewards/rejected": -0.54296875,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.98,
735
+ "learning_rate": 3.2839470889836627e-10,
736
+ "logits/chosen": -1.421875,
737
+ "logits/rejected": -1.375,
738
+ "logps/chosen": -470.0,
739
+ "logps/rejected": -422.0,
740
+ "loss": 0.6303,
741
+ "rewards/accuracies": 0.690625011920929,
742
+ "rewards/chosen": -0.36328125,
743
+ "rewards/margins": 0.19921875,
744
+ "rewards/rejected": -0.55859375,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.0,
749
+ "step": 477,
750
+ "total_flos": 0.0,
751
+ "train_loss": 0.6625334221861897,
752
+ "train_runtime": 3684.7873,
753
+ "train_samples_per_second": 16.591,
754
+ "train_steps_per_second": 0.129
755
+ }
756
+ ],
757
+ "logging_steps": 10,
758
+ "max_steps": 477,
759
+ "num_input_tokens_seen": 0,
760
+ "num_train_epochs": 1,
761
+ "save_steps": 100,
762
+ "total_flos": 0.0,
763
+ "train_batch_size": 8,
764
+ "trial_name": null,
765
+ "trial_params": null
766
+ }