lole25 committed on
Commit
6c1d38b
1 Parent(s): 8413830

Model save

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: mistralai/Mistral-7B-v0.1
9
+ model-index:
10
+ - name: zephyr-7b-gpo-iter2
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-7b-gpo-iter2
18
+
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unspecified dataset (the dataset name was not recorded when this card was generated).
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0114
22
+ - Rewards/chosen: -0.0874
23
+ - Rewards/rejected: -0.0645
24
+ - Rewards/accuracies: 0.3940
25
+ - Rewards/margins: -0.0229
26
+ - Logps/rejected: -264.6114
27
+ - Logps/chosen: -288.2511
28
+ - Logits/rejected: -2.1907
29
+ - Logits/chosen: -2.3882
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 1
50
+ - eval_batch_size: 2
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - gradient_accumulation_steps: 2
54
+ - total_train_batch_size: 2
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: cosine
57
+ - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 2
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.0012 | 0.3 | 100 | 0.0016 | -0.0164 | -0.0160 | 0.5035 | -0.0005 | -259.7555 | -281.1500 | -2.1644 | -2.3583 |
65
+ | 0.0011 | 0.61 | 200 | 0.0018 | -0.0088 | -0.0077 | 0.4815 | -0.0011 | -258.9317 | -280.3858 | -2.1837 | -2.3781 |
66
+ | 0.0015 | 0.91 | 300 | 0.0019 | -0.0167 | -0.0149 | 0.4805 | -0.0017 | -259.6521 | -281.1740 | -2.1796 | -2.3740 |
67
+ | 0.0397 | 1.22 | 400 | 0.0074 | -0.0779 | -0.0627 | 0.4160 | -0.0151 | -264.4323 | -287.2935 | -2.1632 | -2.3568 |
68
+ | 0.0305 | 1.52 | 500 | 0.0117 | -0.0898 | -0.0668 | 0.3945 | -0.0230 | -264.8388 | -288.4842 | -2.1902 | -2.3875 |
69
+ | 0.0366 | 1.82 | 600 | 0.0115 | -0.0876 | -0.0647 | 0.4000 | -0.0230 | -264.6301 | -288.2723 | -2.1900 | -2.3873 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.7.1
75
+ - Transformers 4.36.2
76
+ - PyTorch 2.1.2+cu118
77
+ - Datasets 2.14.6
78
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a651012283486679ca06d6693c07c8b0ef2b90229640df0ded079e6b80a15750
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeca74f490c374bfa634438505679bb07a5398dfd257f26fcee76c900b989388
3
  size 671150064
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_logits/chosen": -2.388157844543457,
4
+ "eval_logits/rejected": -2.1907498836517334,
5
+ "eval_logps/chosen": -288.2511291503906,
6
+ "eval_logps/rejected": -264.61138916015625,
7
+ "eval_loss": 0.011413076892495155,
8
+ "eval_rewards/accuracies": 0.39399999380111694,
9
+ "eval_rewards/chosen": -0.08743663877248764,
10
+ "eval_rewards/margins": -0.022926397621631622,
11
+ "eval_rewards/rejected": -0.06451024115085602,
12
+ "eval_runtime": 1422.7975,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 1.406,
15
+ "eval_steps_per_second": 0.703,
16
+ "train_loss": 0.014390302914878392,
17
+ "train_runtime": 11750.1681,
18
+ "train_samples": 61135,
19
+ "train_samples_per_second": 0.112,
20
+ "train_steps_per_second": 0.056
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_logits/chosen": -2.388157844543457,
4
+ "eval_logits/rejected": -2.1907498836517334,
5
+ "eval_logps/chosen": -288.2511291503906,
6
+ "eval_logps/rejected": -264.61138916015625,
7
+ "eval_loss": 0.011413076892495155,
8
+ "eval_rewards/accuracies": 0.39399999380111694,
9
+ "eval_rewards/chosen": -0.08743663877248764,
10
+ "eval_rewards/margins": -0.022926397621631622,
11
+ "eval_rewards/rejected": -0.06451024115085602,
12
+ "eval_runtime": 1422.7975,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 1.406,
15
+ "eval_steps_per_second": 0.703
16
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "train_loss": 0.014390302914878392,
4
+ "train_runtime": 11750.1681,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 0.112,
7
+ "train_steps_per_second": 0.056
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1050 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 100,
6
+ "global_step": 658,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 7.575757575757576e-08,
14
+ "logits/chosen": -3.0401854515075684,
15
+ "logits/rejected": -2.9935803413391113,
16
+ "logps/chosen": -33.6235466003418,
17
+ "logps/rejected": -31.46235466003418,
18
+ "loss": 0.0011,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.03,
27
+ "learning_rate": 7.575757575757576e-07,
28
+ "logits/chosen": -3.0845224857330322,
29
+ "logits/rejected": -2.939974784851074,
30
+ "logps/chosen": -187.66502380371094,
31
+ "logps/rejected": -172.21742248535156,
32
+ "loss": 0.0011,
33
+ "rewards/accuracies": 0.4444444477558136,
34
+ "rewards/chosen": 4.157695730100386e-05,
35
+ "rewards/margins": -0.0002048778405878693,
36
+ "rewards/rejected": 0.0002464548160787672,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.06,
41
+ "learning_rate": 1.5151515151515152e-06,
42
+ "logits/chosen": -2.975489616394043,
43
+ "logits/rejected": -2.8098347187042236,
44
+ "logps/chosen": -101.97224426269531,
45
+ "logps/rejected": -79.41976928710938,
46
+ "loss": 0.001,
47
+ "rewards/accuracies": 0.6000000238418579,
48
+ "rewards/chosen": 0.0006187028484418988,
49
+ "rewards/margins": 0.0002881823165807873,
50
+ "rewards/rejected": 0.0003305206191726029,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.09,
55
+ "learning_rate": 2.2727272727272728e-06,
56
+ "logits/chosen": -2.9616599082946777,
57
+ "logits/rejected": -2.853991746902466,
58
+ "logps/chosen": -93.24244689941406,
59
+ "logps/rejected": -93.37409973144531,
60
+ "loss": 0.0011,
61
+ "rewards/accuracies": 0.5,
62
+ "rewards/chosen": 0.0018816631054505706,
63
+ "rewards/margins": 0.000262443587416783,
64
+ "rewards/rejected": 0.001619219547137618,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.12,
69
+ "learning_rate": 3.0303030303030305e-06,
70
+ "logits/chosen": -2.957383871078491,
71
+ "logits/rejected": -2.8873894214630127,
72
+ "logps/chosen": -98.40808868408203,
73
+ "logps/rejected": -90.6903305053711,
74
+ "loss": 0.0011,
75
+ "rewards/accuracies": 0.6000000238418579,
76
+ "rewards/chosen": 0.0038822232745587826,
77
+ "rewards/margins": 0.0006505203782580793,
78
+ "rewards/rejected": 0.0032317028380930424,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.15,
83
+ "learning_rate": 3.7878787878787882e-06,
84
+ "logits/chosen": -2.793536424636841,
85
+ "logits/rejected": -2.701014995574951,
86
+ "logps/chosen": -76.89764404296875,
87
+ "logps/rejected": -76.88601684570312,
88
+ "loss": 0.0009,
89
+ "rewards/accuracies": 0.44999998807907104,
90
+ "rewards/chosen": 0.0003440352447796613,
91
+ "rewards/margins": 0.003660330083221197,
92
+ "rewards/rejected": -0.003316295798867941,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.18,
97
+ "learning_rate": 4.5454545454545455e-06,
98
+ "logits/chosen": -2.8517611026763916,
99
+ "logits/rejected": -2.701003313064575,
100
+ "logps/chosen": -111.06632232666016,
101
+ "logps/rejected": -97.74620056152344,
102
+ "loss": 0.001,
103
+ "rewards/accuracies": 0.4000000059604645,
104
+ "rewards/chosen": 0.0011594895040616393,
105
+ "rewards/margins": 0.00348883168771863,
106
+ "rewards/rejected": -0.0023293420672416687,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.21,
111
+ "learning_rate": 4.999436790436924e-06,
112
+ "logits/chosen": -3.074284791946411,
113
+ "logits/rejected": -2.859107494354248,
114
+ "logps/chosen": -175.5392608642578,
115
+ "logps/rejected": -165.30654907226562,
116
+ "loss": 0.0011,
117
+ "rewards/accuracies": 0.5,
118
+ "rewards/chosen": 0.0042682006023824215,
119
+ "rewards/margins": 0.004156609531491995,
120
+ "rewards/rejected": 0.00011159134737681597,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.24,
125
+ "learning_rate": 4.993103596812269e-06,
126
+ "logits/chosen": -2.7597286701202393,
127
+ "logits/rejected": -2.6338610649108887,
128
+ "logps/chosen": -117.37264251708984,
129
+ "logps/rejected": -96.65496826171875,
130
+ "loss": 0.0014,
131
+ "rewards/accuracies": 0.30000001192092896,
132
+ "rewards/chosen": -0.004695770796388388,
133
+ "rewards/margins": -0.0023340280167758465,
134
+ "rewards/rejected": -0.002361743012443185,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.27,
139
+ "learning_rate": 4.979751088147192e-06,
140
+ "logits/chosen": -2.7671074867248535,
141
+ "logits/rejected": -2.5734329223632812,
142
+ "logps/chosen": -124.0124740600586,
143
+ "logps/rejected": -117.49992370605469,
144
+ "loss": 0.0013,
145
+ "rewards/accuracies": 0.550000011920929,
146
+ "rewards/chosen": -0.0014447696739807725,
147
+ "rewards/margins": 0.006112386472523212,
148
+ "rewards/rejected": -0.0075571550987660885,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.3,
153
+ "learning_rate": 4.95941685833271e-06,
154
+ "logits/chosen": -2.841676712036133,
155
+ "logits/rejected": -2.6840147972106934,
156
+ "logps/chosen": -136.9166259765625,
157
+ "logps/rejected": -142.25997924804688,
158
+ "loss": 0.0012,
159
+ "rewards/accuracies": 0.550000011920929,
160
+ "rewards/chosen": -0.005834830459207296,
161
+ "rewards/margins": 0.0021175735164433718,
162
+ "rewards/rejected": -0.007952402345836163,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.3,
167
+ "eval_logits/chosen": -2.3582723140716553,
168
+ "eval_logits/rejected": -2.1644442081451416,
169
+ "eval_logps/chosen": -281.1500244140625,
170
+ "eval_logps/rejected": -259.7554626464844,
171
+ "eval_loss": 0.001627120072953403,
172
+ "eval_rewards/accuracies": 0.5034999847412109,
173
+ "eval_rewards/chosen": -0.016425278037786484,
174
+ "eval_rewards/margins": -0.00047451091813854873,
175
+ "eval_rewards/rejected": -0.0159507654607296,
176
+ "eval_runtime": 1420.6531,
177
+ "eval_samples_per_second": 1.408,
178
+ "eval_steps_per_second": 0.704,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.33,
183
+ "learning_rate": 4.9321581582449365e-06,
184
+ "logits/chosen": -2.8993427753448486,
185
+ "logits/rejected": -2.786620616912842,
186
+ "logps/chosen": -118.65157318115234,
187
+ "logps/rejected": -127.34912109375,
188
+ "loss": 0.0011,
189
+ "rewards/accuracies": 0.44999998807907104,
190
+ "rewards/chosen": -0.0042579686269164085,
191
+ "rewards/margins": 0.004396714735776186,
192
+ "rewards/rejected": -0.008654682897031307,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.36,
197
+ "learning_rate": 4.898051734555676e-06,
198
+ "logits/chosen": -3.0458617210388184,
199
+ "logits/rejected": -2.99739670753479,
200
+ "logps/chosen": -95.90953063964844,
201
+ "logps/rejected": -102.84895324707031,
202
+ "loss": 0.0013,
203
+ "rewards/accuracies": 0.550000011920929,
204
+ "rewards/chosen": -0.004254769999533892,
205
+ "rewards/margins": -0.0005313962465152144,
206
+ "rewards/rejected": -0.0037233736366033554,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.4,
211
+ "learning_rate": 4.857193613652711e-06,
212
+ "logits/chosen": -3.0125811100006104,
213
+ "logits/rejected": -2.8767762184143066,
214
+ "logps/chosen": -66.88941955566406,
215
+ "logps/rejected": -71.67733001708984,
216
+ "loss": 0.0012,
217
+ "rewards/accuracies": 0.5,
218
+ "rewards/chosen": 0.004949026275426149,
219
+ "rewards/margins": 0.0032058332581073046,
220
+ "rewards/rejected": 0.001743193482980132,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.43,
225
+ "learning_rate": 4.809698831278217e-06,
226
+ "logits/chosen": -2.9552953243255615,
227
+ "logits/rejected": -2.7150585651397705,
228
+ "logps/chosen": -75.46265411376953,
229
+ "logps/rejected": -72.64187622070312,
230
+ "loss": 0.001,
231
+ "rewards/accuracies": 0.800000011920929,
232
+ "rewards/chosen": 0.00027392804622650146,
233
+ "rewards/margins": 0.016122380271553993,
234
+ "rewards/rejected": -0.01584845408797264,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.46,
239
+ "learning_rate": 4.7557011086464634e-06,
240
+ "logits/chosen": -2.9132320880889893,
241
+ "logits/rejected": -2.64630126953125,
242
+ "logps/chosen": -76.73291778564453,
243
+ "logps/rejected": -97.9271011352539,
244
+ "loss": 0.0013,
245
+ "rewards/accuracies": 0.6000000238418579,
246
+ "rewards/chosen": -0.00335231376811862,
247
+ "rewards/margins": 0.01173357479274273,
248
+ "rewards/rejected": -0.015085889026522636,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.49,
253
+ "learning_rate": 4.695352475952706e-06,
254
+ "logits/chosen": -3.013190746307373,
255
+ "logits/rejected": -2.926192045211792,
256
+ "logps/chosen": -73.78190612792969,
257
+ "logps/rejected": -84.00215148925781,
258
+ "loss": 0.0016,
259
+ "rewards/accuracies": 0.699999988079071,
260
+ "rewards/chosen": -0.006495082285255194,
261
+ "rewards/margins": 0.01742926612496376,
262
+ "rewards/rejected": -0.023924347013235092,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.52,
267
+ "learning_rate": 4.6288228443332786e-06,
268
+ "logits/chosen": -2.892408847808838,
269
+ "logits/rejected": -2.785109758377075,
270
+ "logps/chosen": -88.50574493408203,
271
+ "logps/rejected": -63.979164123535156,
272
+ "loss": 0.0014,
273
+ "rewards/accuracies": 0.44999998807907104,
274
+ "rewards/chosen": -0.015971587970852852,
275
+ "rewards/margins": 0.0037319730035960674,
276
+ "rewards/rejected": -0.019703561440110207,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.55,
281
+ "learning_rate": 4.556299527482029e-06,
282
+ "logits/chosen": -3.024970293045044,
283
+ "logits/rejected": -2.9614624977111816,
284
+ "logps/chosen": -95.4378433227539,
285
+ "logps/rejected": -86.34746551513672,
286
+ "loss": 0.0009,
287
+ "rewards/accuracies": 0.75,
288
+ "rewards/chosen": 0.0011760194320231676,
289
+ "rewards/margins": 0.010892638936638832,
290
+ "rewards/rejected": -0.009716618806123734,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.58,
295
+ "learning_rate": 4.4779867142699715e-06,
296
+ "logits/chosen": -3.0479438304901123,
297
+ "logits/rejected": -2.903231620788574,
298
+ "logps/chosen": -138.85531616210938,
299
+ "logps/rejected": -146.4200439453125,
300
+ "loss": 0.0017,
301
+ "rewards/accuracies": 0.44999998807907104,
302
+ "rewards/chosen": 0.013675776310265064,
303
+ "rewards/margins": 0.003341741394251585,
304
+ "rewards/rejected": 0.010334033519029617,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.61,
309
+ "learning_rate": 4.394104893853007e-06,
310
+ "logits/chosen": -2.9416115283966064,
311
+ "logits/rejected": -2.8143601417541504,
312
+ "logps/chosen": -85.66619110107422,
313
+ "logps/rejected": -84.1985092163086,
314
+ "loss": 0.0011,
315
+ "rewards/accuracies": 0.6499999761581421,
316
+ "rewards/chosen": 0.003594033420085907,
317
+ "rewards/margins": 0.005169789306819439,
318
+ "rewards/rejected": -0.0015757561195641756,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.61,
323
+ "eval_logits/chosen": -2.378091812133789,
324
+ "eval_logits/rejected": -2.183748960494995,
325
+ "eval_logps/chosen": -280.3858337402344,
326
+ "eval_logps/rejected": -258.9316711425781,
327
+ "eval_loss": 0.0018177788006141782,
328
+ "eval_rewards/accuracies": 0.4814999997615814,
329
+ "eval_rewards/chosen": -0.008783474564552307,
330
+ "eval_rewards/margins": -0.0010705279419198632,
331
+ "eval_rewards/rejected": -0.007712946273386478,
332
+ "eval_runtime": 1420.2588,
333
+ "eval_samples_per_second": 1.408,
334
+ "eval_steps_per_second": 0.704,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.64,
339
+ "learning_rate": 4.3048902348863116e-06,
340
+ "logits/chosen": -2.9989707469940186,
341
+ "logits/rejected": -2.8650078773498535,
342
+ "logps/chosen": -102.37027740478516,
343
+ "logps/rejected": -118.4636001586914,
344
+ "loss": 0.0012,
345
+ "rewards/accuracies": 0.6000000238418579,
346
+ "rewards/chosen": 0.001182993990369141,
347
+ "rewards/margins": 0.004763666074723005,
348
+ "rewards/rejected": -0.003580672200769186,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.67,
353
+ "learning_rate": 4.210593920593201e-06,
354
+ "logits/chosen": -2.9398374557495117,
355
+ "logits/rejected": -2.8116965293884277,
356
+ "logps/chosen": -70.69538879394531,
357
+ "logps/rejected": -75.53007507324219,
358
+ "loss": 0.0013,
359
+ "rewards/accuracies": 0.6000000238418579,
360
+ "rewards/chosen": 0.005323179066181183,
361
+ "rewards/margins": 0.004396263975650072,
362
+ "rewards/rejected": 0.0009269150905311108,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.7,
367
+ "learning_rate": 4.111481441560598e-06,
368
+ "logits/chosen": -2.972592830657959,
369
+ "logits/rejected": -2.852461099624634,
370
+ "logps/chosen": -160.5238037109375,
371
+ "logps/rejected": -129.62954711914062,
372
+ "loss": 0.0013,
373
+ "rewards/accuracies": 0.44999998807907104,
374
+ "rewards/chosen": 0.00035965832648798823,
375
+ "rewards/margins": 0.005134374834597111,
376
+ "rewards/rejected": -0.004774716217070818,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.73,
381
+ "learning_rate": 4.007831848252212e-06,
382
+ "logits/chosen": -3.1746320724487305,
383
+ "logits/rejected": -2.9887349605560303,
384
+ "logps/chosen": -124.45001220703125,
385
+ "logps/rejected": -101.69696807861328,
386
+ "loss": 0.0009,
387
+ "rewards/accuracies": 0.6499999761581421,
388
+ "rewards/chosen": 0.019443057477474213,
389
+ "rewards/margins": 0.012666161172091961,
390
+ "rewards/rejected": 0.0067768944427371025,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.76,
395
+ "learning_rate": 3.899936965343989e-06,
396
+ "logits/chosen": -3.0935516357421875,
397
+ "logits/rejected": -3.028534412384033,
398
+ "logps/chosen": -131.24337768554688,
399
+ "logps/rejected": -116.88172912597656,
400
+ "loss": 0.0016,
401
+ "rewards/accuracies": 0.44999998807907104,
402
+ "rewards/chosen": 0.005977076943963766,
403
+ "rewards/margins": 0.005966751836240292,
404
+ "rewards/rejected": 1.0324455615773331e-05,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.79,
409
+ "learning_rate": 3.7881005700938635e-06,
410
+ "logits/chosen": -2.7336764335632324,
411
+ "logits/rejected": -2.7675654888153076,
412
+ "logps/chosen": -95.82691955566406,
413
+ "logps/rejected": -99.57430267333984,
414
+ "loss": 0.0008,
415
+ "rewards/accuracies": 0.8500000238418579,
416
+ "rewards/chosen": 0.012143732979893684,
417
+ "rewards/margins": 0.008911015465855598,
418
+ "rewards/rejected": 0.003232717514038086,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.82,
423
+ "learning_rate": 3.6726375370590927e-06,
424
+ "logits/chosen": -2.959117889404297,
425
+ "logits/rejected": -2.841716766357422,
426
+ "logps/chosen": -132.79884338378906,
427
+ "logps/rejected": -122.66941833496094,
428
+ "loss": 0.0015,
429
+ "rewards/accuracies": 0.550000011920929,
430
+ "rewards/chosen": 0.003037696471437812,
431
+ "rewards/margins": 0.00202806293964386,
432
+ "rewards/rejected": 0.0010096338810399175,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.85,
437
+ "learning_rate": 3.553872951569236e-06,
438
+ "logits/chosen": -3.0903332233428955,
439
+ "logits/rejected": -3.009970188140869,
440
+ "logps/chosen": -90.32659912109375,
441
+ "logps/rejected": -92.47020721435547,
442
+ "loss": 0.0012,
443
+ "rewards/accuracies": 0.5,
444
+ "rewards/chosen": 0.005868277512490749,
445
+ "rewards/margins": 0.011209800839424133,
446
+ "rewards/rejected": -0.005341522395610809,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.88,
451
+ "learning_rate": 3.432141194450772e-06,
452
+ "logits/chosen": -2.920701503753662,
453
+ "logits/rejected": -2.721304416656494,
454
+ "logps/chosen": -111.44612121582031,
455
+ "logps/rejected": -111.91583251953125,
456
+ "loss": 0.0007,
457
+ "rewards/accuracies": 0.699999988079071,
458
+ "rewards/chosen": -0.004392694681882858,
459
+ "rewards/margins": 0.016653168946504593,
460
+ "rewards/rejected": -0.02104586362838745,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.91,
465
+ "learning_rate": 3.307785000580313e-06,
466
+ "logits/chosen": -2.757887363433838,
467
+ "logits/rejected": -2.5684075355529785,
468
+ "logps/chosen": -128.5594024658203,
469
+ "logps/rejected": -115.97843933105469,
470
+ "loss": 0.0015,
471
+ "rewards/accuracies": 0.4000000059604645,
472
+ "rewards/chosen": -0.006182619370520115,
473
+ "rewards/margins": 0.0031226237770169973,
474
+ "rewards/rejected": -0.009305243380367756,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.91,
479
+ "eval_logits/chosen": -2.374006748199463,
480
+ "eval_logits/rejected": -2.1796298027038574,
481
+ "eval_logps/chosen": -281.17401123046875,
482
+ "eval_logps/rejected": -259.6520690917969,
483
+ "eval_loss": 0.0019075373420491815,
484
+ "eval_rewards/accuracies": 0.4805000126361847,
485
+ "eval_rewards/chosen": -0.016665350645780563,
486
+ "eval_rewards/margins": -0.0017483622068539262,
487
+ "eval_rewards/rejected": -0.014916987158358097,
488
+ "eval_runtime": 1421.116,
489
+ "eval_samples_per_second": 1.407,
490
+ "eval_steps_per_second": 0.704,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.94,
495
+ "learning_rate": 3.1811544939170573e-06,
496
+ "logits/chosen": -2.7975335121154785,
497
+ "logits/rejected": -2.667598009109497,
498
+ "logps/chosen": -161.1660919189453,
499
+ "logps/rejected": -133.93838500976562,
500
+ "loss": 0.0012,
501
+ "rewards/accuracies": 0.44999998807907104,
502
+ "rewards/chosen": -0.010928490199148655,
503
+ "rewards/margins": 0.00381992245092988,
504
+ "rewards/rejected": -0.014748412184417248,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.97,
509
+ "learning_rate": 3.052606201731325e-06,
510
+ "logits/chosen": -2.93320894241333,
511
+ "logits/rejected": -2.7140159606933594,
512
+ "logps/chosen": -79.52749633789062,
513
+ "logps/rejected": -84.45579528808594,
514
+ "loss": 0.0008,
515
+ "rewards/accuracies": 0.699999988079071,
516
+ "rewards/chosen": -0.0026647502090781927,
517
+ "rewards/margins": 0.009594769217073917,
518
+ "rewards/rejected": -0.01225951872766018,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 1.0,
523
+ "learning_rate": 2.9225020508046233e-06,
524
+ "logits/chosen": -2.901916742324829,
525
+ "logits/rejected": -2.736692190170288,
526
+ "logps/chosen": -115.4826431274414,
527
+ "logps/rejected": -130.37356567382812,
528
+ "loss": 0.0313,
529
+ "rewards/accuracies": 0.699999988079071,
530
+ "rewards/chosen": 0.017747070640325546,
531
+ "rewards/margins": 0.05266844481229782,
532
+ "rewards/rejected": -0.034921370446681976,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 1.03,
537
+ "learning_rate": 2.7912083484274266e-06,
538
+ "logits/chosen": -2.7787222862243652,
539
+ "logits/rejected": -2.63339900970459,
540
+ "logps/chosen": -137.5430450439453,
541
+ "logps/rejected": -127.07392883300781,
542
+ "loss": 0.0436,
543
+ "rewards/accuracies": 0.800000011920929,
544
+ "rewards/chosen": 0.0036686602979898453,
545
+ "rewards/margins": 0.05118006467819214,
546
+ "rewards/rejected": -0.04751140996813774,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 1.06,
551
+ "learning_rate": 2.659094751063666e-06,
552
+ "logits/chosen": -2.9705729484558105,
553
+ "logits/rejected": -2.832822322845459,
554
+ "logps/chosen": -143.50698852539062,
555
+ "logps/rejected": -140.02352905273438,
556
+ "loss": 0.0133,
557
+ "rewards/accuracies": 0.75,
558
+ "rewards/chosen": 0.008256057277321815,
559
+ "rewards/margins": 0.05859958380460739,
560
+ "rewards/rejected": -0.05034352466464043,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 1.09,
565
+ "learning_rate": 2.526533223585641e-06,
566
+ "logits/chosen": -2.8163113594055176,
567
+ "logits/rejected": -2.697537660598755,
568
+ "logps/chosen": -85.76821899414062,
569
+ "logps/rejected": -81.71627807617188,
570
+ "loss": 0.0073,
571
+ "rewards/accuracies": 0.6499999761581421,
572
+ "rewards/chosen": 0.010457667522132397,
573
+ "rewards/margins": 0.053769804537296295,
574
+ "rewards/rejected": -0.043312136083841324,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 1.12,
579
+ "learning_rate": 2.39389699200963e-06,
580
+ "logits/chosen": -3.000455141067505,
581
+ "logits/rejected": -2.8073432445526123,
582
+ "logps/chosen": -107.18000793457031,
583
+ "logps/rejected": -118.2176284790039,
584
+ "loss": 0.019,
585
+ "rewards/accuracies": 0.800000011920929,
586
+ "rewards/chosen": 0.008137053810060024,
587
+ "rewards/margins": 0.07659684866666794,
588
+ "rewards/rejected": -0.06845979392528534,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 1.16,
593
+ "learning_rate": 2.2615594926807554e-06,
594
+ "logits/chosen": -2.9433727264404297,
595
+ "logits/rejected": -2.817605495452881,
596
+ "logps/chosen": -84.96995544433594,
597
+ "logps/rejected": -95.87684631347656,
598
+ "loss": 0.0277,
599
+ "rewards/accuracies": 0.8999999761581421,
600
+ "rewards/chosen": 0.01651051640510559,
601
+ "rewards/margins": 0.06000148132443428,
602
+ "rewards/rejected": -0.04349096864461899,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 1.19,
607
+ "learning_rate": 2.129893320865672e-06,
608
+ "logits/chosen": -3.0510168075561523,
609
+ "logits/rejected": -2.938931465148926,
610
+ "logps/chosen": -76.3388442993164,
611
+ "logps/rejected": -62.002540588378906,
612
+ "loss": 0.0302,
613
+ "rewards/accuracies": 0.8500000238418579,
614
+ "rewards/chosen": 0.04483187943696976,
615
+ "rewards/margins": 0.10971853882074356,
616
+ "rewards/rejected": -0.0648866593837738,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 1.22,
621
+ "learning_rate": 1.9992691817133025e-06,
622
+ "logits/chosen": -3.05735445022583,
623
+ "logits/rejected": -2.815657138824463,
624
+ "logps/chosen": -251.65219116210938,
625
+ "logps/rejected": -255.02658081054688,
626
+ "loss": 0.0397,
627
+ "rewards/accuracies": 0.75,
628
+ "rewards/chosen": -0.03865816444158554,
629
+ "rewards/margins": 0.04983866214752197,
630
+ "rewards/rejected": -0.08849682658910751,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 1.22,
635
+ "eval_logits/chosen": -2.356828451156616,
636
+ "eval_logits/rejected": -2.1631951332092285,
637
+ "eval_logps/chosen": -287.29345703125,
638
+ "eval_logps/rejected": -264.43231201171875,
639
+ "eval_loss": 0.007440177723765373,
640
+ "eval_rewards/accuracies": 0.41600000858306885,
641
+ "eval_rewards/chosen": -0.0778598040342331,
642
+ "eval_rewards/margins": -0.015140472911298275,
643
+ "eval_rewards/rejected": -0.06271932274103165,
644
+ "eval_runtime": 1421.1528,
645
+ "eval_samples_per_second": 1.407,
646
+ "eval_steps_per_second": 0.704,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 1.25,
651
+ "learning_rate": 1.8700548465371877e-06,
652
+ "logits/chosen": -3.085784435272217,
653
+ "logits/rejected": -2.8534493446350098,
654
+ "logps/chosen": -150.0517120361328,
655
+ "logps/rejected": -130.98390197753906,
656
+ "loss": 0.0304,
657
+ "rewards/accuracies": 0.75,
658
+ "rewards/chosen": -0.026314150542020798,
659
+ "rewards/margins": 0.05200232192873955,
660
+ "rewards/rejected": -0.07831647992134094,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 1.28,
665
+ "learning_rate": 1.742614117358029e-06,
666
+ "logits/chosen": -2.8179147243499756,
667
+ "logits/rejected": -2.72475528717041,
668
+ "logps/chosen": -85.7872543334961,
669
+ "logps/rejected": -98.0990219116211,
670
+ "loss": 0.0839,
671
+ "rewards/accuracies": 0.6499999761581421,
672
+ "rewards/chosen": -0.052498847246170044,
673
+ "rewards/margins": 0.009759088046848774,
674
+ "rewards/rejected": -0.06225793436169624,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 1.31,
679
+ "learning_rate": 1.617305802621748e-06,
680
+ "logits/chosen": -2.978959083557129,
681
+ "logits/rejected": -2.9247641563415527,
682
+ "logps/chosen": -94.34339904785156,
683
+ "logps/rejected": -103.36552429199219,
684
+ "loss": 0.0221,
685
+ "rewards/accuracies": 0.800000011920929,
686
+ "rewards/chosen": 0.004420995246618986,
687
+ "rewards/margins": 0.09026098251342773,
688
+ "rewards/rejected": -0.08583998680114746,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 1.34,
693
+ "learning_rate": 1.4944827069769125e-06,
694
+ "logits/chosen": -2.94343900680542,
695
+ "logits/rejected": -2.8201680183410645,
696
+ "logps/chosen": -164.69044494628906,
697
+ "logps/rejected": -141.93528747558594,
698
+ "loss": 0.004,
699
+ "rewards/accuracies": 0.550000011920929,
700
+ "rewards/chosen": -0.01349672395735979,
701
+ "rewards/margins": 0.0265080276876688,
702
+ "rewards/rejected": -0.040004752576351166,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 1.37,
707
+ "learning_rate": 1.3744906379558165e-06,
708
+ "logits/chosen": -2.9248745441436768,
709
+ "logits/rejected": -2.9322562217712402,
710
+ "logps/chosen": -103.63645935058594,
711
+ "logps/rejected": -123.60746765136719,
712
+ "loss": 0.0383,
713
+ "rewards/accuracies": 0.8500000238418579,
714
+ "rewards/chosen": 0.014714512042701244,
715
+ "rewards/margins": 0.11696286499500275,
716
+ "rewards/rejected": -0.10224835574626923,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 1.4,
721
+ "learning_rate": 1.257667432355893e-06,
722
+ "logits/chosen": -2.974544048309326,
723
+ "logits/rejected": -2.780351161956787,
724
+ "logps/chosen": -76.25318145751953,
725
+ "logps/rejected": -95.62451934814453,
726
+ "loss": 0.0157,
727
+ "rewards/accuracies": 0.8500000238418579,
728
+ "rewards/chosen": 0.0021516154520213604,
729
+ "rewards/margins": 0.10200606286525726,
730
+ "rewards/rejected": -0.09985444694757462,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 1.43,
735
+ "learning_rate": 1.1443420050626624e-06,
736
+ "logits/chosen": -3.061354875564575,
737
+ "logits/rejected": -2.9399447441101074,
738
+ "logps/chosen": -71.61457824707031,
739
+ "logps/rejected": -85.53517150878906,
740
+ "loss": 0.047,
741
+ "rewards/accuracies": 0.800000011920929,
742
+ "rewards/chosen": -0.00521849375218153,
743
+ "rewards/margins": 0.06701686233282089,
744
+ "rewards/rejected": -0.0722353607416153,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.46,
749
+ "learning_rate": 1.0348334229922677e-06,
750
+ "logits/chosen": -3.0284695625305176,
751
+ "logits/rejected": -2.7861952781677246,
752
+ "logps/chosen": -84.50325012207031,
753
+ "logps/rejected": -80.56702423095703,
754
+ "loss": 0.0331,
755
+ "rewards/accuracies": 0.75,
756
+ "rewards/chosen": 0.010606551542878151,
757
+ "rewards/margins": 0.08576520532369614,
758
+ "rewards/rejected": -0.07515865564346313,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.49,
763
+ "learning_rate": 9.294500067608941e-07,
764
+ "logits/chosen": -2.929229974746704,
765
+ "logits/rejected": -2.727160692214966,
766
+ "logps/chosen": -106.28855895996094,
767
+ "logps/rejected": -96.00597381591797,
768
+ "loss": 0.0183,
769
+ "rewards/accuracies": 0.6000000238418579,
770
+ "rewards/chosen": -0.032337434589862823,
771
+ "rewards/margins": 0.024760348722338676,
772
+ "rewards/rejected": -0.05709778517484665,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.52,
777
+ "learning_rate": 8.284884626103165e-07,
778
+ "logits/chosen": -2.9482462406158447,
779
+ "logits/rejected": -2.7884809970855713,
780
+ "logps/chosen": -77.4437255859375,
781
+ "logps/rejected": -83.18536376953125,
782
+ "loss": 0.0305,
783
+ "rewards/accuracies": 0.8999999761581421,
784
+ "rewards/chosen": 0.025916963815689087,
785
+ "rewards/margins": 0.11217900365591049,
786
+ "rewards/rejected": -0.086262047290802,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.52,
791
+ "eval_logits/chosen": -2.3874802589416504,
792
+ "eval_logits/rejected": -2.1901655197143555,
793
+ "eval_logps/chosen": -288.4842224121094,
794
+ "eval_logps/rejected": -264.83880615234375,
795
+ "eval_loss": 0.011713836342096329,
796
+ "eval_rewards/accuracies": 0.3944999873638153,
797
+ "eval_rewards/chosen": -0.08976726979017258,
798
+ "eval_rewards/margins": -0.022982925176620483,
799
+ "eval_rewards/rejected": -0.06678435206413269,
800
+ "eval_runtime": 1421.6286,
801
+ "eval_samples_per_second": 1.407,
802
+ "eval_steps_per_second": 0.703,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.55,
807
+ "learning_rate": 7.322330470336314e-07,
808
+ "logits/chosen": -3.0825541019439697,
809
+ "logits/rejected": -2.9925718307495117,
810
+ "logps/chosen": -95.14485931396484,
811
+ "logps/rejected": -104.95149993896484,
812
+ "loss": 0.0318,
813
+ "rewards/accuracies": 0.8500000238418579,
814
+ "rewards/chosen": 0.004621284082531929,
815
+ "rewards/margins": 0.12566979229450226,
816
+ "rewards/rejected": -0.12104851007461548,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.58,
821
+ "learning_rate": 6.409547664531734e-07,
822
+ "logits/chosen": -2.9336235523223877,
823
+ "logits/rejected": -2.8529655933380127,
824
+ "logps/chosen": -130.2211151123047,
825
+ "logps/rejected": -122.72911071777344,
826
+ "loss": 0.0063,
827
+ "rewards/accuracies": 0.800000011920929,
828
+ "rewards/chosen": 0.002978086471557617,
829
+ "rewards/margins": 0.05790657550096512,
830
+ "rewards/rejected": -0.0549284890294075,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.61,
835
+ "learning_rate": 5.549106142039018e-07,
836
+ "logits/chosen": -3.001368522644043,
837
+ "logits/rejected": -2.9931020736694336,
838
+ "logps/chosen": -122.59478759765625,
839
+ "logps/rejected": -139.2528839111328,
840
+ "loss": 0.0343,
841
+ "rewards/accuracies": 0.6499999761581421,
842
+ "rewards/chosen": -0.03542733192443848,
843
+ "rewards/margins": 0.028016597032546997,
844
+ "rewards/rejected": -0.06344393640756607,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.64,
849
+ "learning_rate": 4.743428469705336e-07,
850
+ "logits/chosen": -2.895446300506592,
851
+ "logits/rejected": -2.805732011795044,
852
+ "logps/chosen": -77.88665771484375,
853
+ "logps/rejected": -86.38732147216797,
854
+ "loss": 0.0152,
855
+ "rewards/accuracies": 0.8500000238418579,
856
+ "rewards/chosen": 0.012998471967875957,
857
+ "rewards/margins": 0.09006098657846451,
858
+ "rewards/rejected": -0.07706250250339508,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.67,
863
+ "learning_rate": 3.994783027156143e-07,
864
+ "logits/chosen": -2.8439111709594727,
865
+ "logits/rejected": -2.765376567840576,
866
+ "logps/chosen": -124.60064697265625,
867
+ "logps/rejected": -115.8946533203125,
868
+ "loss": 0.0116,
869
+ "rewards/accuracies": 0.800000011920929,
870
+ "rewards/chosen": -0.0016067728865891695,
871
+ "rewards/margins": 0.08273078501224518,
872
+ "rewards/rejected": -0.08433755487203598,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.7,
877
+ "learning_rate": 3.3052776201888266e-07,
878
+ "logits/chosen": -2.8251261711120605,
879
+ "logits/rejected": -2.69478178024292,
880
+ "logps/chosen": -85.72721862792969,
881
+ "logps/rejected": -84.4536361694336,
882
+ "loss": 0.0146,
883
+ "rewards/accuracies": 0.75,
884
+ "rewards/chosen": 0.007551318965852261,
885
+ "rewards/margins": 0.08319269120693207,
886
+ "rewards/rejected": -0.07564137130975723,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.73,
891
+ "learning_rate": 2.676853546260791e-07,
892
+ "logits/chosen": -2.8717586994171143,
893
+ "logits/rejected": -2.805624485015869,
894
+ "logps/chosen": -102.53792572021484,
895
+ "logps/rejected": -89.75617980957031,
896
+ "loss": 0.0086,
897
+ "rewards/accuracies": 0.8999999761581421,
898
+ "rewards/chosen": -0.008890565484762192,
899
+ "rewards/margins": 0.04497462511062622,
900
+ "rewards/rejected": -0.05386519432067871,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.76,
905
+ "learning_rate": 2.111280128780638e-07,
906
+ "logits/chosen": -3.0598039627075195,
907
+ "logits/rejected": -2.9559364318847656,
908
+ "logps/chosen": -84.91885375976562,
909
+ "logps/rejected": -75.99688720703125,
910
+ "loss": 0.0162,
911
+ "rewards/accuracies": 0.75,
912
+ "rewards/chosen": -0.025450268760323524,
913
+ "rewards/margins": 0.03124385140836239,
914
+ "rewards/rejected": -0.056694112718105316,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.79,
919
+ "learning_rate": 1.610149735590949e-07,
920
+ "logits/chosen": -2.929579496383667,
921
+ "logits/rejected": -2.8060431480407715,
922
+ "logps/chosen": -60.950294494628906,
923
+ "logps/rejected": -59.30121994018555,
924
+ "loss": 0.0299,
925
+ "rewards/accuracies": 0.8500000238418579,
926
+ "rewards/chosen": -0.03121623769402504,
927
+ "rewards/margins": 0.015263142995536327,
928
+ "rewards/rejected": -0.04647938907146454,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.82,
933
+ "learning_rate": 1.1748732956682023e-07,
934
+ "logits/chosen": -3.107717275619507,
935
+ "logits/rejected": -2.9001567363739014,
936
+ "logps/chosen": -90.45118713378906,
937
+ "logps/rejected": -84.7816390991211,
938
+ "loss": 0.0366,
939
+ "rewards/accuracies": 0.75,
940
+ "rewards/chosen": -0.021300766617059708,
941
+ "rewards/margins": 0.0555335208773613,
942
+ "rewards/rejected": -0.0768342837691307,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.82,
947
+ "eval_logits/chosen": -2.3873283863067627,
948
+ "eval_logits/rejected": -2.1899969577789307,
949
+ "eval_logps/chosen": -288.27227783203125,
950
+ "eval_logps/rejected": -264.630126953125,
951
+ "eval_loss": 0.011489564552903175,
952
+ "eval_rewards/accuracies": 0.4000000059604645,
953
+ "eval_rewards/chosen": -0.08764798194169998,
954
+ "eval_rewards/margins": -0.02295052260160446,
955
+ "eval_rewards/rejected": -0.06469745188951492,
956
+ "eval_runtime": 1422.4891,
957
+ "eval_samples_per_second": 1.406,
958
+ "eval_steps_per_second": 0.703,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.85,
963
+ "learning_rate": 8.066763266625283e-08,
964
+ "logits/chosen": -2.9506607055664062,
965
+ "logits/rejected": -2.825108051300049,
966
+ "logps/chosen": -100.47017669677734,
967
+ "logps/rejected": -127.78892517089844,
968
+ "loss": 0.0349,
969
+ "rewards/accuracies": 0.6499999761581421,
970
+ "rewards/chosen": 0.004581383429467678,
971
+ "rewards/margins": 0.1253204494714737,
972
+ "rewards/rejected": -0.12073905766010284,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.88,
977
+ "learning_rate": 5.065954844616722e-08,
978
+ "logits/chosen": -2.9829344749450684,
979
+ "logits/rejected": -2.7227559089660645,
980
+ "logps/chosen": -101.5233154296875,
981
+ "logps/rejected": -99.42388916015625,
982
+ "loss": 0.027,
983
+ "rewards/accuracies": 0.699999988079071,
984
+ "rewards/chosen": -0.0389946773648262,
985
+ "rewards/margins": 0.016298165544867516,
986
+ "rewards/rejected": -0.05529283359646797,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 1.91,
991
+ "learning_rate": 2.7547564449386666e-08,
992
+ "logits/chosen": -2.7666523456573486,
993
+ "logits/rejected": -2.719158887863159,
994
+ "logps/chosen": -215.42599487304688,
995
+ "logps/rejected": -217.2696075439453,
996
+ "loss": 0.0308,
997
+ "rewards/accuracies": 0.699999988079071,
998
+ "rewards/chosen": -0.015156986191868782,
999
+ "rewards/margins": 0.07782644778490067,
1000
+ "rewards/rejected": -0.0929834321141243,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 1.95,
1005
+ "learning_rate": 1.1396752298723501e-08,
1006
+ "logits/chosen": -2.9985458850860596,
1007
+ "logits/rejected": -2.927014112472534,
1008
+ "logps/chosen": -109.7343521118164,
1009
+ "logps/rejected": -111.255859375,
1010
+ "loss": 0.0344,
1011
+ "rewards/accuracies": 0.800000011920929,
1012
+ "rewards/chosen": 0.020465310662984848,
1013
+ "rewards/margins": 0.10245206207036972,
1014
+ "rewards/rejected": -0.08198676258325577,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 1.98,
1019
+ "learning_rate": 2.252584488296461e-09,
1020
+ "logits/chosen": -2.9206175804138184,
1021
+ "logits/rejected": -2.8202061653137207,
1022
+ "logps/chosen": -157.34161376953125,
1023
+ "logps/rejected": -171.29029846191406,
1024
+ "loss": 0.0403,
1025
+ "rewards/accuracies": 0.8999999761581421,
1026
+ "rewards/chosen": -0.005368991754949093,
1027
+ "rewards/margins": 0.11630947887897491,
1028
+ "rewards/rejected": -0.12167844921350479,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 2.0,
1033
+ "step": 658,
1034
+ "total_flos": 0.0,
1035
+ "train_loss": 0.014390302914878392,
1036
+ "train_runtime": 11750.1681,
1037
+ "train_samples_per_second": 0.112,
1038
+ "train_steps_per_second": 0.056
1039
+ }
1040
+ ],
1041
+ "logging_steps": 10,
1042
+ "max_steps": 658,
1043
+ "num_input_tokens_seen": 0,
1044
+ "num_train_epochs": 2,
1045
+ "save_steps": 100,
1046
+ "total_flos": 0.0,
1047
+ "train_batch_size": 1,
1048
+ "trial_name": null,
1049
+ "trial_params": null
1050
+ }