RikkiXu committed on
Commit 05f301d
1 Parent(s): c4b01f4

Model save

README.md CHANGED
@@ -13,17 +13,17 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-dpo-full
 
-This model was trained from scratch on an unknown dataset.
+This model was trained from scratch on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.8003
-- Rewards/chosen: -1.8897
-- Rewards/rejected: -2.0004
-- Rewards/accuracies: 0.5273
-- Rewards/margins: 0.1107
-- Logps/rejected: -718.4238
-- Logps/chosen: -579.4417
-- Logits/rejected: -5.6556
-- Logits/chosen: -5.3947
+- Loss: 0.3183
+- Rewards/chosen: -0.6032
+- Rewards/rejected: -2.1160
+- Rewards/accuracies: 0.8711
+- Rewards/margins: 1.5128
+- Logps/rejected: -584.2130
+- Logps/chosen: -439.6992
+- Logits/rejected: -5.8852
+- Logits/chosen: -5.4031
 
 ## Model description
 
@@ -42,7 +42,7 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate: 1e-07
+- learning_rate: 5e-07
 - train_batch_size: 8
 - eval_batch_size: 8
 - seed: 42
@@ -60,14 +60,19 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
-| 0.3413 | 0.2558 | 100 | 0.7230 | -0.5409 | -0.5757 | 0.5156 | 0.0348 | -575.9554 | -444.5646 | -5.0451 | -4.8217 |
-| 0.2653 | 0.5115 | 200 | 0.7765 | -1.4996 | -1.6149 | 0.5430 | 0.1153 | -679.8810 | -540.4390 | -5.5042 | -5.2262 |
-| 0.2424 | 0.7673 | 300 | 0.8003 | -1.8897 | -2.0004 | 0.5273 | 0.1107 | -718.4238 | -579.4417 | -5.6556 | -5.3947 |
+| 0.5118 | 0.1151 | 100 | 0.5923 | -0.1120 | -0.4506 | 0.7070 | 0.3386 | -417.6701 | -390.5766 | -2.1984 | -2.2213 |
+| 0.4206 | 0.2303 | 200 | 0.5055 | -0.2913 | -1.0785 | 0.8008 | 0.7872 | -480.4641 | -408.5089 | -3.2280 | -3.1644 |
+| 0.4144 | 0.3454 | 300 | 0.4504 | -0.3084 | -1.2736 | 0.7773 | 0.9651 | -499.9700 | -410.2218 | -4.0963 | -3.8861 |
+| 0.4011 | 0.4606 | 400 | 0.4135 | -0.4247 | -1.5332 | 0.8086 | 1.1086 | -525.9362 | -421.8441 | -4.8370 | -4.5018 |
+| 0.3915 | 0.5757 | 500 | 0.3740 | -0.3892 | -1.7143 | 0.8516 | 1.3251 | -544.0394 | -418.2938 | -5.1877 | -4.7675 |
+| 0.3726 | 0.6908 | 600 | 0.3468 | -0.4807 | -1.8892 | 0.8438 | 1.4085 | -561.5286 | -427.4439 | -5.6248 | -5.1461 |
+| 0.3522 | 0.8060 | 700 | 0.3249 | -0.5431 | -2.0476 | 0.8789 | 1.5044 | -577.3692 | -433.6906 | -5.6819 | -5.2107 |
+| 0.3643 | 0.9211 | 800 | 0.3183 | -0.6032 | -2.1160 | 0.8711 | 1.5128 | -584.2130 | -439.6992 | -5.8852 | -5.4031 |
 
 
 ### Framework versions
 
-- Transformers 4.40.2
+- Transformers 4.41.1
 - Pytorch 2.1.2+cu118
-- Datasets 2.19.1
+- Datasets 2.16.1
 - Tokenizers 0.19.1
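
The Rewards/* and Logps/* columns above are the quantities typically logged during DPO training. Assuming the standard DPO objective (the card itself does not spell it out), the loss and the implicit reward are

$$
\mathcal{L}_\mathrm{DPO} = -\,\mathbb{E}_{(x,\,y_w,\,y_l)}\left[\log \sigma\!\left(r(x, y_w) - r(x, y_l)\right)\right],
\qquad
r(x, y) = \beta \log \frac{\pi_\theta(y \mid x)}{\pi_\mathrm{ref}(y \mid x)},
$$

where Rewards/chosen and Rewards/rejected are the mean implicit rewards of the preferred and rejected completions, Rewards/margins is their difference, Rewards/accuracies is the fraction of pairs with a positive margin, and Logps/chosen and Logps/rejected are the corresponding mean sequence log-probabilities $\log \pi_\theta(y \mid x)$.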
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-"epoch": 1.0,
+"epoch": 0.9994242947610823,
 "total_flos": 0.0,
-"train_loss": 0.3220548828315857,
-"train_runtime": 6253.066,
-"train_samples": 50000,
-"train_samples_per_second": 7.996,
-"train_steps_per_second": 0.063
+"train_loss": 0.4218231642850533,
+"train_runtime": 14967.0092,
+"train_samples": 111134,
+"train_samples_per_second": 7.425,
+"train_steps_per_second": 0.058
 }
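
As a quick consistency check on the new metrics (plain arithmetic on the values shown here and in trainer_state.json below):

$$
7.425 \times 14967.0 \approx 111{,}130 \;(\text{train\_samples} = 111134),
\qquad
0.058 \times 14967.0 \approx 868 \;\text{optimizer steps},
$$

which matches the global_step of 868 recorded in trainer_state.json.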
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-"_name_or_path": "/mnt/bn/xuruijie-llm/checkpoints/new_world/v1-ultral",
+"_name_or_path": "/mnt/bn/xuruijie-llm/checkpoints/hh-rlhf/sft_0521/checkpoint-5500/",
 "architectures": [
 "MistralForCausalLM"
 ],
@@ -20,7 +20,7 @@
 "sliding_window": 4096,
 "tie_word_embeddings": false,
 "torch_dtype": "bfloat16",
-"transformers_version": "4.40.2",
+"transformers_version": "4.41.1",
 "use_cache": false,
 "vocab_size": 32002
 }
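
The updated config still describes a MistralForCausalLM checkpoint in bfloat16 with a 32002-token vocabulary, now initialized from an hh-rlhf SFT checkpoint. A minimal loading sketch with Transformers 4.41.1 follows; the repo id is a placeholder (the commit does not name one), so substitute the actual Hub repo or a local path.

```python
# Minimal sketch: load a checkpoint matching this config (MistralForCausalLM,
# bfloat16, vocab_size 32002) and run a short generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "RikkiXu/zephyr-7b-dpo-full"  # placeholder id, not stated in the commit

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

inputs = tokenizer("How do I brew a good cup of coffee?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```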
generation_config.json CHANGED
@@ -2,5 +2,5 @@
 "_from_model_config": true,
 "bos_token_id": 1,
 "eos_token_id": 32000,
-"transformers_version": "4.40.2"
+"transformers_version": "4.41.1"
 }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:704bf613eb27caf571ef226e414e9760489580da4614aaafb40003e65ec17441
+oid sha256:1772010f50ecbe6265c3f184a8d8b5dc8ab8f62be22d0e73f4e9569d256161d8
 size 4943178720
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed52e2d62c0960fa29471d46fbfd02182f6f32018d50ca876ce32d35000541e1
+oid sha256:0ea8e648eccea058d158b230bb6b95d7502609f2dd4ef32a5d63fbabad21555d
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e8183375d0ae670479a1b45a3914794934dd63d5f69e72e6f0fdde82f4a37170
+oid sha256:cfe2a9017db3337dfad6fd9632d552b3e3ddfece95fee16dfc25c0568e4b0ece
 size 4540532728
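
All three weight shards above are Git LFS pointer files: each records the sha256 of the shard's contents (oid) and its length in bytes (size). In this commit only the oids change while the sizes stay identical, as expected when tensors of the same shapes and dtype are re-saved with new values. A small sketch for checking a locally downloaded shard against its pointer (the path is a placeholder):

```python
# Sketch: verify a downloaded shard against its Git LFS pointer fields
# (oid = sha256 of the file contents, size = length in bytes).
import hashlib
from pathlib import Path

shard = Path("model-00001-of-00003.safetensors")  # placeholder local path
expected_oid = "1772010f50ecbe6265c3f184a8d8b5dc8ab8f62be22d0e73f4e9569d256161d8"
expected_size = 4943178720

digest = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert shard.stat().st_size == expected_size, "size mismatch"
assert digest.hexdigest() == expected_oid, "sha256 mismatch"
print("shard matches its LFS pointer")
```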
runs/May28_00-57-45_n136-082-130/events.out.tfevents.1716829733.n136-082-130.1949438.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8e179a4c81b56c69336e03277ccd93100715fe2883357d2cca38859a90e7a71
-size 66596
+oid sha256:3498debe04310e8ca2fc002ae30874b9006ff0f8749900e89bbc45c64c8854a5
+size 71078
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-"epoch": 1.0,
+"epoch": 0.9994242947610823,
 "total_flos": 0.0,
-"train_loss": 0.3220548828315857,
-"train_runtime": 6253.066,
-"train_samples": 50000,
-"train_samples_per_second": 7.996,
-"train_steps_per_second": 0.063
+"train_loss": 0.4218231642850533,
+"train_runtime": 14967.0092,
+"train_samples": 111134,
+"train_samples_per_second": 7.425,
+"train_steps_per_second": 0.058
 }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 391,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0025575447570332483,
13
- "grad_norm": 42.05885932037307,
14
- "learning_rate": 2.5e-09,
15
- "logits/chosen": -4.623842239379883,
16
- "logits/rejected": -4.85917854309082,
17
- "logps/chosen": -239.31422424316406,
18
- "logps/rejected": -207.56365966796875,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,653 +24,1450 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.02557544757033248,
28
- "grad_norm": 39.560773735648084,
29
- "learning_rate": 2.5e-08,
30
- "logits/chosen": -4.334544658660889,
31
- "logits/rejected": -4.644796848297119,
32
- "logps/chosen": -265.15618896484375,
33
- "logps/rejected": -215.6714630126953,
34
- "loss": 0.693,
35
- "rewards/accuracies": 0.4166666567325592,
36
- "rewards/chosen": -0.0004928099224343896,
37
- "rewards/margins": -0.0008595392573624849,
38
- "rewards/rejected": 0.00036672933492809534,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.05115089514066496,
43
- "grad_norm": 40.83271143256618,
44
- "learning_rate": 5e-08,
45
- "logits/chosen": -4.509532928466797,
46
- "logits/rejected": -4.744012832641602,
47
- "logps/chosen": -267.80267333984375,
48
- "logps/rejected": -216.80471801757812,
49
- "loss": 0.6919,
50
- "rewards/accuracies": 0.612500011920929,
51
- "rewards/chosen": 0.0013727399054914713,
52
- "rewards/margins": 0.0033264080993831158,
53
- "rewards/rejected": -0.0019536681938916445,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.07672634271099744,
58
- "grad_norm": 43.48154475134036,
59
- "learning_rate": 7.5e-08,
60
- "logits/chosen": -4.5965423583984375,
61
- "logits/rejected": -4.777901649475098,
62
- "logps/chosen": -257.59088134765625,
63
- "logps/rejected": -215.49658203125,
64
- "loss": 0.6845,
65
- "rewards/accuracies": 0.7875000238418579,
66
- "rewards/chosen": 0.00941941887140274,
67
- "rewards/margins": 0.019057607278227806,
68
- "rewards/rejected": -0.00963818933814764,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.10230179028132992,
73
- "grad_norm": 43.11247032025707,
74
- "learning_rate": 1e-07,
75
- "logits/chosen": -4.648722171783447,
76
- "logits/rejected": -4.745718002319336,
77
- "logps/chosen": -250.10897827148438,
78
- "logps/rejected": -223.86532592773438,
79
- "loss": 0.6588,
80
- "rewards/accuracies": 0.856249988079071,
81
- "rewards/chosen": 0.03490210697054863,
82
- "rewards/margins": 0.07684428989887238,
83
- "rewards/rejected": -0.041942186653614044,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.1278772378516624,
88
- "grad_norm": 47.11742069616159,
89
- "learning_rate": 9.979985922607475e-08,
90
- "logits/chosen": -4.593738555908203,
91
- "logits/rejected": -4.8337082862854,
92
- "logps/chosen": -267.30694580078125,
93
- "logps/rejected": -239.9588623046875,
94
- "loss": 0.6057,
95
- "rewards/accuracies": 0.862500011920929,
96
- "rewards/chosen": 0.04662395641207695,
97
- "rewards/margins": 0.1874973475933075,
98
- "rewards/rejected": -0.14087337255477905,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.1534526854219949,
103
- "grad_norm": 39.18274034042972,
104
- "learning_rate": 9.92010391574745e-08,
105
- "logits/chosen": -4.788964748382568,
106
- "logits/rejected": -4.883444786071777,
107
- "logps/chosen": -237.8981475830078,
108
- "logps/rejected": -257.84942626953125,
109
- "loss": 0.5174,
110
- "rewards/accuracies": 0.8500000238418579,
111
- "rewards/chosen": 0.03224308043718338,
112
- "rewards/margins": 0.46052321791648865,
113
- "rewards/rejected": -0.4282800555229187,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.17902813299232737,
118
- "grad_norm": 34.832880831116846,
119
- "learning_rate": 9.820833372667812e-08,
120
- "logits/chosen": -4.657534599304199,
121
- "logits/rejected": -4.817151069641113,
122
- "logps/chosen": -249.996337890625,
123
- "logps/rejected": -280.097412109375,
124
- "loss": 0.4614,
125
- "rewards/accuracies": 0.893750011920929,
126
- "rewards/chosen": 0.031456105411052704,
127
- "rewards/margins": 0.6548057198524475,
128
- "rewards/rejected": -0.6233495473861694,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.20460358056265984,
133
- "grad_norm": 36.47722570862778,
134
- "learning_rate": 9.682969016701356e-08,
135
- "logits/chosen": -4.626967430114746,
136
- "logits/rejected": -4.778214454650879,
137
- "logps/chosen": -250.9975128173828,
138
- "logps/rejected": -311.1219177246094,
139
- "loss": 0.3904,
140
- "rewards/accuracies": 0.8500000238418579,
141
- "rewards/chosen": 0.10222460329532623,
142
- "rewards/margins": 0.9102567434310913,
143
- "rewards/rejected": -0.8080320358276367,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.23017902813299232,
148
- "grad_norm": 30.998854450156045,
149
- "learning_rate": 9.507614539004081e-08,
150
- "logits/chosen": -4.739785194396973,
151
- "logits/rejected": -4.909841060638428,
152
- "logps/chosen": -237.671875,
153
- "logps/rejected": -307.8204040527344,
154
- "loss": 0.3509,
155
- "rewards/accuracies": 0.9125000238418579,
156
- "rewards/chosen": 0.1354086697101593,
157
- "rewards/margins": 1.1853126287460327,
158
- "rewards/rejected": -1.0499038696289062,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.2557544757033248,
163
- "grad_norm": 42.52785579314538,
164
- "learning_rate": 9.296173762811083e-08,
165
- "logits/chosen": -4.647661209106445,
166
- "logits/rejected": -4.924945831298828,
167
- "logps/chosen": -244.45303344726562,
168
- "logps/rejected": -355.6828918457031,
169
- "loss": 0.3413,
170
- "rewards/accuracies": 0.893750011920929,
171
- "rewards/chosen": 0.12265179306268692,
172
- "rewards/margins": 1.4083904027938843,
173
- "rewards/rejected": -1.2857385873794556,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.2557544757033248,
178
- "eval_logits/chosen": -4.821703910827637,
179
- "eval_logits/rejected": -5.045117378234863,
180
- "eval_logps/chosen": -444.5645751953125,
181
- "eval_logps/rejected": -575.9554443359375,
182
- "eval_loss": 0.7230384349822998,
183
- "eval_rewards/accuracies": 0.515625,
184
- "eval_rewards/chosen": -0.5409007668495178,
185
- "eval_rewards/margins": 0.03477693349123001,
186
- "eval_rewards/rejected": -0.575677752494812,
187
- "eval_runtime": 98.6304,
188
- "eval_samples_per_second": 20.278,
189
- "eval_steps_per_second": 0.324,
190
  "step": 100
191
  },
192
  {
193
- "epoch": 0.2813299232736573,
194
- "grad_norm": 42.40417010662429,
195
- "learning_rate": 9.050339404945832e-08,
196
- "logits/chosen": -4.8084492683410645,
197
- "logits/rejected": -5.027788162231445,
198
- "logps/chosen": -247.86376953125,
199
- "logps/rejected": -369.55267333984375,
200
- "loss": 0.3143,
201
- "rewards/accuracies": 0.90625,
202
- "rewards/chosen": 0.01321962010115385,
203
- "rewards/margins": 1.5418872833251953,
204
- "rewards/rejected": -1.528667688369751,
205
  "step": 110
206
  },
207
  {
208
- "epoch": 0.3069053708439898,
209
- "grad_norm": 31.11429497548564,
210
- "learning_rate": 8.77207952455395e-08,
211
- "logits/chosen": -4.781357765197754,
212
- "logits/rejected": -5.055319786071777,
213
- "logps/chosen": -271.8451843261719,
214
- "logps/rejected": -396.73046875,
215
- "loss": 0.3042,
216
- "rewards/accuracies": 0.9125000238418579,
217
- "rewards/chosen": 0.041443757712841034,
218
- "rewards/margins": 1.7226619720458984,
219
- "rewards/rejected": -1.6812183856964111,
220
  "step": 120
221
  },
222
  {
223
- "epoch": 0.33248081841432225,
224
- "grad_norm": 32.05773581279916,
225
- "learning_rate": 8.463621767547997e-08,
226
- "logits/chosen": -4.876931190490723,
227
- "logits/rejected": -5.202266693115234,
228
- "logps/chosen": -264.2982482910156,
229
- "logps/rejected": -409.0570983886719,
230
- "loss": 0.2914,
231
- "rewards/accuracies": 0.8500000238418579,
232
- "rewards/chosen": -0.058543670922517776,
233
- "rewards/margins": 1.8787403106689453,
234
- "rewards/rejected": -1.937284231185913,
235
  "step": 130
236
  },
237
  {
238
- "epoch": 0.35805626598465473,
239
- "grad_norm": 31.96087329942538,
240
- "learning_rate": 8.127435532896387e-08,
241
- "logits/chosen": -4.971903324127197,
242
- "logits/rejected": -5.277985095977783,
243
- "logps/chosen": -305.4132385253906,
244
- "logps/rejected": -457.46343994140625,
245
- "loss": 0.274,
246
- "rewards/accuracies": 0.8812500238418579,
247
- "rewards/chosen": -0.2121816873550415,
248
- "rewards/margins": 2.040717601776123,
249
- "rewards/rejected": -2.252899408340454,
250
  "step": 140
251
  },
252
  {
253
- "epoch": 0.3836317135549872,
254
- "grad_norm": 40.46461234858551,
255
- "learning_rate": 7.766212203526569e-08,
256
- "logits/chosen": -5.087113857269287,
257
- "logits/rejected": -5.368134498596191,
258
- "logps/chosen": -274.01080322265625,
259
- "logps/rejected": -457.38330078125,
260
- "loss": 0.2819,
261
- "rewards/accuracies": 0.875,
262
- "rewards/chosen": -0.2159254252910614,
263
- "rewards/margins": 2.166714906692505,
264
- "rewards/rejected": -2.3826401233673096,
265
  "step": 150
266
  },
267
  {
268
- "epoch": 0.4092071611253197,
269
- "grad_norm": 32.057320142788335,
270
- "learning_rate": 7.382843600106538e-08,
271
- "logits/chosen": -5.177260398864746,
272
- "logits/rejected": -5.416450023651123,
273
- "logps/chosen": -284.1901550292969,
274
- "logps/rejected": -474.3257751464844,
275
- "loss": 0.2436,
276
- "rewards/accuracies": 0.887499988079071,
277
- "rewards/chosen": -0.3273366093635559,
278
- "rewards/margins": 2.2598299980163574,
279
- "rewards/rejected": -2.5871663093566895,
280
  "step": 160
281
  },
282
  {
283
- "epoch": 0.43478260869565216,
284
- "grad_norm": 33.151157821087715,
285
- "learning_rate": 6.980398830195784e-08,
286
- "logits/chosen": -5.109088897705078,
287
- "logits/rejected": -5.438628196716309,
288
- "logps/chosen": -296.1925964355469,
289
- "logps/rejected": -516.4288940429688,
290
- "loss": 0.2364,
291
- "rewards/accuracies": 0.9375,
292
- "rewards/chosen": -0.3511837124824524,
293
- "rewards/margins": 2.6150753498077393,
294
- "rewards/rejected": -2.966259002685547,
295
  "step": 170
296
  },
297
  {
298
- "epoch": 0.46035805626598464,
299
- "grad_norm": 34.18806970089564,
300
- "learning_rate": 6.562099718102787e-08,
301
- "logits/chosen": -5.2773332595825195,
302
- "logits/rejected": -5.568037509918213,
303
- "logps/chosen": -284.951904296875,
304
- "logps/rejected": -486.5365295410156,
305
- "loss": 0.2628,
306
- "rewards/accuracies": 0.875,
307
- "rewards/chosen": -0.480882465839386,
308
- "rewards/margins": 2.4242804050445557,
309
- "rewards/rejected": -2.9051625728607178,
310
  "step": 180
311
  },
312
  {
313
- "epoch": 0.4859335038363171,
314
- "grad_norm": 33.03269272782741,
315
- "learning_rate": 6.131295012148612e-08,
316
- "logits/chosen": -5.19248104095459,
317
- "logits/rejected": -5.355208396911621,
318
- "logps/chosen": -311.060791015625,
319
- "logps/rejected": -542.6156005859375,
320
- "loss": 0.2517,
321
- "rewards/accuracies": 0.8687499761581421,
322
- "rewards/chosen": -0.5016793012619019,
323
- "rewards/margins": 2.5728163719177246,
324
- "rewards/rejected": -3.074495792388916,
325
  "step": 190
326
  },
327
  {
328
- "epoch": 0.5115089514066496,
329
- "grad_norm": 40.925552268276135,
330
- "learning_rate": 5.691433575823665e-08,
331
- "logits/chosen": -5.236765384674072,
332
- "logits/rejected": -5.465119361877441,
333
- "logps/chosen": -302.1981201171875,
334
- "logps/rejected": -515.2794799804688,
335
- "loss": 0.2653,
336
- "rewards/accuracies": 0.875,
337
- "rewards/chosen": -0.5005888342857361,
338
- "rewards/margins": 2.4987406730651855,
339
- "rewards/rejected": -2.9993293285369873,
340
  "step": 200
341
  },
342
  {
343
- "epoch": 0.5115089514066496,
344
- "eval_logits/chosen": -5.226232528686523,
345
- "eval_logits/rejected": -5.50424337387085,
346
- "eval_logps/chosen": -540.43896484375,
347
- "eval_logps/rejected": -679.8809814453125,
348
- "eval_loss": 0.7765124440193176,
349
- "eval_rewards/accuracies": 0.54296875,
350
- "eval_rewards/chosen": -1.4996453523635864,
351
- "eval_rewards/margins": 0.11528739333152771,
352
- "eval_rewards/rejected": -1.6149327754974365,
353
- "eval_runtime": 98.5941,
354
- "eval_samples_per_second": 20.285,
355
- "eval_steps_per_second": 0.325,
356
  "step": 200
357
  },
358
  {
359
- "epoch": 0.5370843989769821,
360
- "grad_norm": 26.659672604447973,
361
- "learning_rate": 5.2460367774593905e-08,
362
- "logits/chosen": -5.310137748718262,
363
- "logits/rejected": -5.583542823791504,
364
- "logps/chosen": -303.67047119140625,
365
- "logps/rejected": -573.1016845703125,
366
- "loss": 0.2296,
367
- "rewards/accuracies": 0.925000011920929,
368
- "rewards/chosen": -0.37785404920578003,
369
- "rewards/margins": 3.0667028427124023,
370
- "rewards/rejected": -3.444556713104248,
371
  "step": 210
372
  },
373
  {
374
- "epoch": 0.5626598465473146,
375
- "grad_norm": 35.355172011912686,
376
- "learning_rate": 4.798670299452925e-08,
377
- "logits/chosen": -5.1389665603637695,
378
- "logits/rejected": -5.567061424255371,
379
- "logps/chosen": -304.0540466308594,
380
- "logps/rejected": -569.4851684570312,
381
- "loss": 0.245,
382
- "rewards/accuracies": 0.90625,
383
- "rewards/chosen": -0.41242700815200806,
384
- "rewards/margins": 3.0356929302215576,
385
- "rewards/rejected": -3.4481201171875,
386
  "step": 220
387
  },
388
  {
389
- "epoch": 0.5882352941176471,
390
- "grad_norm": 37.35765448344736,
391
- "learning_rate": 4.3529155927297226e-08,
392
- "logits/chosen": -5.210625648498535,
393
- "logits/rejected": -5.601117134094238,
394
- "logps/chosen": -323.33135986328125,
395
- "logps/rejected": -591.130126953125,
396
- "loss": 0.2477,
397
- "rewards/accuracies": 0.90625,
398
- "rewards/chosen": -0.623622715473175,
399
- "rewards/margins": 3.050567150115967,
400
- "rewards/rejected": -3.674190044403076,
401
  "step": 230
402
  },
403
  {
404
- "epoch": 0.6138107416879796,
405
- "grad_norm": 27.168387739658527,
406
- "learning_rate": 3.9123412049691636e-08,
407
- "logits/chosen": -5.26107120513916,
408
- "logits/rejected": -5.582613945007324,
409
- "logps/chosen": -341.65289306640625,
410
- "logps/rejected": -593.1688232421875,
411
- "loss": 0.2349,
412
- "rewards/accuracies": 0.9125000238418579,
413
- "rewards/chosen": -0.68829745054245,
414
- "rewards/margins": 3.034055233001709,
415
- "rewards/rejected": -3.7223525047302246,
416
  "step": 240
417
  },
418
  {
419
- "epoch": 0.639386189258312,
420
- "grad_norm": 34.59601076495169,
421
- "learning_rate": 3.480474212128766e-08,
422
- "logits/chosen": -5.441601753234863,
423
- "logits/rejected": -5.72822380065918,
424
- "logps/chosen": -329.5417175292969,
425
- "logps/rejected": -537.7394409179688,
426
- "loss": 0.2339,
427
- "rewards/accuracies": 0.84375,
428
- "rewards/chosen": -0.8084025382995605,
429
- "rewards/margins": 2.4929001331329346,
430
- "rewards/rejected": -3.301302433013916,
431
  "step": 250
432
  },
433
  {
434
- "epoch": 0.6649616368286445,
435
- "grad_norm": 44.5395657806438,
436
- "learning_rate": 3.060771981975726e-08,
437
- "logits/chosen": -5.302738666534424,
438
- "logits/rejected": -5.622676372528076,
439
- "logps/chosen": -326.24041748046875,
440
- "logps/rejected": -637.6575927734375,
441
- "loss": 0.2325,
442
- "rewards/accuracies": 0.90625,
443
- "rewards/chosen": -0.7219182848930359,
444
- "rewards/margins": 3.4571731090545654,
445
- "rewards/rejected": -4.179091453552246,
446
  "step": 260
447
  },
448
  {
449
- "epoch": 0.690537084398977,
450
- "grad_norm": 33.64914034772639,
451
- "learning_rate": 2.6565944956764818e-08,
452
- "logits/chosen": -5.4421281814575195,
453
- "logits/rejected": -5.695931911468506,
454
- "logps/chosen": -332.70892333984375,
455
- "logps/rejected": -598.5055541992188,
456
- "loss": 0.2433,
457
- "rewards/accuracies": 0.918749988079071,
458
- "rewards/chosen": -0.7167563438415527,
459
- "rewards/margins": 3.118049144744873,
460
- "rewards/rejected": -3.834805727005005,
461
  "step": 270
462
  },
463
  {
464
- "epoch": 0.7161125319693095,
465
- "grad_norm": 38.28164920230575,
466
- "learning_rate": 2.2711774490274766e-08,
467
- "logits/chosen": -5.344332695007324,
468
- "logits/rejected": -5.591184616088867,
469
- "logps/chosen": -331.06939697265625,
470
- "logps/rejected": -640.959228515625,
471
- "loss": 0.2174,
472
- "rewards/accuracies": 0.925000011920929,
473
- "rewards/chosen": -0.665625810623169,
474
- "rewards/margins": 3.3145720958709717,
475
- "rewards/rejected": -3.9801979064941406,
476
  "step": 280
477
  },
478
  {
479
- "epoch": 0.7416879795396419,
480
- "grad_norm": 42.555865291815444,
481
- "learning_rate": 1.9076063486687256e-08,
482
- "logits/chosen": -5.223475933074951,
483
- "logits/rejected": -5.618660926818848,
484
- "logps/chosen": -328.63055419921875,
485
- "logps/rejected": -579.0905151367188,
486
- "loss": 0.2228,
487
- "rewards/accuracies": 0.918749988079071,
488
- "rewards/chosen": -0.5602216124534607,
489
- "rewards/margins": 3.074389696121216,
490
- "rewards/rejected": -3.634611129760742,
491
  "step": 290
492
  },
493
  {
494
- "epoch": 0.7672634271099744,
495
- "grad_norm": 40.820437800178965,
496
- "learning_rate": 1.5687918106563324e-08,
497
- "logits/chosen": -5.369271755218506,
498
- "logits/rejected": -5.632781028747559,
499
- "logps/chosen": -320.268798828125,
500
- "logps/rejected": -608.9943237304688,
501
- "loss": 0.2424,
502
- "rewards/accuracies": 0.862500011920929,
503
- "rewards/chosen": -0.6686061024665833,
504
- "rewards/margins": 3.2616829872131348,
505
- "rewards/rejected": -3.9302895069122314,
506
  "step": 300
507
  },
508
  {
509
- "epoch": 0.7672634271099744,
510
- "eval_logits/chosen": -5.394677639007568,
511
- "eval_logits/rejected": -5.655616283416748,
512
- "eval_logps/chosen": -579.441650390625,
513
- "eval_logps/rejected": -718.423828125,
514
- "eval_loss": 0.8002758622169495,
515
- "eval_rewards/accuracies": 0.52734375,
516
- "eval_rewards/chosen": -1.889671802520752,
517
- "eval_rewards/margins": 0.11068924516439438,
518
- "eval_rewards/rejected": -2.000361442565918,
519
- "eval_runtime": 98.5861,
520
- "eval_samples_per_second": 20.287,
521
- "eval_steps_per_second": 0.325,
522
  "step": 300
523
  },
524
  {
525
- "epoch": 0.7928388746803069,
526
- "grad_norm": 33.64379879568246,
527
- "learning_rate": 1.257446259144494e-08,
528
- "logits/chosen": -5.246872425079346,
529
- "logits/rejected": -5.653367042541504,
530
- "logps/chosen": -315.7105407714844,
531
- "logps/rejected": -625.9619140625,
532
- "loss": 0.2323,
533
- "rewards/accuracies": 0.9312499761581421,
534
- "rewards/chosen": -0.5457090735435486,
535
- "rewards/margins": 3.491931200027466,
536
- "rewards/rejected": -4.03764009475708,
537
  "step": 310
538
  },
539
  {
540
- "epoch": 0.8184143222506394,
541
- "grad_norm": 35.35694379401523,
542
- "learning_rate": 9.760622117187234e-09,
543
- "logits/chosen": -5.381436824798584,
544
- "logits/rejected": -5.7473673820495605,
545
- "logps/chosen": -314.6531677246094,
546
- "logps/rejected": -594.841552734375,
547
- "loss": 0.2466,
548
- "rewards/accuracies": 0.887499988079071,
549
- "rewards/chosen": -0.6948888897895813,
550
- "rewards/margins": 3.1890125274658203,
551
- "rewards/rejected": -3.8839008808135986,
552
  "step": 320
553
  },
554
  {
555
- "epoch": 0.8439897698209718,
556
- "grad_norm": 34.515465680243125,
557
- "learning_rate": 7.2689232521989885e-09,
558
- "logits/chosen": -5.308783531188965,
559
- "logits/rejected": -5.656357765197754,
560
- "logps/chosen": -347.4857482910156,
561
- "logps/rejected": -629.4615478515625,
562
- "loss": 0.2233,
563
- "rewards/accuracies": 0.875,
564
- "rewards/chosen": -0.7933691143989563,
565
- "rewards/margins": 3.1742498874664307,
566
- "rewards/rejected": -3.967618942260742,
567
  "step": 330
568
  },
569
  {
570
- "epoch": 0.8695652173913043,
571
- "grad_norm": 28.542655038843865,
572
- "learning_rate": 5.119313618049309e-09,
573
- "logits/chosen": -5.346091270446777,
574
- "logits/rejected": -5.756931781768799,
575
- "logps/chosen": -338.65509033203125,
576
- "logps/rejected": -575.4979858398438,
577
- "loss": 0.2174,
578
- "rewards/accuracies": 0.8999999761581421,
579
- "rewards/chosen": -0.6450907588005066,
580
- "rewards/margins": 3.0420687198638916,
581
- "rewards/rejected": -3.687159299850464,
582
  "step": 340
583
  },
584
  {
585
- "epoch": 0.8951406649616368,
586
- "grad_norm": 33.42105425863571,
587
- "learning_rate": 3.3290021961708158e-09,
588
- "logits/chosen": -5.374421119689941,
589
- "logits/rejected": -5.536851406097412,
590
- "logps/chosen": -333.8661193847656,
591
- "logps/rejected": -595.2741088867188,
592
- "loss": 0.2467,
593
- "rewards/accuracies": 0.887499988079071,
594
- "rewards/chosen": -0.7839492559432983,
595
- "rewards/margins": 2.891091823577881,
596
- "rewards/rejected": -3.6750411987304688,
597
  "step": 350
598
  },
599
  {
600
- "epoch": 0.9207161125319693,
601
- "grad_norm": 33.175441995042306,
602
- "learning_rate": 1.9123215591052013e-09,
603
- "logits/chosen": -5.3232526779174805,
604
- "logits/rejected": -5.559803485870361,
605
- "logps/chosen": -337.17694091796875,
606
- "logps/rejected": -596.7660522460938,
607
- "loss": 0.2397,
608
- "rewards/accuracies": 0.9125000238418579,
609
- "rewards/chosen": -0.7554206252098083,
610
- "rewards/margins": 2.937615156173706,
611
- "rewards/rejected": -3.693035840988159,
612
  "step": 360
613
  },
614
  {
615
- "epoch": 0.9462915601023018,
616
- "grad_norm": 33.50889046296721,
617
- "learning_rate": 8.806131292167618e-10,
618
- "logits/chosen": -5.363125801086426,
619
- "logits/rejected": -5.561426162719727,
620
- "logps/chosen": -327.09295654296875,
621
- "logps/rejected": -608.7786865234375,
622
- "loss": 0.238,
623
- "rewards/accuracies": 0.862500011920929,
624
- "rewards/chosen": -0.7027177214622498,
625
- "rewards/margins": 3.01659893989563,
626
- "rewards/rejected": -3.719316005706787,
627
  "step": 370
628
  },
629
  {
630
- "epoch": 0.9718670076726342,
631
- "grad_norm": 42.30140132740828,
632
- "learning_rate": 2.4213638345040867e-10,
633
- "logits/chosen": -5.489308834075928,
634
- "logits/rejected": -5.787456512451172,
635
- "logps/chosen": -332.35858154296875,
636
- "logps/rejected": -607.3480224609375,
637
- "loss": 0.2341,
638
- "rewards/accuracies": 0.918749988079071,
639
- "rewards/chosen": -0.7027586698532104,
640
- "rewards/margins": 3.165475368499756,
641
- "rewards/rejected": -3.8682339191436768,
642
  "step": 380
643
  },
644
  {
645
- "epoch": 0.9974424552429667,
646
- "grad_norm": 37.88179259111206,
647
- "learning_rate": 2.0027310073833516e-12,
648
- "logits/chosen": -5.485334873199463,
649
- "logits/rejected": -5.764852046966553,
650
- "logps/chosen": -331.56610107421875,
651
- "logps/rejected": -614.2138061523438,
652
- "loss": 0.2223,
653
- "rewards/accuracies": 0.90625,
654
- "rewards/chosen": -0.7189357876777649,
655
- "rewards/margins": 3.2180511951446533,
656
- "rewards/rejected": -3.9369864463806152,
657
  "step": 390
658
  },
659
  {
660
- "epoch": 1.0,
661
- "step": 391,
662
  "total_flos": 0.0,
663
- "train_loss": 0.3220548828315857,
664
- "train_runtime": 6253.066,
665
- "train_samples_per_second": 7.996,
666
- "train_steps_per_second": 0.063
667
  }
668
  ],
669
  "logging_steps": 10,
670
- "max_steps": 391,
671
  "num_input_tokens_seen": 0,
672
  "num_train_epochs": 1,
673
  "save_steps": 100,
674
  "total_flos": 0.0,
675
  "train_batch_size": 8,
676
  "trial_name": null,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9994242947610823,
5
  "eval_steps": 100,
6
+ "global_step": 868,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0011514104778353484,
13
+ "grad_norm": 35.91765211885503,
14
+ "learning_rate": 5.747126436781609e-09,
15
+ "logits/chosen": -2.086653709411621,
16
+ "logits/rejected": -2.069509267807007,
17
+ "logps/chosen": -361.22979736328125,
18
+ "logps/rejected": -328.4201354980469,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.011514104778353483,
28
+ "grad_norm": 37.62574042925476,
29
+ "learning_rate": 5.747126436781609e-08,
30
+ "logits/chosen": -2.192697763442993,
31
+ "logits/rejected": -2.1893699169158936,
32
+ "logps/chosen": -346.8982238769531,
33
+ "logps/rejected": -305.4053039550781,
34
+ "loss": 0.6929,
35
+ "rewards/accuracies": 0.4652777910232544,
36
+ "rewards/chosen": 0.00022573958267457783,
37
+ "rewards/margins": 0.00043605040991678834,
38
+ "rewards/rejected": -0.00021031053620390594,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.023028209556706966,
43
+ "grad_norm": 33.76619596156607,
44
+ "learning_rate": 1.1494252873563217e-07,
45
+ "logits/chosen": -2.170515775680542,
46
+ "logits/rejected": -2.1960134506225586,
47
+ "logps/chosen": -322.89593505859375,
48
+ "logps/rejected": -279.732177734375,
49
+ "loss": 0.6923,
50
+ "rewards/accuracies": 0.550000011920929,
51
+ "rewards/chosen": 0.006018324755132198,
52
+ "rewards/margins": 0.0009490437805652618,
53
+ "rewards/rejected": 0.005069280508905649,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.03454231433506045,
58
+ "grad_norm": 36.02949439768653,
59
+ "learning_rate": 1.7241379310344828e-07,
60
+ "logits/chosen": -2.226337194442749,
61
+ "logits/rejected": -2.215334415435791,
62
+ "logps/chosen": -343.44012451171875,
63
+ "logps/rejected": -305.6834411621094,
64
+ "loss": 0.6875,
65
+ "rewards/accuracies": 0.6499999761581421,
66
+ "rewards/chosen": 0.0371861457824707,
67
+ "rewards/margins": 0.012388146482408047,
68
+ "rewards/rejected": 0.02479800209403038,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.04605641911341393,
73
+ "grad_norm": 30.794242683432575,
74
+ "learning_rate": 2.2988505747126435e-07,
75
+ "logits/chosen": -2.3109958171844482,
76
+ "logits/rejected": -2.272737979888916,
77
+ "logps/chosen": -313.8249206542969,
78
+ "logps/rejected": -281.3092956542969,
79
+ "loss": 0.6758,
80
+ "rewards/accuracies": 0.699999988079071,
81
+ "rewards/chosen": 0.10629389435052872,
82
+ "rewards/margins": 0.035184551030397415,
83
+ "rewards/rejected": 0.071109339594841,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.057570523891767415,
88
+ "grad_norm": 29.832104382822315,
89
+ "learning_rate": 2.873563218390804e-07,
90
+ "logits/chosen": -2.4144537448883057,
91
+ "logits/rejected": -2.4051060676574707,
92
+ "logps/chosen": -335.85626220703125,
93
+ "logps/rejected": -322.4024658203125,
94
+ "loss": 0.664,
95
+ "rewards/accuracies": 0.606249988079071,
96
+ "rewards/chosen": 0.20595140755176544,
97
+ "rewards/margins": 0.058795731514692307,
98
+ "rewards/rejected": 0.14715565741062164,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.0690846286701209,
103
+ "grad_norm": 27.97699348851217,
104
+ "learning_rate": 3.4482758620689656e-07,
105
+ "logits/chosen": -2.4252865314483643,
106
+ "logits/rejected": -2.4110381603240967,
107
+ "logps/chosen": -293.0983581542969,
108
+ "logps/rejected": -276.4584655761719,
109
+ "loss": 0.6437,
110
+ "rewards/accuracies": 0.706250011920929,
111
+ "rewards/chosen": 0.26091432571411133,
112
+ "rewards/margins": 0.12072187662124634,
113
+ "rewards/rejected": 0.140192449092865,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.08059873344847437,
118
+ "grad_norm": 26.14817360357517,
119
+ "learning_rate": 4.0229885057471266e-07,
120
+ "logits/chosen": -2.5252156257629395,
121
+ "logits/rejected": -2.488867998123169,
122
+ "logps/chosen": -341.91156005859375,
123
+ "logps/rejected": -308.27032470703125,
124
+ "loss": 0.6192,
125
+ "rewards/accuracies": 0.7749999761581421,
126
+ "rewards/chosen": 0.3610069155693054,
127
+ "rewards/margins": 0.20518210530281067,
128
+ "rewards/rejected": 0.15582481026649475,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.09211283822682786,
133
+ "grad_norm": 26.01503586020309,
134
+ "learning_rate": 4.597701149425287e-07,
135
+ "logits/chosen": -2.443207263946533,
136
+ "logits/rejected": -2.4321365356445312,
137
+ "logps/chosen": -303.1759948730469,
138
+ "logps/rejected": -293.99212646484375,
139
+ "loss": 0.5946,
140
+ "rewards/accuracies": 0.7437499761581421,
141
+ "rewards/chosen": 0.2370336949825287,
142
+ "rewards/margins": 0.22374853491783142,
143
+ "rewards/rejected": 0.013285147957503796,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.10362694300518134,
148
+ "grad_norm": 28.597789728089687,
149
+ "learning_rate": 4.999817969178237e-07,
150
+ "logits/chosen": -2.468017578125,
151
+ "logits/rejected": -2.45894718170166,
152
+ "logps/chosen": -341.286376953125,
153
+ "logps/rejected": -346.0598449707031,
154
+ "loss": 0.5438,
155
+ "rewards/accuracies": 0.8125,
156
+ "rewards/chosen": 0.2997075915336609,
157
+ "rewards/margins": 0.4598621726036072,
158
+ "rewards/rejected": -0.16015461087226868,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.11514104778353483,
163
+ "grad_norm": 31.239635888342793,
164
+ "learning_rate": 4.996582603056428e-07,
165
+ "logits/chosen": -2.290760040283203,
166
+ "logits/rejected": -2.2722649574279785,
167
+ "logps/chosen": -325.2711181640625,
168
+ "logps/rejected": -352.16949462890625,
169
+ "loss": 0.5118,
170
+ "rewards/accuracies": 0.75,
171
+ "rewards/chosen": -0.0031170793808996677,
172
+ "rewards/margins": 0.5678674578666687,
173
+ "rewards/rejected": -0.5709845423698425,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.11514104778353483,
178
+ "eval_logits/chosen": -2.2212953567504883,
179
+ "eval_logits/rejected": -2.1984219551086426,
180
+ "eval_logps/chosen": -390.5766296386719,
181
+ "eval_logps/rejected": -417.6701354980469,
182
+ "eval_loss": 0.592314600944519,
183
+ "eval_rewards/accuracies": 0.70703125,
184
+ "eval_rewards/chosen": -0.11199207603931427,
185
+ "eval_rewards/margins": 0.3385947644710541,
186
+ "eval_rewards/rejected": -0.45058679580688477,
187
+ "eval_runtime": 98.608,
188
+ "eval_samples_per_second": 20.282,
189
+ "eval_steps_per_second": 0.325,
190
  "step": 100
191
  },
192
  {
193
+ "epoch": 0.1266551525618883,
194
+ "grad_norm": 28.87850245767613,
195
+ "learning_rate": 4.989308132738126e-07,
196
+ "logits/chosen": -2.224853754043579,
197
+ "logits/rejected": -2.1996631622314453,
198
+ "logps/chosen": -334.91888427734375,
199
+ "logps/rejected": -380.91668701171875,
200
+ "loss": 0.4719,
201
+ "rewards/accuracies": 0.731249988079071,
202
+ "rewards/chosen": 0.0493912398815155,
203
+ "rewards/margins": 0.8100606203079224,
204
+ "rewards/rejected": -0.7606694102287292,
205
  "step": 110
206
  },
207
  {
208
+ "epoch": 0.1381692573402418,
209
+ "grad_norm": 29.398659404338673,
210
+ "learning_rate": 4.978006327248536e-07,
211
+ "logits/chosen": -2.199742555618286,
212
+ "logits/rejected": -2.1492202281951904,
213
+ "logps/chosen": -314.296142578125,
214
+ "logps/rejected": -369.991455078125,
215
+ "loss": 0.4704,
216
+ "rewards/accuracies": 0.768750011920929,
217
+ "rewards/chosen": 0.09014983475208282,
218
+ "rewards/margins": 0.9132173657417297,
219
+ "rewards/rejected": -0.8230674862861633,
220
  "step": 120
221
  },
222
  {
223
+ "epoch": 0.1496833621185953,
224
+ "grad_norm": 30.44019666597221,
225
+ "learning_rate": 4.962695471250032e-07,
226
+ "logits/chosen": -2.1790311336517334,
227
+ "logits/rejected": -2.1547985076904297,
228
+ "logps/chosen": -302.8690490722656,
229
+ "logps/rejected": -415.23095703125,
230
+ "loss": 0.4555,
231
+ "rewards/accuracies": 0.768750011920929,
232
+ "rewards/chosen": 0.09897075593471527,
233
+ "rewards/margins": 1.2424136400222778,
234
+ "rewards/rejected": -1.1434428691864014,
235
  "step": 130
236
  },
237
  {
238
+ "epoch": 0.16119746689694875,
239
+ "grad_norm": 33.58601902040164,
240
+ "learning_rate": 4.94340033546025e-07,
241
+ "logits/chosen": -2.2502989768981934,
242
+ "logits/rejected": -2.2536580562591553,
243
+ "logps/chosen": -325.1845397949219,
244
+ "logps/rejected": -431.7062072753906,
245
+ "loss": 0.4345,
246
+ "rewards/accuracies": 0.8062499761581421,
247
+ "rewards/chosen": 0.23212842643260956,
248
+ "rewards/margins": 1.2962288856506348,
249
+ "rewards/rejected": -1.0641005039215088,
250
  "step": 140
251
  },
252
  {
253
+ "epoch": 0.17271157167530224,
254
+ "grad_norm": 32.120902840689595,
255
+ "learning_rate": 4.920152136576705e-07,
256
+ "logits/chosen": -2.44754958152771,
257
+ "logits/rejected": -2.4280953407287598,
258
+ "logps/chosen": -325.13916015625,
259
+ "logps/rejected": -465.1835021972656,
260
+ "loss": 0.4604,
261
+ "rewards/accuracies": 0.78125,
262
+ "rewards/chosen": 0.07414035499095917,
263
+ "rewards/margins": 1.312412977218628,
264
+ "rewards/rejected": -1.2382725477218628,
265
  "step": 150
266
  },
267
  {
268
+ "epoch": 0.18422567645365573,
269
+ "grad_norm": 27.142754060910285,
270
+ "learning_rate": 4.892988486772756e-07,
271
+ "logits/chosen": -2.7220418453216553,
272
+ "logits/rejected": -2.731748342514038,
273
+ "logps/chosen": -341.7224426269531,
274
+ "logps/rejected": -451.0387268066406,
275
+ "loss": 0.4331,
276
+ "rewards/accuracies": 0.8062499761581421,
277
+ "rewards/chosen": 0.20678754150867462,
278
+ "rewards/margins": 1.3990733623504639,
279
+ "rewards/rejected": -1.1922857761383057,
280
  "step": 160
281
  },
282
  {
283
+ "epoch": 0.19573978123200922,
284
+ "grad_norm": 29.918359187167102,
285
+ "learning_rate": 4.861953332846629e-07,
286
+ "logits/chosen": -2.796257495880127,
287
+ "logits/rejected": -2.810292959213257,
288
+ "logps/chosen": -360.57257080078125,
289
+ "logps/rejected": -441.2469787597656,
290
+ "loss": 0.4495,
291
+ "rewards/accuracies": 0.762499988079071,
292
+ "rewards/chosen": 0.04986714571714401,
293
+ "rewards/margins": 1.442570447921753,
294
+ "rewards/rejected": -1.3927034139633179,
295
  "step": 170
296
  },
297
  {
298
+ "epoch": 0.20725388601036268,
299
+ "grad_norm": 28.18581518610586,
300
+ "learning_rate": 4.827096885121953e-07,
301
+ "logits/chosen": -2.9461441040039062,
302
+ "logits/rejected": -2.936654567718506,
303
+ "logps/chosen": -342.01666259765625,
304
+ "logps/rejected": -421.7103576660156,
305
+ "loss": 0.435,
306
+ "rewards/accuracies": 0.793749988079071,
307
+ "rewards/chosen": -0.026858195662498474,
308
+ "rewards/margins": 1.3959574699401855,
309
+ "rewards/rejected": -1.4228156805038452,
310
  "step": 180
311
  },
312
  {
313
+ "epoch": 0.21876799078871617,
314
+ "grad_norm": 35.53737142925795,
315
+ "learning_rate": 4.788475536214821e-07,
316
+ "logits/chosen": -3.022078275680542,
317
+ "logits/rejected": -3.0052285194396973,
318
+ "logps/chosen": -336.94830322265625,
319
+ "logps/rejected": -493.62359619140625,
320
+ "loss": 0.4228,
321
+ "rewards/accuracies": 0.7250000238418579,
322
+ "rewards/chosen": 0.03777497634291649,
323
+ "rewards/margins": 1.5011249780654907,
324
+ "rewards/rejected": -1.4633500576019287,
325
  "step": 190
326
  },
327
  {
328
+ "epoch": 0.23028209556706966,
329
+ "grad_norm": 32.357788149040054,
330
+ "learning_rate": 4.746151769798818e-07,
331
+ "logits/chosen": -3.098576545715332,
332
+ "logits/rejected": -3.122755527496338,
333
+ "logps/chosen": -350.237060546875,
334
+ "logps/rejected": -476.60345458984375,
335
+ "loss": 0.4206,
336
+ "rewards/accuracies": 0.7562500238418579,
337
+ "rewards/chosen": 0.029024356976151466,
338
+ "rewards/margins": 1.669870376586914,
339
+ "rewards/rejected": -1.6408460140228271,
340
  "step": 200
341
  },
342
  {
343
+ "epoch": 0.23028209556706966,
344
+ "eval_logits/chosen": -3.164449691772461,
345
+ "eval_logits/rejected": -3.2280213832855225,
346
+ "eval_logps/chosen": -408.5089416503906,
347
+ "eval_logps/rejected": -480.46405029296875,
348
+ "eval_loss": 0.5054616928100586,
349
+ "eval_rewards/accuracies": 0.80078125,
350
+ "eval_rewards/chosen": -0.2913154363632202,
351
+ "eval_rewards/margins": 0.7872099280357361,
352
+ "eval_rewards/rejected": -1.078525424003601,
353
+ "eval_runtime": 98.2744,
354
+ "eval_samples_per_second": 20.351,
355
+ "eval_steps_per_second": 0.326,
356
  "step": 200
357
  },
358
  {
359
+ "epoch": 0.24179620034542315,
360
+ "grad_norm": 33.674165033906036,
361
+ "learning_rate": 4.7001940595156055e-07,
362
+ "logits/chosen": -3.1950924396514893,
363
+ "logits/rejected": -3.276893138885498,
364
+ "logps/chosen": -364.2984313964844,
365
+ "logps/rejected": -458.85418701171875,
366
+ "loss": 0.4096,
367
+ "rewards/accuracies": 0.78125,
368
+ "rewards/chosen": -0.04986700415611267,
369
+ "rewards/margins": 1.6173715591430664,
370
+ "rewards/rejected": -1.6672385931015015,
371
  "step": 210
372
  },
373
  {
374
+ "epoch": 0.2533103051237766,
375
+ "grad_norm": 33.42353087043008,
376
+ "learning_rate": 4.650676758194623e-07,
377
+ "logits/chosen": -3.289186477661133,
378
+ "logits/rejected": -3.4233367443084717,
379
+ "logps/chosen": -340.89410400390625,
380
+ "logps/rejected": -531.8297729492188,
381
+ "loss": 0.417,
382
+ "rewards/accuracies": 0.7749999761581421,
383
+ "rewards/chosen": -0.3013092875480652,
384
+ "rewards/margins": 2.0576224327087402,
385
+ "rewards/rejected": -2.35893177986145,
386
  "step": 220
387
  },
388
  {
389
+ "epoch": 0.26482440990213013,
390
+ "grad_norm": 28.030706610514635,
391
+ "learning_rate": 4.5976799775611215e-07,
392
+ "logits/chosen": -3.4384427070617676,
393
+ "logits/rejected": -3.6002049446105957,
394
+ "logps/chosen": -357.27099609375,
395
+ "logps/rejected": -521.6351318359375,
396
+ "loss": 0.4404,
397
+ "rewards/accuracies": 0.7875000238418579,
398
+ "rewards/chosen": -0.060726016759872437,
399
+ "rewards/margins": 2.054325580596924,
400
+ "rewards/rejected": -2.115051746368408,
401
  "step": 230
402
  },
403
  {
404
+ "epoch": 0.2763385146804836,
405
+ "grad_norm": 30.164608033500873,
406
+ "learning_rate": 4.5412894586271543e-07,
407
+ "logits/chosen": -3.5104153156280518,
408
+ "logits/rejected": -3.591907024383545,
409
+ "logps/chosen": -341.6837463378906,
410
+ "logps/rejected": -471.0796813964844,
411
+ "loss": 0.4392,
412
+ "rewards/accuracies": 0.762499988079071,
413
+ "rewards/chosen": -0.23911134898662567,
414
+ "rewards/margins": 1.611322045326233,
415
+ "rewards/rejected": -1.850433588027954,
416
  "step": 240
417
  },
418
  {
419
+ "epoch": 0.28785261945883706,
420
+ "grad_norm": 31.949435858685035,
421
+ "learning_rate": 4.481596432975201e-07,
422
+ "logits/chosen": -3.528832197189331,
423
+ "logits/rejected": -3.651289463043213,
424
+ "logps/chosen": -336.5597229003906,
425
+ "logps/rejected": -484.8773498535156,
426
+ "loss": 0.425,
427
+ "rewards/accuracies": 0.7562500238418579,
428
+ "rewards/chosen": 0.026675838977098465,
429
+ "rewards/margins": 1.7153713703155518,
430
+ "rewards/rejected": -1.6886956691741943,
431
  "step": 250
432
  },
433
  {
434
+ "epoch": 0.2993667242371906,
435
+ "grad_norm": 27.939909687462926,
436
+ "learning_rate": 4.41869747515886e-07,
437
+ "logits/chosen": -3.489166736602783,
438
+ "logits/rejected": -3.7278106212615967,
439
+ "logps/chosen": -356.98907470703125,
440
+ "logps/rejected": -521.9197387695312,
441
+ "loss": 0.4148,
442
+ "rewards/accuracies": 0.8500000238418579,
443
+ "rewards/chosen": 0.07938538491725922,
444
+ "rewards/margins": 2.32578706741333,
445
+ "rewards/rejected": -2.24640154838562,
446
  "step": 260
447
  },
448
  {
449
+ "epoch": 0.31088082901554404,
450
+ "grad_norm": 34.336437982786,
451
+ "learning_rate": 4.352694346459396e-07,
452
+ "logits/chosen": -3.69819974899292,
453
+ "logits/rejected": -3.856245756149292,
454
+ "logps/chosen": -312.3550109863281,
455
+ "logps/rejected": -512.3087768554688,
456
+ "loss": 0.3868,
457
+ "rewards/accuracies": 0.84375,
458
+ "rewards/chosen": 0.007610364351421595,
459
+ "rewards/margins": 2.3179588317871094,
460
+ "rewards/rejected": -2.3103487491607666,
461
  "step": 270
462
  },
463
  {
464
+ "epoch": 0.3223949337938975,
465
+ "grad_norm": 31.93422033932675,
466
+ "learning_rate": 4.2836938302509256e-07,
467
+ "logits/chosen": -3.8322901725769043,
468
+ "logits/rejected": -4.021459579467773,
469
+ "logps/chosen": -364.43157958984375,
470
+ "logps/rejected": -556.7454223632812,
471
+ "loss": 0.3795,
472
+ "rewards/accuracies": 0.8187500238418579,
473
+ "rewards/chosen": -0.3510279357433319,
474
+ "rewards/margins": 2.118349075317383,
475
+ "rewards/rejected": -2.469377040863037,
476
  "step": 280
477
  },
478
  {
479
+ "epoch": 0.333909038572251,
480
+ "grad_norm": 43.67643614347539,
481
+ "learning_rate": 4.2118075592405874e-07,
482
+ "logits/chosen": -4.014069080352783,
483
+ "logits/rejected": -4.166284561157227,
484
+ "logps/chosen": -366.17498779296875,
485
+ "logps/rejected": -511.95806884765625,
486
+ "loss": 0.4028,
487
+ "rewards/accuracies": 0.800000011920929,
488
+ "rewards/chosen": -0.3753136992454529,
489
+ "rewards/margins": 1.9316318035125732,
490
+ "rewards/rejected": -2.306945562362671,
491
  "step": 290
492
  },
493
  {
494
+ "epoch": 0.3454231433506045,
495
+ "grad_norm": 33.05155256360138,
496
+ "learning_rate": 4.137151834863213e-07,
497
+ "logits/chosen": -3.932748794555664,
498
+ "logits/rejected": -4.1272077560424805,
499
+ "logps/chosen": -338.482666015625,
500
+ "logps/rejected": -491.4756774902344,
501
+ "loss": 0.4144,
502
+ "rewards/accuracies": 0.78125,
503
+ "rewards/chosen": -0.12368359416723251,
504
+ "rewards/margins": 1.6778045892715454,
505
+ "rewards/rejected": -1.8014881610870361,
506
  "step": 300
507
  },
508
  {
509
+ "epoch": 0.3454231433506045,
510
+ "eval_logits/chosen": -3.886050224304199,
511
+ "eval_logits/rejected": -4.0962815284729,
512
+ "eval_logps/chosen": -410.2217712402344,
513
+ "eval_logps/rejected": -499.97003173828125,
514
+ "eval_loss": 0.45044589042663574,
515
+ "eval_rewards/accuracies": 0.77734375,
516
+ "eval_rewards/chosen": -0.3084433674812317,
517
+ "eval_rewards/margins": 0.9651419520378113,
518
+ "eval_rewards/rejected": -1.273585319519043,
519
+ "eval_runtime": 99.0297,
520
+ "eval_samples_per_second": 20.196,
521
+ "eval_steps_per_second": 0.323,
522
  "step": 300
523
  },
524
  {
525
+ "epoch": 0.356937248128958,
526
+ "grad_norm": 30.758950038626843,
527
+ "learning_rate": 4.059847439122671e-07,
528
+ "logits/chosen": -4.072343826293945,
529
+ "logits/rejected": -4.278454780578613,
530
+ "logps/chosen": -332.38323974609375,
531
+ "logps/rejected": -486.20587158203125,
532
+ "loss": 0.4126,
533
+ "rewards/accuracies": 0.768750011920929,
534
+ "rewards/chosen": 0.11183549463748932,
535
+ "rewards/margins": 1.9423106908798218,
536
+ "rewards/rejected": -1.8304752111434937,
537
  "step": 310
538
  },
539
  {
540
+ "epoch": 0.36845135290731146,
541
+ "grad_norm": 35.899670349090925,
542
+ "learning_rate": 3.98001943918432e-07,
543
+ "logits/chosen": -4.233328819274902,
544
+ "logits/rejected": -4.456056594848633,
545
+ "logps/chosen": -370.2253723144531,
546
+ "logps/rejected": -577.809814453125,
547
+ "loss": 0.3732,
548
+ "rewards/accuracies": 0.7437499761581421,
549
+ "rewards/chosen": -0.1710590422153473,
550
+ "rewards/margins": 2.226891279220581,
551
+ "rewards/rejected": -2.3979504108428955,
552
  "step": 320
553
  },
554
  {
555
+ "epoch": 0.3799654576856649,
556
+ "grad_norm": 31.506974249108822,
557
+ "learning_rate": 3.8977969850346866e-07,
558
+ "logits/chosen": -4.291365146636963,
559
+ "logits/rejected": -4.589537143707275,
560
+ "logps/chosen": -402.2667541503906,
561
+ "logps/rejected": -580.32080078125,
562
+ "loss": 0.4158,
563
+ "rewards/accuracies": 0.8062499761581421,
564
+ "rewards/chosen": -0.40963658690452576,
565
+ "rewards/margins": 2.1939713954925537,
566
+ "rewards/rejected": -2.6036081314086914,
567
  "step": 330
568
  },
569
  {
570
+ "epoch": 0.39147956246401844,
571
+ "grad_norm": 42.312479747132286,
572
+ "learning_rate": 3.8133131005357465e-07,
573
+ "logits/chosen": -4.51456356048584,
574
+ "logits/rejected": -4.711074352264404,
575
+ "logps/chosen": -356.7383117675781,
576
+ "logps/rejected": -599.3222045898438,
577
+ "loss": 0.3868,
578
+ "rewards/accuracies": 0.800000011920929,
579
+ "rewards/chosen": -0.3934357762336731,
580
+ "rewards/margins": 2.4568190574645996,
581
+ "rewards/rejected": -2.850255012512207,
582
  "step": 340
583
  },
584
  {
585
+ "epoch": 0.4029936672423719,
586
+ "grad_norm": 34.94322397599626,
587
+ "learning_rate": 3.7267044682118435e-07,
588
+ "logits/chosen": -4.381545066833496,
589
+ "logits/rejected": -4.7945661544799805,
590
+ "logps/chosen": -396.62408447265625,
591
+ "logps/rejected": -617.2008666992188,
592
+ "loss": 0.3886,
593
+ "rewards/accuracies": 0.8687499761581421,
594
+ "rewards/chosen": -0.23957356810569763,
595
+ "rewards/margins": 2.6808698177337646,
596
+ "rewards/rejected": -2.920443296432495,
597
  "step": 350
598
  },
599
  {
600
+ "epoch": 0.41450777202072536,
601
+ "grad_norm": 35.153895155661694,
602
+ "learning_rate": 3.638111208117425e-07,
603
+ "logits/chosen": -4.376262664794922,
604
+ "logits/rejected": -4.689536094665527,
605
+ "logps/chosen": -387.55474853515625,
606
+ "logps/rejected": -586.8858642578125,
607
+ "loss": 0.4037,
608
+ "rewards/accuracies": 0.8374999761581421,
609
+ "rewards/chosen": -0.23621347546577454,
610
+ "rewards/margins": 2.256948232650757,
611
+ "rewards/rejected": -2.493161678314209,
612
  "step": 360
613
  },
614
  {
615
+ "epoch": 0.4260218767990789,
616
+ "grad_norm": 30.56527510711544,
617
+ "learning_rate": 3.5476766511433605e-07,
618
+ "logits/chosen": -4.566588878631592,
619
+ "logits/rejected": -4.897808074951172,
620
+ "logps/chosen": -381.00604248046875,
621
+ "logps/rejected": -585.059814453125,
622
+ "loss": 0.3902,
623
+ "rewards/accuracies": 0.800000011920929,
624
+ "rewards/chosen": -0.14318397641181946,
625
+ "rewards/margins": 2.517329692840576,
626
+ "rewards/rejected": -2.6605141162872314,
627
  "step": 370
628
  },
629
  {
630
+ "epoch": 0.43753598157743234,
631
+ "grad_norm": 34.017679923693805,
632
+ "learning_rate": 3.455547107128602e-07,
633
+ "logits/chosen": -4.60725736618042,
634
+ "logits/rejected": -5.102498531341553,
635
+ "logps/chosen": -385.83770751953125,
636
+ "logps/rejected": -623.3347778320312,
637
+ "loss": 0.3929,
638
+ "rewards/accuracies": 0.800000011920929,
639
+ "rewards/chosen": -0.5362241268157959,
640
+ "rewards/margins": 2.6802401542663574,
641
+ "rewards/rejected": -3.2164645195007324,
642
  "step": 380
643
  },
644
  {
645
+ "epoch": 0.44905008635578586,
646
+ "grad_norm": 33.15867623899776,
647
+ "learning_rate": 3.361871628152338e-07,
648
+ "logits/chosen": -4.563677787780762,
649
+ "logits/rejected": -4.989599227905273,
650
+ "logps/chosen": -367.84814453125,
651
+ "logps/rejected": -567.6351318359375,
652
+ "loss": 0.4213,
653
+ "rewards/accuracies": 0.768750011920929,
654
+ "rewards/chosen": -0.3700157105922699,
655
+ "rewards/margins": 2.4626548290252686,
656
+ "rewards/rejected": -2.8326706886291504,
657
  "step": 390
658
  },
659
  {
660
+ "epoch": 0.4605641911341393,
661
+ "grad_norm": 35.10207305823101,
662
+ "learning_rate": 3.2668017673896077e-07,
663
+ "logits/chosen": -4.686192035675049,
664
+ "logits/rejected": -5.130132675170898,
665
+ "logps/chosen": -351.6319885253906,
666
+ "logps/rejected": -523.5940551757812,
667
+ "loss": 0.4011,
668
+ "rewards/accuracies": 0.78125,
669
+ "rewards/chosen": -0.2101125717163086,
670
+ "rewards/margins": 2.3180549144744873,
671
+ "rewards/rejected": -2.528167247772217,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.4605641911341393,
676
+ "eval_logits/chosen": -4.5018205642700195,
677
+ "eval_logits/rejected": -4.837046146392822,
678
+ "eval_logps/chosen": -421.8441162109375,
679
+ "eval_logps/rejected": -525.9361572265625,
680
+ "eval_loss": 0.4135480225086212,
681
+ "eval_rewards/accuracies": 0.80859375,
682
+ "eval_rewards/chosen": -0.42466747760772705,
683
+ "eval_rewards/margins": 1.1085797548294067,
684
+ "eval_rewards/rejected": -1.5332471132278442,
685
+ "eval_runtime": 98.3292,
686
+ "eval_samples_per_second": 20.34,
687
+ "eval_steps_per_second": 0.325,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.4720782959124928,
692
+ "grad_norm": 33.086992992339596,
693
+ "learning_rate": 3.1704913339205103e-07,
694
+ "logits/chosen": -4.71237850189209,
695
+ "logits/rejected": -5.09951639175415,
696
+ "logps/chosen": -392.43292236328125,
697
+ "logps/rejected": -596.8004150390625,
698
+ "loss": 0.3894,
699
+ "rewards/accuracies": 0.824999988079071,
700
+ "rewards/chosen": -0.45191723108291626,
701
+ "rewards/margins": 2.4984166622161865,
702
+ "rewards/rejected": -2.950334072113037,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.4835924006908463,
707
+ "grad_norm": 36.9499485623677,
708
+ "learning_rate": 3.0730961438896885e-07,
709
+ "logits/chosen": -4.71737003326416,
710
+ "logits/rejected": -5.089630603790283,
711
+ "logps/chosen": -371.7138977050781,
712
+ "logps/rejected": -539.5205078125,
713
+ "loss": 0.3986,
714
+ "rewards/accuracies": 0.7437499761581421,
715
+ "rewards/chosen": -0.6353754997253418,
716
+ "rewards/margins": 1.956162452697754,
717
+ "rewards/rejected": -2.591538190841675,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.49510650546919976,
722
+ "grad_norm": 28.416064555595714,
723
+ "learning_rate": 2.9747737684186795e-07,
724
+ "logits/chosen": -4.5956220626831055,
725
+ "logits/rejected": -5.009639263153076,
726
+ "logps/chosen": -388.5729064941406,
727
+ "logps/rejected": -566.389892578125,
728
+ "loss": 0.3953,
729
+ "rewards/accuracies": 0.800000011920929,
730
+ "rewards/chosen": -0.5186115503311157,
731
+ "rewards/margins": 2.118881940841675,
732
+ "rewards/rejected": -2.63749361038208,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.5066206102475532,
737
+ "grad_norm": 35.02068361332514,
738
+ "learning_rate": 2.8756832786789663e-07,
739
+ "logits/chosen": -4.5723748207092285,
740
+ "logits/rejected": -5.229958534240723,
741
+ "logps/chosen": -344.8235778808594,
742
+ "logps/rejected": -562.1149291992188,
743
+ "loss": 0.3753,
744
+ "rewards/accuracies": 0.84375,
745
+ "rewards/chosen": -0.18356148898601532,
746
+ "rewards/margins": 2.6801793575286865,
747
+ "rewards/rejected": -2.863740921020508,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.5181347150259067,
752
+ "grad_norm": 29.90766637224572,
753
+ "learning_rate": 2.7759849885381747e-07,
754
+ "logits/chosen": -4.58120059967041,
755
+ "logits/rejected": -5.108014106750488,
756
+ "logps/chosen": -380.8218688964844,
757
+ "logps/rejected": -558.5294189453125,
758
+ "loss": 0.395,
759
+ "rewards/accuracies": 0.824999988079071,
760
+ "rewards/chosen": -0.36003825068473816,
761
+ "rewards/margins": 2.234218120574951,
762
+ "rewards/rejected": -2.594256639480591,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.5296488198042603,
767
+ "grad_norm": 43.539308942722826,
768
+ "learning_rate": 2.675840195195762e-07,
769
+ "logits/chosen": -4.849000453948975,
770
+ "logits/rejected": -5.308794975280762,
771
+ "logps/chosen": -353.55523681640625,
772
+ "logps/rejected": -619.9716796875,
773
+ "loss": 0.3685,
774
+ "rewards/accuracies": 0.84375,
775
+ "rewards/chosen": -0.29138100147247314,
776
+ "rewards/margins": 2.825038433074951,
777
+ "rewards/rejected": -3.116419553756714,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.5411629245826137,
782
+ "grad_norm": 33.774855687056665,
783
+ "learning_rate": 2.575410918227829e-07,
784
+ "logits/chosen": -4.863161087036133,
785
+ "logits/rejected": -5.457709312438965,
786
+ "logps/chosen": -411.6463317871094,
787
+ "logps/rejected": -598.97314453125,
788
+ "loss": 0.3821,
789
+ "rewards/accuracies": 0.800000011920929,
790
+ "rewards/chosen": -0.46561044454574585,
791
+ "rewards/margins": 2.4459636211395264,
792
+ "rewards/rejected": -2.911574602127075,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.5526770293609672,
797
+ "grad_norm": 33.53580470090372,
798
+ "learning_rate": 2.474859637463226e-07,
799
+ "logits/chosen": -5.079291343688965,
800
+ "logits/rejected": -5.424225807189941,
801
+ "logps/chosen": -389.027099609375,
802
+ "logps/rejected": -587.9437255859375,
803
+ "loss": 0.3962,
804
+ "rewards/accuracies": 0.7875000238418579,
805
+ "rewards/chosen": -0.4632614254951477,
806
+ "rewards/margins": 2.3001296520233154,
807
+ "rewards/rejected": -2.7633910179138184,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.5641911341393206,
812
+ "grad_norm": 32.1453411001328,
813
+ "learning_rate": 2.3743490301150355e-07,
814
+ "logits/chosen": -5.007067680358887,
815
+ "logits/rejected": -5.361691474914551,
816
+ "logps/chosen": -343.4484558105469,
817
+ "logps/rejected": -570.6577758789062,
818
+ "loss": 0.3902,
819
+ "rewards/accuracies": 0.824999988079071,
820
+ "rewards/chosen": -0.14810001850128174,
821
+ "rewards/margins": 2.4624667167663574,
822
+ "rewards/rejected": -2.6105666160583496,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.5757052389176741,
827
+ "grad_norm": 32.90845084744282,
828
+ "learning_rate": 2.274041707592724e-07,
829
+ "logits/chosen": -4.921438694000244,
830
+ "logits/rejected": -5.355481147766113,
831
+ "logps/chosen": -339.01129150390625,
832
+ "logps/rejected": -556.4103393554688,
833
+ "loss": 0.3915,
834
+ "rewards/accuracies": 0.78125,
835
+ "rewards/chosen": -0.14777924120426178,
836
+ "rewards/margins": 2.432879686355591,
837
+ "rewards/rejected": -2.5806591510772705,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.5757052389176741,
842
+ "eval_logits/chosen": -4.767510890960693,
843
+ "eval_logits/rejected": -5.187655925750732,
844
+ "eval_logps/chosen": -418.29376220703125,
845
+ "eval_logps/rejected": -544.0393676757812,
846
+ "eval_loss": 0.37398749589920044,
847
+ "eval_rewards/accuracies": 0.8515625,
848
+ "eval_rewards/chosen": -0.389164000749588,
849
+ "eval_rewards/margins": 1.3251150846481323,
850
+ "eval_rewards/rejected": -1.7142791748046875,
851
+ "eval_runtime": 98.0381,
852
+ "eval_samples_per_second": 20.4,
853
+ "eval_steps_per_second": 0.326,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.5872193436960277,
858
+ "grad_norm": 31.42761305876207,
859
+ "learning_rate": 2.17409995242075e-07,
860
+ "logits/chosen": -5.038609504699707,
861
+ "logits/rejected": -5.722345352172852,
862
+ "logps/chosen": -372.905517578125,
863
+ "logps/rejected": -569.4352416992188,
864
+ "loss": 0.376,
865
+ "rewards/accuracies": 0.8187500238418579,
866
+ "rewards/chosen": -0.27033573389053345,
867
+ "rewards/margins": 2.4031970500946045,
868
+ "rewards/rejected": -2.6735329627990723,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.5987334484743811,
873
+ "grad_norm": 29.61275457382243,
874
+ "learning_rate": 2.0746854556892544e-07,
875
+ "logits/chosen": -5.438863754272461,
876
+ "logits/rejected": -5.798094749450684,
877
+ "logps/chosen": -407.27008056640625,
878
+ "logps/rejected": -620.6509399414062,
879
+ "loss": 0.3645,
880
+ "rewards/accuracies": 0.8374999761581421,
881
+ "rewards/chosen": -0.43467459082603455,
882
+ "rewards/margins": 2.4455971717834473,
883
+ "rewards/rejected": -2.8802719116210938,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.6102475532527346,
888
+ "grad_norm": 27.24117353879226,
889
+ "learning_rate": 1.9759590554616173e-07,
890
+ "logits/chosen": -5.715832710266113,
891
+ "logits/rejected": -6.058187961578369,
892
+ "logps/chosen": -397.95849609375,
893
+ "logps/rejected": -609.6741943359375,
894
+ "loss": 0.3968,
895
+ "rewards/accuracies": 0.737500011920929,
896
+ "rewards/chosen": -0.6830942034721375,
897
+ "rewards/margins": 2.4185569286346436,
898
+ "rewards/rejected": -3.101651191711426,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.6217616580310881,
903
+ "grad_norm": 30.859422948077256,
904
+ "learning_rate": 1.8780804765620746e-07,
905
+ "logits/chosen": -5.4331769943237305,
906
+ "logits/rejected": -5.7857160568237305,
907
+ "logps/chosen": -373.3824462890625,
908
+ "logps/rejected": -528.5029296875,
909
+ "loss": 0.4178,
910
+ "rewards/accuracies": 0.800000011920929,
911
+ "rewards/chosen": -0.4058764576911926,
912
+ "rewards/margins": 1.9241279363632202,
913
+ "rewards/rejected": -2.3300044536590576,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.6332757628094415,
918
+ "grad_norm": 35.78902948656132,
919
+ "learning_rate": 1.7812080721643973e-07,
920
+ "logits/chosen": -5.20429801940918,
921
+ "logits/rejected": -5.622688293457031,
922
+ "logps/chosen": -401.1048889160156,
923
+ "logps/rejected": -605.438232421875,
924
+ "loss": 0.3956,
925
+ "rewards/accuracies": 0.7749999761581421,
926
+ "rewards/chosen": -0.27011531591415405,
927
+ "rewards/margins": 2.323632001876831,
928
+ "rewards/rejected": -2.593747615814209,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.644789867587795,
933
+ "grad_norm": 31.09337668064834,
934
+ "learning_rate": 1.6854985675997063e-07,
935
+ "logits/chosen": -5.3274736404418945,
936
+ "logits/rejected": -5.779025554656982,
937
+ "logps/chosen": -370.87823486328125,
938
+ "logps/rejected": -599.370361328125,
939
+ "loss": 0.377,
940
+ "rewards/accuracies": 0.7875000238418579,
941
+ "rewards/chosen": -0.30361196398735046,
942
+ "rewards/margins": 2.5692386627197266,
943
+ "rewards/rejected": -2.8728506565093994,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.6563039723661486,
948
+ "grad_norm": 31.49748801480019,
949
+ "learning_rate": 1.5911068067978818e-07,
950
+ "logits/chosen": -5.422667503356934,
951
+ "logits/rejected": -5.991160869598389,
952
+ "logps/chosen": -363.42791748046875,
953
+ "logps/rejected": -606.8687744140625,
954
+ "loss": 0.3651,
955
+ "rewards/accuracies": 0.8687499761581421,
956
+ "rewards/chosen": -0.3893515467643738,
957
+ "rewards/margins": 2.7044646739959717,
958
+ "rewards/rejected": -3.093816041946411,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.667818077144502,
963
+ "grad_norm": 40.80686884426901,
964
+ "learning_rate": 1.4981855017728197e-07,
965
+ "logits/chosen": -5.2194623947143555,
966
+ "logits/rejected": -5.8604302406311035,
967
+ "logps/chosen": -378.5892028808594,
968
+ "logps/rejected": -623.4224853515625,
969
+ "loss": 0.3681,
970
+ "rewards/accuracies": 0.7875000238418579,
971
+ "rewards/chosen": -0.4009127616882324,
972
+ "rewards/margins": 2.839203357696533,
973
+ "rewards/rejected": -3.2401161193847656,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 0.6793321819228555,
978
+ "grad_norm": 35.637123676945,
979
+ "learning_rate": 1.406884985556804e-07,
980
+ "logits/chosen": -5.340333461761475,
981
+ "logits/rejected": -5.9213457107543945,
982
+ "logps/chosen": -366.98126220703125,
983
+ "logps/rejected": -646.6055297851562,
984
+ "loss": 0.3892,
985
+ "rewards/accuracies": 0.8500000238418579,
986
+ "rewards/chosen": -0.3502456843852997,
987
+ "rewards/margins": 3.1350584030151367,
988
+ "rewards/rejected": -3.4853038787841797,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 0.690846286701209,
993
+ "grad_norm": 38.133176182262396,
994
+ "learning_rate": 1.3173529689837354e-07,
995
+ "logits/chosen": -5.227208137512207,
996
+ "logits/rejected": -5.730982780456543,
997
+ "logps/chosen": -406.6194152832031,
998
+ "logps/rejected": -642.0016479492188,
999
+ "loss": 0.3726,
1000
+ "rewards/accuracies": 0.8125,
1001
+ "rewards/chosen": -0.19344040751457214,
1002
+ "rewards/margins": 2.756740093231201,
1003
+ "rewards/rejected": -2.9501805305480957,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 0.690846286701209,
1008
+ "eval_logits/chosen": -5.146116256713867,
1009
+ "eval_logits/rejected": -5.624752044677734,
1010
+ "eval_logps/chosen": -427.4439392089844,
1011
+ "eval_logps/rejected": -561.528564453125,
1012
+ "eval_loss": 0.3467547297477722,
1013
+ "eval_rewards/accuracies": 0.84375,
1014
+ "eval_rewards/chosen": -0.4806651175022125,
1015
+ "eval_rewards/margins": 1.408505916595459,
1016
+ "eval_rewards/rejected": -1.8891710042953491,
1017
+ "eval_runtime": 98.3003,
1018
+ "eval_samples_per_second": 20.346,
1019
+ "eval_steps_per_second": 0.326,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 0.7023603914795624,
1024
+ "grad_norm": 35.76369238749813,
1025
+ "learning_rate": 1.2297343017146726e-07,
1026
+ "logits/chosen": -5.63295316696167,
1027
+ "logits/rejected": -6.0680012702941895,
1028
+ "logps/chosen": -352.22650146484375,
1029
+ "logps/rejected": -569.6236572265625,
1030
+ "loss": 0.3654,
1031
+ "rewards/accuracies": 0.8125,
1032
+ "rewards/chosen": -0.400468111038208,
1033
+ "rewards/margins": 2.252286672592163,
1034
+ "rewards/rejected": -2.65275502204895,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 0.713874496257916,
1039
+ "grad_norm": 42.53908245265289,
1040
+ "learning_rate": 1.1441707378923474e-07,
1041
+ "logits/chosen": -5.555817604064941,
1042
+ "logits/rejected": -5.891648292541504,
1043
+ "logps/chosen": -372.3026123046875,
1044
+ "logps/rejected": -608.4457397460938,
1045
+ "loss": 0.3719,
1046
+ "rewards/accuracies": 0.7875000238418579,
1047
+ "rewards/chosen": -0.5105666518211365,
1048
+ "rewards/margins": 2.334003448486328,
1049
+ "rewards/rejected": -2.844569683074951,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 0.7253886010362695,
1054
+ "grad_norm": 33.40462593975916,
1055
+ "learning_rate": 1.06080070680377e-07,
1056
+ "logits/chosen": -5.389917850494385,
1057
+ "logits/rejected": -5.883559226989746,
1058
+ "logps/chosen": -380.6363525390625,
1059
+ "logps/rejected": -589.5970458984375,
1060
+ "loss": 0.3608,
1061
+ "rewards/accuracies": 0.824999988079071,
1062
+ "rewards/chosen": -0.4320860803127289,
1063
+ "rewards/margins": 2.423119068145752,
1064
+ "rewards/rejected": -2.8552052974700928,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 0.7369027058146229,
1069
+ "grad_norm": 40.31781331240861,
1070
+ "learning_rate": 9.797590889219587e-08,
1071
+ "logits/chosen": -5.418898582458496,
1072
+ "logits/rejected": -6.029601097106934,
1073
+ "logps/chosen": -331.7992248535156,
1074
+ "logps/rejected": -644.7623291015625,
1075
+ "loss": 0.4071,
1076
+ "rewards/accuracies": 0.8374999761581421,
1077
+ "rewards/chosen": -0.26965656876564026,
1078
+ "rewards/margins": 3.317509174346924,
1079
+ "rewards/rejected": -3.5871658325195312,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 0.7484168105929764,
1084
+ "grad_norm": 30.964195430126203,
1085
+ "learning_rate": 9.011769976891367e-08,
1086
+ "logits/chosen": -5.33644962310791,
1087
+ "logits/rejected": -5.905170440673828,
1088
+ "logps/chosen": -370.828369140625,
1089
+ "logps/rejected": -630.619140625,
1090
+ "loss": 0.3809,
1091
+ "rewards/accuracies": 0.8500000238418579,
1092
+ "rewards/chosen": -0.340393990278244,
1093
+ "rewards/margins": 2.9275107383728027,
1094
+ "rewards/rejected": -3.267904758453369,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 0.7599309153713298,
1099
+ "grad_norm": 34.09027033994428,
1100
+ "learning_rate": 8.251815673944218e-08,
1101
+ "logits/chosen": -5.566973686218262,
1102
+ "logits/rejected": -5.901907444000244,
1103
+ "logps/chosen": -373.8709411621094,
1104
+ "logps/rejected": -626.88720703125,
1105
+ "loss": 0.3664,
1106
+ "rewards/accuracies": 0.800000011920929,
1107
+ "rewards/chosen": -0.31639060378074646,
1108
+ "rewards/margins": 2.5317635536193848,
1109
+ "rewards/rejected": -2.848154067993164,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 0.7714450201496834,
1114
+ "grad_norm": 33.748663190230474,
1115
+ "learning_rate": 7.518957474892148e-08,
1116
+ "logits/chosen": -5.544904708862305,
1117
+ "logits/rejected": -6.055120468139648,
1118
+ "logps/chosen": -366.33306884765625,
1119
+ "logps/rejected": -662.8927001953125,
1120
+ "loss": 0.3675,
1121
+ "rewards/accuracies": 0.8812500238418579,
1122
+ "rewards/chosen": -0.4155319333076477,
1123
+ "rewards/margins": 3.206387758255005,
1124
+ "rewards/rejected": -3.621919631958008,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 0.7829591249280369,
1129
+ "grad_norm": 33.43366335799461,
1130
+ "learning_rate": 6.814381036730274e-08,
1131
+ "logits/chosen": -5.3579840660095215,
1132
+ "logits/rejected": -5.930968284606934,
1133
+ "logps/chosen": -384.45245361328125,
1134
+ "logps/rejected": -620.3960571289062,
1135
+ "loss": 0.3748,
1136
+ "rewards/accuracies": 0.8125,
1137
+ "rewards/chosen": -0.3938636779785156,
1138
+ "rewards/margins": 2.738201856613159,
1139
+ "rewards/rejected": -3.132065773010254,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 0.7944732297063903,
1144
+ "grad_norm": 31.210525154632403,
1145
+ "learning_rate": 6.139226260715872e-08,
1146
+ "logits/chosen": -5.434956073760986,
1147
+ "logits/rejected": -5.966610908508301,
1148
+ "logps/chosen": -387.60162353515625,
1149
+ "logps/rejected": -664.8744506835938,
1150
+ "loss": 0.355,
1151
+ "rewards/accuracies": 0.7749999761581421,
1152
+ "rewards/chosen": -0.4180675446987152,
1153
+ "rewards/margins": 2.967360019683838,
1154
+ "rewards/rejected": -3.385427474975586,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 0.8059873344847438,
1159
+ "grad_norm": 33.963445753535076,
1160
+ "learning_rate": 5.4945854481754734e-08,
1161
+ "logits/chosen": -5.527676105499268,
1162
+ "logits/rejected": -5.960885047912598,
1163
+ "logps/chosen": -374.95916748046875,
1164
+ "logps/rejected": -630.1693725585938,
1165
+ "loss": 0.3522,
1166
+ "rewards/accuracies": 0.824999988079071,
1167
+ "rewards/chosen": -0.3166103959083557,
1168
+ "rewards/margins": 2.8152191638946533,
1169
+ "rewards/rejected": -3.1318297386169434,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 0.8059873344847438,
1174
+ "eval_logits/chosen": -5.210726261138916,
1175
+ "eval_logits/rejected": -5.681924343109131,
1176
+ "eval_logps/chosen": -433.6905517578125,
1177
+ "eval_logps/rejected": -577.3692016601562,
1178
+ "eval_loss": 0.32489100098609924,
1179
+ "eval_rewards/accuracies": 0.87890625,
1180
+ "eval_rewards/chosen": -0.5431313514709473,
1181
+ "eval_rewards/margins": 1.5044457912445068,
1182
+ "eval_rewards/rejected": -2.047577142715454,
1183
+ "eval_runtime": 98.0334,
1184
+ "eval_samples_per_second": 20.401,
1185
+ "eval_steps_per_second": 0.326,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 0.8175014392630973,
1190
+ "grad_norm": 32.382102785679976,
1191
+ "learning_rate": 4.881501533321605e-08,
1192
+ "logits/chosen": -5.631700038909912,
1193
+ "logits/rejected": -6.175845146179199,
1194
+ "logps/chosen": -364.59674072265625,
1195
+ "logps/rejected": -615.4799194335938,
1196
+ "loss": 0.3861,
1197
+ "rewards/accuracies": 0.8500000238418579,
1198
+ "rewards/chosen": -0.4184879660606384,
1199
+ "rewards/margins": 2.884592294692993,
1200
+ "rewards/rejected": -3.3030803203582764,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 0.8290155440414507,
1205
+ "grad_norm": 29.844564520231344,
1206
+ "learning_rate": 4.300966395938377e-08,
1207
+ "logits/chosen": -5.579652309417725,
1208
+ "logits/rejected": -6.021969795227051,
1209
+ "logps/chosen": -410.3070373535156,
1210
+ "logps/rejected": -654.1072387695312,
1211
+ "loss": 0.3805,
1212
+ "rewards/accuracies": 0.831250011920929,
1213
+ "rewards/chosen": -0.40225619077682495,
1214
+ "rewards/margins": 2.8050906658172607,
1215
+ "rewards/rejected": -3.2073471546173096,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 0.8405296488198043,
1220
+ "grad_norm": 34.64605949847163,
1221
+ "learning_rate": 3.7539192566655246e-08,
1222
+ "logits/chosen": -5.749828338623047,
1223
+ "logits/rejected": -6.230714321136475,
1224
+ "logps/chosen": -372.4962463378906,
1225
+ "logps/rejected": -620.4830932617188,
1226
+ "loss": 0.3701,
1227
+ "rewards/accuracies": 0.856249988079071,
1228
+ "rewards/chosen": -0.3709852397441864,
1229
+ "rewards/margins": 2.7844834327697754,
1230
+ "rewards/rejected": -3.155468702316284,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 0.8520437535981578,
1235
+ "grad_norm": 38.917435902608844,
1236
+ "learning_rate": 3.24124515747731e-08,
1237
+ "logits/chosen": -5.770384311676025,
1238
+ "logits/rejected": -6.440248966217041,
1239
+ "logps/chosen": -377.38360595703125,
1240
+ "logps/rejected": -670.9470825195312,
1241
+ "loss": 0.3725,
1242
+ "rewards/accuracies": 0.8500000238418579,
1243
+ "rewards/chosen": -0.46737533807754517,
1244
+ "rewards/margins": 3.3466858863830566,
1245
+ "rewards/rejected": -3.814060926437378,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 0.8635578583765112,
1250
+ "grad_norm": 35.39576347923302,
1251
+ "learning_rate": 2.763773529814506e-08,
1252
+ "logits/chosen": -5.80182409286499,
1253
+ "logits/rejected": -6.183619976043701,
1254
+ "logps/chosen": -363.37359619140625,
1255
+ "logps/rejected": -643.1031494140625,
1256
+ "loss": 0.3736,
1257
+ "rewards/accuracies": 0.78125,
1258
+ "rewards/chosen": -0.513085663318634,
1259
+ "rewards/margins": 2.7367725372314453,
1260
+ "rewards/rejected": -3.2498581409454346,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 0.8750719631548647,
1265
+ "grad_norm": 35.82536365897154,
1266
+ "learning_rate": 2.3222768526860698e-08,
1267
+ "logits/chosen": -5.800836563110352,
1268
+ "logits/rejected": -6.234482288360596,
1269
+ "logps/chosen": -365.31903076171875,
1270
+ "logps/rejected": -579.0399169921875,
1271
+ "loss": 0.3663,
1272
+ "rewards/accuracies": 0.8187500238418579,
1273
+ "rewards/chosen": -0.5012763738632202,
1274
+ "rewards/margins": 2.1673426628112793,
1275
+ "rewards/rejected": -2.668619394302368,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 0.8865860679332181,
1280
+ "grad_norm": 37.880330092886545,
1281
+ "learning_rate": 1.9174694029115146e-08,
1282
+ "logits/chosen": -5.784181594848633,
1283
+ "logits/rejected": -6.484677314758301,
1284
+ "logps/chosen": -376.74908447265625,
1285
+ "logps/rejected": -637.3211059570312,
1286
+ "loss": 0.38,
1287
+ "rewards/accuracies": 0.875,
1288
+ "rewards/chosen": -0.3697873055934906,
1289
+ "rewards/margins": 3.116102933883667,
1290
+ "rewards/rejected": -3.4858901500701904,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 0.8981001727115717,
1295
+ "grad_norm": 37.173154353795034,
1296
+ "learning_rate": 1.5500060995258134e-08,
1297
+ "logits/chosen": -5.590546607971191,
1298
+ "logits/rejected": -6.252056121826172,
1299
+ "logps/chosen": -404.06219482421875,
1300
+ "logps/rejected": -671.0790405273438,
1301
+ "loss": 0.3644,
1302
+ "rewards/accuracies": 0.8374999761581421,
1303
+ "rewards/chosen": -0.4821314811706543,
1304
+ "rewards/margins": 2.973552703857422,
1305
+ "rewards/rejected": -3.455684185028076,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 0.9096142774899252,
1310
+ "grad_norm": 38.483209821819536,
1311
+ "learning_rate": 1.2204814442165812e-08,
1312
+ "logits/chosen": -5.847277641296387,
1313
+ "logits/rejected": -6.545414924621582,
1314
+ "logps/chosen": -402.4599609375,
1315
+ "logps/rejected": -618.3992309570312,
1316
+ "loss": 0.3744,
1317
+ "rewards/accuracies": 0.8500000238418579,
1318
+ "rewards/chosen": -0.5441657900810242,
1319
+ "rewards/margins": 2.8156542778015137,
1320
+ "rewards/rejected": -3.3598198890686035,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 0.9211283822682786,
1325
+ "grad_norm": 36.88952100776894,
1326
+ "learning_rate": 9.294285595075669e-09,
1327
+ "logits/chosen": -5.882547378540039,
1328
+ "logits/rejected": -6.232880115509033,
1329
+ "logps/chosen": -359.8563537597656,
1330
+ "logps/rejected": -655.06787109375,
1331
+ "loss": 0.3643,
1332
+ "rewards/accuracies": 0.84375,
1333
+ "rewards/chosen": -0.496969074010849,
1334
+ "rewards/margins": 2.759918689727783,
1335
+ "rewards/rejected": -3.256887912750244,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 0.9211283822682786,
1340
+ "eval_logits/chosen": -5.403136253356934,
1341
+ "eval_logits/rejected": -5.885165214538574,
1342
+ "eval_logps/chosen": -439.6992492675781,
1343
+ "eval_logps/rejected": -584.2129516601562,
1344
+ "eval_loss": 0.31831786036491394,
1345
+ "eval_rewards/accuracies": 0.87109375,
1346
+ "eval_rewards/chosen": -0.6032183170318604,
1347
+ "eval_rewards/margins": 1.5127967596054077,
1348
+ "eval_rewards/rejected": -2.1160147190093994,
1349
+ "eval_runtime": 98.1126,
1350
+ "eval_samples_per_second": 20.385,
1351
+ "eval_steps_per_second": 0.326,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 0.9326424870466321,
1356
+ "grad_norm": 43.94120514478602,
1357
+ "learning_rate": 6.773183262446914e-09,
1358
+ "logits/chosen": -5.6489362716674805,
1359
+ "logits/rejected": -6.28032112121582,
1360
+ "logps/chosen": -353.1646423339844,
1361
+ "logps/rejected": -609.9522705078125,
1362
+ "loss": 0.3848,
1363
+ "rewards/accuracies": 0.800000011920929,
1364
+ "rewards/chosen": -0.40985745191574097,
1365
+ "rewards/margins": 2.7903153896331787,
1366
+ "rewards/rejected": -3.2001731395721436,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 0.9441565918249856,
1371
+ "grad_norm": 33.525448706821926,
1372
+ "learning_rate": 4.645586217799452e-09,
1373
+ "logits/chosen": -5.750053882598877,
1374
+ "logits/rejected": -6.382951259613037,
1375
+ "logps/chosen": -408.31915283203125,
1376
+ "logps/rejected": -624.9613037109375,
1377
+ "loss": 0.3682,
1378
+ "rewards/accuracies": 0.8125,
1379
+ "rewards/chosen": -0.44873589277267456,
1380
+ "rewards/margins": 2.5182459354400635,
1381
+ "rewards/rejected": -2.966981887817383,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 0.9556706966033391,
1386
+ "grad_norm": 32.59312352646331,
1387
+ "learning_rate": 2.9149366008568987e-09,
1388
+ "logits/chosen": -5.68507194519043,
1389
+ "logits/rejected": -6.2285284996032715,
1390
+ "logps/chosen": -345.0586853027344,
1391
+ "logps/rejected": -635.7188720703125,
1392
+ "loss": 0.3761,
1393
+ "rewards/accuracies": 0.831250011920929,
1394
+ "rewards/chosen": -0.39172735810279846,
1395
+ "rewards/margins": 2.9998083114624023,
1396
+ "rewards/rejected": -3.391535520553589,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 0.9671848013816926,
1401
+ "grad_norm": 37.49243505993372,
1402
+ "learning_rate": 1.5840343486700215e-09,
1403
+ "logits/chosen": -5.730424404144287,
1404
+ "logits/rejected": -6.221343040466309,
1405
+ "logps/chosen": -356.298583984375,
1406
+ "logps/rejected": -621.7361450195312,
1407
+ "loss": 0.3928,
1408
+ "rewards/accuracies": 0.793749988079071,
1409
+ "rewards/chosen": -0.4219423830509186,
1410
+ "rewards/margins": 2.8504931926727295,
1411
+ "rewards/rejected": -3.272435426712036,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 0.9786989061600461,
1416
+ "grad_norm": 33.08948980944996,
1417
+ "learning_rate": 6.550326657293881e-10,
1418
+ "logits/chosen": -5.9162678718566895,
1419
+ "logits/rejected": -6.479850769042969,
1420
+ "logps/chosen": -360.3614196777344,
1421
+ "logps/rejected": -608.4212646484375,
1422
+ "loss": 0.3596,
1423
+ "rewards/accuracies": 0.862500011920929,
1424
+ "rewards/chosen": -0.3865709900856018,
1425
+ "rewards/margins": 2.8733856678009033,
1426
+ "rewards/rejected": -3.2599568367004395,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 0.9902130109383995,
1431
+ "grad_norm": 33.68247028780298,
1432
+ "learning_rate": 1.2943454039654467e-10,
1433
+ "logits/chosen": -5.6706414222717285,
1434
+ "logits/rejected": -6.1612443923950195,
1435
+ "logps/chosen": -388.79510498046875,
1436
+ "logps/rejected": -634.7048950195312,
1437
+ "loss": 0.3777,
1438
+ "rewards/accuracies": 0.7562500238418579,
1439
+ "rewards/chosen": -0.3972472846508026,
1440
+ "rewards/margins": 2.8383138179779053,
1441
+ "rewards/rejected": -3.2355613708496094,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 0.9994242947610823,
1446
+ "step": 868,
1447
  "total_flos": 0.0,
1448
+ "train_loss": 0.4218231642850533,
1449
+ "train_runtime": 14967.0092,
1450
+ "train_samples_per_second": 7.425,
1451
+ "train_steps_per_second": 0.058
1452
  }
1453
  ],
1454
  "logging_steps": 10,
1455
+ "max_steps": 868,
1456
  "num_input_tokens_seen": 0,
1457
  "num_train_epochs": 1,
1458
  "save_steps": 100,
1459
+ "stateful_callbacks": {
1460
+ "TrainerControl": {
1461
+ "args": {
1462
+ "should_epoch_stop": false,
1463
+ "should_evaluate": false,
1464
+ "should_log": false,
1465
+ "should_save": true,
1466
+ "should_training_stop": false
1467
+ },
1468
+ "attributes": {}
1469
+ }
1470
+ },
1471
  "total_flos": 0.0,
1472
  "train_batch_size": 8,
1473
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aab4573f211e1825da610c91d86ed7a8bc0cfa8f8bba8dbb6800dc69b7080723
3
- size 6328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf525e556a4e72ad76dc3263558be495a00b73c02de0b6ea713d4bfeb6a07eb0
3
+ size 6456
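
The `trainer_state.json` diff above logs one entry per 10 optimizer steps plus an `eval_*` block every 100 steps (steps 400 through 800 are visible here). As a hedged aside, not part of the committed files: the short Python sketch below shows one way to pull that eval curve back out of such a file. It assumes the standard `log_history` list that `transformers`' `TrainerState` writes into `trainer_state.json`; the file path is illustrative, and only keys that actually appear in the log entries above (`step`, `eval_loss`, `eval_rewards/margins`) are used.

```python
# Minimal sketch: extract the eval-loss / reward-margin curve from a
# Trainer-produced trainer_state.json (path is illustrative).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history holds both per-step training entries and eval entries;
# eval entries are the ones carrying an "eval_loss" key.
eval_points = [
    (entry["step"], entry["eval_loss"], entry["eval_rewards/margins"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]

for step, loss, margin in eval_points:
    print(f"step {step:>4}: eval_loss={loss:.4f}  rewards/margins={margin:.4f}")
```

Run against the state logged in this commit, this would print the eval checkpoints (400, 500, 600, 700, 800) with their decreasing loss and growing reward margin; the same approach works for any other logged key, such as `learning_rate` or `grad_norm`.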