Minbyul committed on
Commit
0720715
1 Parent(s): e09961c

Model save

Browse files
README.md CHANGED
@@ -1,15 +1,9 @@
1
  ---
2
  base_model: dmis-lab/selfbiorag_7b
3
  tags:
4
- - alignment-handbook
5
  - trl
6
  - dpo
7
  - generated_from_trainer
8
- - trl
9
- - dpo
10
- - generated_from_trainer
11
- datasets:
12
- - HuggingFaceH4/ultrafeedback_binarized
13
  model-index:
14
  - name: selfbiorag-7b-dpo-full-wo-healthsearch_qa-ep3
15
  results: []
@@ -20,17 +14,17 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # selfbiorag-7b-dpo-full-wo-healthsearch_qa-ep3
22
 
23
- This model is a fine-tuned version of [dmis-lab/selfbiorag_7b](https://huggingface.co/dmis-lab/selfbiorag_7b) on the HuggingFaceH4/ultrafeedback_binarized dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 0.5131
26
- - Rewards/chosen: 0.3874
27
- - Rewards/rejected: -0.0179
28
- - Rewards/accuracies: 1.0
29
- - Rewards/margins: 0.4053
30
- - Logps/rejected: -92.8718
31
- - Logps/chosen: -349.6196
32
- - Logits/rejected: -1.7621
33
- - Logits/chosen: -1.6801
34
 
35
  ## Model description
36
 
@@ -54,22 +48,25 @@ The following hyperparameters were used during training:
54
  - eval_batch_size: 8
55
  - seed: 42
56
  - distributed_type: multi-GPU
57
- - num_devices: 8
58
  - gradient_accumulation_steps: 2
59
- - total_train_batch_size: 128
60
- - total_eval_batch_size: 64
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: cosine
63
  - lr_scheduler_warmup_ratio: 0.1
64
- - num_epochs: 3
65
 
66
  ### Training results
67
 
 
 
 
68
 
69
 
70
  ### Framework versions
71
 
72
  - Transformers 4.39.0.dev0
73
- - Pytorch 2.2.1+cu121
74
  - Datasets 2.14.6
75
  - Tokenizers 0.15.2
 
1
  ---
2
  base_model: dmis-lab/selfbiorag_7b
3
  tags:
 
4
  - trl
5
  - dpo
6
  - generated_from_trainer
 
 
 
 
 
7
  model-index:
8
  - name: selfbiorag-7b-dpo-full-wo-healthsearch_qa-ep3
9
  results: []
 
14
 
15
  # selfbiorag-7b-dpo-full-wo-healthsearch_qa-ep3
16
 
17
+ This model is a fine-tuned version of [dmis-lab/selfbiorag_7b](https://huggingface.co/dmis-lab/selfbiorag_7b) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Logits/chosen: -1.6968
20
+ - Logits/rejected: -1.6723
21
+ - Logps/chosen: -158.8701
22
+ - Logps/rejected: -170.2428
23
+ - Loss: 0.6691
24
+ - Rewards/accuracies: 0.6941
25
+ - Rewards/chosen: 0.0706
26
+ - Rewards/margins: 0.0503
27
+ - Rewards/rejected: 0.0202
28
 
29
  ## Model description
30
 
 
48
  - eval_batch_size: 8
49
  - seed: 42
50
  - distributed_type: multi-GPU
51
+ - num_devices: 4
52
  - gradient_accumulation_steps: 2
53
+ - total_train_batch_size: 64
54
+ - total_eval_batch_size: 32
55
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
  - lr_scheduler_type: cosine
57
  - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 1
59
 
60
  ### Training results
61
 
62
+ | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
63
+ |:-------------:|:-----:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
64
+ | 0.6508 | 0.61 | 100 | -1.6968 | -1.6723 | -158.8701 | -170.2428 | 0.6691 | 0.6941 | 0.0706 | 0.0503 | 0.0202 |
65
 
66
 
67
  ### Framework versions
68
 
69
  - Transformers 4.39.0.dev0
70
+ - Pytorch 2.1.2
71
  - Datasets 2.14.6
72
  - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,21 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_logits/chosen": -1.680073618888855,
4
- "eval_logits/rejected": -1.7620524168014526,
5
- "eval_logps/chosen": -349.61962890625,
6
- "eval_logps/rejected": -92.87176513671875,
7
- "eval_loss": 0.5131469964981079,
8
- "eval_rewards/accuracies": 1.0,
9
- "eval_rewards/chosen": 0.3873787820339203,
10
- "eval_rewards/margins": 0.40528222918510437,
11
- "eval_rewards/rejected": -0.017903532832860947,
12
- "eval_runtime": 67.3461,
13
- "eval_samples": 3077,
14
- "eval_samples_per_second": 45.689,
15
- "eval_steps_per_second": 0.728,
16
- "train_loss": 0.5981406688690185,
17
- "train_runtime": 346.5059,
18
- "train_samples": 1885,
19
- "train_samples_per_second": 16.32,
20
- "train_steps_per_second": 0.13
21
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.2519006322069866,
4
+ "train_runtime": 787.0698,
5
+ "train_samples": 10477,
6
+ "train_samples_per_second": 13.311,
7
+ "train_steps_per_second": 0.208
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c510f855c5a77fa0a39abce44f3906dd30865ea22922bfa3b4c0f5d175b65c9b
3
  size 4939116424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3bf12878dd219177ecbd7042879cc4e9c7ee0478427bc28d5cd74360bc1c66
3
  size 4939116424
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c100741650f0af22f89c2e95607cfb10da99f616315a183657a64cce801c9e28
3
  size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfe885f61c314c6938f605dc911c1de2fe4bbbb4426c99f25c33f776e6ac3dcc
3
  size 4947390880
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff7646b79096dc96e6fc68d15c163e5fbb19476cf9f8dbf71cae1dfbf52ca7ed
3
  size 3590619888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4779ffacf1ced93b0fae7c156d31864dc9b11c079ff1599f2ad368083d507fef
3
  size 3590619888
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "train_loss": 0.5981406688690185,
4
- "train_runtime": 346.5059,
5
- "train_samples": 1885,
6
- "train_samples_per_second": 16.32,
7
- "train_steps_per_second": 0.13
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.2519006322069866,
4
+ "train_runtime": 787.0698,
5
+ "train_samples": 10477,
6
+ "train_samples_per_second": 13.311,
7
+ "train_steps_per_second": 0.208
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 100,
6
- "global_step": 45,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.07,
13
- "grad_norm": 14.650781993147133,
14
- "learning_rate": 1e-07,
15
- "logits/chosen": -1.8544178009033203,
16
- "logits/rejected": -1.5131595134735107,
17
- "logps/chosen": -382.39324951171875,
18
- "logps/rejected": -74.34650421142578,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,79 +24,275 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.67,
28
- "grad_norm": 13.355694493789834,
29
- "learning_rate": 4.809698831278217e-07,
30
- "logits/chosen": -1.7981030941009521,
31
- "logits/rejected": -1.8637254238128662,
32
- "logps/chosen": -297.70697021484375,
33
- "logps/rejected": -91.71724700927734,
34
- "loss": 0.6887,
35
- "rewards/accuracies": 0.625,
36
- "rewards/chosen": 0.0073351990431547165,
37
- "rewards/margins": 0.007683979347348213,
38
- "rewards/rejected": -0.000348779110936448,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 1.33,
43
- "grad_norm": 10.957160162463524,
44
- "learning_rate": 3.4567085809127245e-07,
45
- "logits/chosen": -1.7605440616607666,
46
- "logits/rejected": -1.6980727910995483,
47
- "logps/chosen": -317.1847229003906,
48
- "logps/rejected": -89.17781066894531,
49
- "loss": 0.6424,
50
- "rewards/accuracies": 0.9937499761581421,
51
- "rewards/chosen": 0.10211040079593658,
52
- "rewards/margins": 0.10564006865024567,
53
- "rewards/rejected": -0.003529661800712347,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 2.0,
58
- "grad_norm": 9.631401952273302,
59
- "learning_rate": 1.5432914190872756e-07,
60
- "logits/chosen": -1.7424694299697876,
61
- "logits/rejected": -1.690342903137207,
62
- "logps/chosen": -304.099609375,
63
- "logps/rejected": -91.2957763671875,
64
- "loss": 0.5679,
65
- "rewards/accuracies": 1.0,
66
- "rewards/chosen": 0.263149231672287,
67
- "rewards/margins": 0.2803087830543518,
68
- "rewards/rejected": -0.017159538343548775,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 2.67,
73
- "grad_norm": 9.39281581673066,
74
- "learning_rate": 1.9030116872178314e-08,
75
- "logits/chosen": -1.7320054769515991,
76
- "logits/rejected": -1.6545593738555908,
77
- "logps/chosen": -303.2212829589844,
78
- "logps/rejected": -99.20294189453125,
79
- "loss": 0.5282,
80
- "rewards/accuracies": 1.0,
81
- "rewards/chosen": 0.36085695028305054,
82
- "rewards/margins": 0.40523195266723633,
83
- "rewards/rejected": -0.04437502473592758,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 3.0,
88
- "step": 45,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  "total_flos": 0.0,
90
- "train_loss": 0.5981406688690185,
91
- "train_runtime": 346.5059,
92
- "train_samples_per_second": 16.32,
93
- "train_steps_per_second": 0.13
94
  }
95
  ],
96
  "logging_steps": 10,
97
- "max_steps": 45,
98
  "num_input_tokens_seen": 0,
99
- "num_train_epochs": 3,
100
  "save_steps": 100,
101
  "total_flos": 0.0,
102
  "train_batch_size": 8,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 164,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "grad_norm": 6.5958876428735564,
14
+ "learning_rate": 2.941176470588235e-08,
15
+ "logits/chosen": -1.6130714416503906,
16
+ "logits/rejected": -1.7848026752471924,
17
+ "logps/chosen": -143.55209350585938,
18
+ "logps/rejected": -137.43441772460938,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.06,
28
+ "grad_norm": 5.967532383605112,
29
+ "learning_rate": 2.941176470588235e-07,
30
+ "logits/chosen": -1.8283494710922241,
31
+ "logits/rejected": -1.7852643728256226,
32
+ "logps/chosen": -158.81536865234375,
33
+ "logps/rejected": -151.6327362060547,
34
+ "loss": 0.693,
35
+ "rewards/accuracies": 0.4722222089767456,
36
+ "rewards/chosen": 8.654648991068825e-05,
37
+ "rewards/margins": 0.0005829257424920797,
38
+ "rewards/rejected": -0.0004963793326169252,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.12,
43
+ "grad_norm": 5.606818404653461,
44
+ "learning_rate": 4.994863481875841e-07,
45
+ "logits/chosen": -1.8151414394378662,
46
+ "logits/rejected": -1.7734615802764893,
47
+ "logps/chosen": -151.97584533691406,
48
+ "logps/rejected": -164.20437622070312,
49
+ "loss": 0.6923,
50
+ "rewards/accuracies": 0.581250011920929,
51
+ "rewards/chosen": 0.003920617047697306,
52
+ "rewards/margins": 0.0024364024866372347,
53
+ "rewards/rejected": 0.001484214561060071,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.18,
58
+ "grad_norm": 6.452038531330129,
59
+ "learning_rate": 4.904133592102591e-07,
60
+ "logits/chosen": -1.8305763006210327,
61
+ "logits/rejected": -1.7172702550888062,
62
+ "logps/chosen": -154.3677520751953,
63
+ "logps/rejected": -148.50753784179688,
64
+ "loss": 0.6882,
65
+ "rewards/accuracies": 0.6937500238418579,
66
+ "rewards/chosen": 0.016027290374040604,
67
+ "rewards/margins": 0.00950100552290678,
68
+ "rewards/rejected": 0.006526285316795111,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.24,
73
+ "grad_norm": 6.2953570308846825,
74
+ "learning_rate": 4.704015606870022e-07,
75
+ "logits/chosen": -1.7697455883026123,
76
+ "logits/rejected": -1.7966588735580444,
77
+ "logps/chosen": -143.58848571777344,
78
+ "logps/rejected": -166.49522399902344,
79
+ "loss": 0.6829,
80
+ "rewards/accuracies": 0.675000011920929,
81
+ "rewards/chosen": 0.03490619733929634,
82
+ "rewards/margins": 0.02003355883061886,
83
+ "rewards/rejected": 0.014872634783387184,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.3,
88
+ "grad_norm": 6.274119591898531,
89
+ "learning_rate": 4.4036148959228356e-07,
90
+ "logits/chosen": -1.7394487857818604,
91
+ "logits/rejected": -1.804693579673767,
92
+ "logps/chosen": -159.61492919921875,
93
+ "logps/rejected": -136.1581268310547,
94
+ "loss": 0.6763,
95
+ "rewards/accuracies": 0.8062499761581421,
96
+ "rewards/chosen": 0.06023404002189636,
97
+ "rewards/margins": 0.042321957647800446,
98
+ "rewards/rejected": 0.017912080511450768,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.37,
103
+ "grad_norm": 6.180992532830828,
104
+ "learning_rate": 4.016599693735638e-07,
105
+ "logits/chosen": -1.6605278253555298,
106
+ "logits/rejected": -1.724905252456665,
107
+ "logps/chosen": -146.7899932861328,
108
+ "logps/rejected": -148.02505493164062,
109
+ "loss": 0.6733,
110
+ "rewards/accuracies": 0.762499988079071,
111
+ "rewards/chosen": 0.0686994269490242,
112
+ "rewards/margins": 0.04312276840209961,
113
+ "rewards/rejected": 0.02557666040956974,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.43,
118
+ "grad_norm": 5.590599679916071,
119
+ "learning_rate": 3.5605791947475926e-07,
120
+ "logits/chosen": -1.7533237934112549,
121
+ "logits/rejected": -1.702845811843872,
122
+ "logps/chosen": -146.6136474609375,
123
+ "logps/rejected": -140.97921752929688,
124
+ "loss": 0.6631,
125
+ "rewards/accuracies": 0.7250000238418579,
126
+ "rewards/chosen": 0.07920090854167938,
127
+ "rewards/margins": 0.053236376494169235,
128
+ "rewards/rejected": 0.0259645227342844,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.49,
133
+ "grad_norm": 5.096416269116106,
134
+ "learning_rate": 3.056302334890786e-07,
135
+ "logits/chosen": -1.616193413734436,
136
+ "logits/rejected": -1.6094154119491577,
137
+ "logps/chosen": -142.79188537597656,
138
+ "logps/rejected": -140.85447692871094,
139
+ "loss": 0.6609,
140
+ "rewards/accuracies": 0.78125,
141
+ "rewards/chosen": 0.09861920028924942,
142
+ "rewards/margins": 0.0706188827753067,
143
+ "rewards/rejected": 0.028000324964523315,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.55,
148
+ "grad_norm": 5.517912420297569,
149
+ "learning_rate": 2.526713714858433e-07,
150
+ "logits/chosen": -1.608278512954712,
151
+ "logits/rejected": -1.5585658550262451,
152
+ "logps/chosen": -132.39981079101562,
153
+ "logps/rejected": -143.10488891601562,
154
+ "loss": 0.6557,
155
+ "rewards/accuracies": 0.7562500238418579,
156
+ "rewards/chosen": 0.1142318844795227,
157
+ "rewards/margins": 0.07896542549133301,
158
+ "rewards/rejected": 0.0352664515376091,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.61,
163
+ "grad_norm": 5.179137970855667,
164
+ "learning_rate": 1.9959096206109175e-07,
165
+ "logits/chosen": -1.5899827480316162,
166
+ "logits/rejected": -1.5742290019989014,
167
+ "logps/chosen": -136.0356903076172,
168
+ "logps/rejected": -162.7815704345703,
169
+ "loss": 0.6508,
170
+ "rewards/accuracies": 0.7562500238418579,
171
+ "rewards/chosen": 0.11762702465057373,
172
+ "rewards/margins": 0.08622404932975769,
173
+ "rewards/rejected": 0.03140297532081604,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.61,
178
+ "eval_logits/chosen": -1.6967989206314087,
179
+ "eval_logits/rejected": -1.6722551584243774,
180
+ "eval_logps/chosen": -158.87005615234375,
181
+ "eval_logps/rejected": -170.24278259277344,
182
+ "eval_loss": 0.6690559983253479,
183
+ "eval_rewards/accuracies": 0.6940954923629761,
184
+ "eval_rewards/chosen": 0.07056128978729248,
185
+ "eval_rewards/margins": 0.050339534878730774,
186
+ "eval_rewards/rejected": 0.020221758633852005,
187
+ "eval_runtime": 1977.6877,
188
+ "eval_samples_per_second": 9.659,
189
+ "eval_steps_per_second": 0.302,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.67,
194
+ "grad_norm": 5.410829812028072,
195
+ "learning_rate": 1.4880416421940154e-07,
196
+ "logits/chosen": -1.6502714157104492,
197
+ "logits/rejected": -1.6523603200912476,
198
+ "logps/chosen": -134.38687133789062,
199
+ "logps/rejected": -157.00936889648438,
200
+ "loss": 0.6512,
201
+ "rewards/accuracies": 0.8062499761581421,
202
+ "rewards/chosen": 0.11995081603527069,
203
+ "rewards/margins": 0.09394902735948563,
204
+ "rewards/rejected": 0.026001790538430214,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.73,
209
+ "grad_norm": 5.845780336717107,
210
+ "learning_rate": 1.0262177762208507e-07,
211
+ "logits/chosen": -1.565212607383728,
212
+ "logits/rejected": -1.6423566341400146,
213
+ "logps/chosen": -143.96304321289062,
214
+ "logps/rejected": -149.28546142578125,
215
+ "loss": 0.6496,
216
+ "rewards/accuracies": 0.7875000238418579,
217
+ "rewards/chosen": 0.13488885760307312,
218
+ "rewards/margins": 0.10831846296787262,
219
+ "rewards/rejected": 0.026570383459329605,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.79,
224
+ "grad_norm": 5.76403048084688,
225
+ "learning_rate": 6.31451011862412e-08,
226
+ "logits/chosen": -1.6332323551177979,
227
+ "logits/rejected": -1.6044152975082397,
228
+ "logps/chosen": -137.62985229492188,
229
+ "logps/rejected": -159.90980529785156,
230
+ "loss": 0.6439,
231
+ "rewards/accuracies": 0.737500011920929,
232
+ "rewards/chosen": 0.13013367354869843,
233
+ "rewards/margins": 0.10071909427642822,
234
+ "rewards/rejected": 0.02941458486020565,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.85,
239
+ "grad_norm": 5.119446644831888,
240
+ "learning_rate": 3.217032396915265e-08,
241
+ "logits/chosen": -1.569746971130371,
242
+ "logits/rejected": -1.6146259307861328,
243
+ "logps/chosen": -130.83258056640625,
244
+ "logps/rejected": -160.59701538085938,
245
+ "loss": 0.6439,
246
+ "rewards/accuracies": 0.78125,
247
+ "rewards/chosen": 0.1322535276412964,
248
+ "rewards/margins": 0.10249896347522736,
249
+ "rewards/rejected": 0.029754554852843285,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.91,
254
+ "grad_norm": 5.590191167835734,
255
+ "learning_rate": 1.1106798553464802e-08,
256
+ "logits/chosen": -1.6109774112701416,
257
+ "logits/rejected": -1.607143759727478,
258
+ "logps/chosen": -145.5422821044922,
259
+ "logps/rejected": -155.8082733154297,
260
+ "loss": 0.6426,
261
+ "rewards/accuracies": 0.8062499761581421,
262
+ "rewards/chosen": 0.14719954133033752,
263
+ "rewards/margins": 0.11081697046756744,
264
+ "rewards/rejected": 0.03638254478573799,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.98,
269
+ "grad_norm": 5.417981503927173,
270
+ "learning_rate": 9.129154946982687e-10,
271
+ "logits/chosen": -1.5755327939987183,
272
+ "logits/rejected": -1.6533405780792236,
273
+ "logps/chosen": -144.75936889648438,
274
+ "logps/rejected": -150.3732452392578,
275
+ "loss": 0.6439,
276
+ "rewards/accuracies": 0.8062499761581421,
277
+ "rewards/chosen": 0.1261114478111267,
278
+ "rewards/margins": 0.10229575634002686,
279
+ "rewards/rejected": 0.023815687745809555,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 1.0,
284
+ "step": 164,
285
  "total_flos": 0.0,
286
+ "train_loss": 0.2519006322069866,
287
+ "train_runtime": 787.0698,
288
+ "train_samples_per_second": 13.311,
289
+ "train_steps_per_second": 0.208
290
  }
291
  ],
292
  "logging_steps": 10,
293
+ "max_steps": 164,
294
  "num_input_tokens_seen": 0,
295
+ "num_train_epochs": 1,
296
  "save_steps": 100,
297
  "total_flos": 0.0,
298
  "train_batch_size": 8,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4aa92a8826c6d09e2cf1a597f1a350d4b560367e1b01a843b0bed4d983eaabf7
3
  size 6264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f15ca60ba3b07f2027c55b34635f9cf9250401baa46a13461b5c1fbbfdd3f5
3
  size 6264