zlucia committed on
Commit
7978995
1 Parent(s): 4c75d24

End of training

Browse files
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 1.1407
20
 
21
  ## Model description
22
 
@@ -35,7 +35,7 @@ More information needed
35
  ### Training hyperparameters
36
 
37
  The following hyperparameters were used during training:
38
- - learning_rate: 5e-05
39
  - train_batch_size: 4
40
  - eval_batch_size: 4
41
  - seed: 42
@@ -50,11 +50,11 @@ The following hyperparameters were used during training:
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
- | 1.0956 | 0.34 | 50 | 1.1443 |
54
- | 1.0635 | 0.68 | 100 | 1.1242 |
55
- | 1.0208 | 1.02 | 150 | 1.1208 |
56
- | 0.9505 | 1.36 | 200 | 1.1454 |
57
- | 0.9604 | 1.7 | 250 | 1.1407 |
58
 
59
 
60
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 1.1312
20
 
21
  ## Model description
22
 
 
35
  ### Training hyperparameters
36
 
37
  The following hyperparameters were used during training:
38
+ - learning_rate: 3e-05
39
  - train_batch_size: 4
40
  - eval_batch_size: 4
41
  - seed: 42
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 1.1066 | 0.34 | 50 | 1.1555 |
54
+ | 1.0655 | 0.68 | 100 | 1.1333 |
55
+ | 1.039 | 1.02 | 150 | 1.1279 |
56
+ | 1.0166 | 1.36 | 200 | 1.1301 |
57
+ | 1.0281 | 1.7 | 250 | 1.1312 |
58
 
59
 
60
  ### Framework versions
adapter_config.json CHANGED
@@ -10,7 +10,7 @@
10
  "layers_to_transform": null,
11
  "loftq_config": {},
12
  "lora_alpha": 16,
13
- "lora_dropout": 0.05,
14
  "megatron_config": null,
15
  "megatron_core": "megatron.core",
16
  "modules_to_save": null,
@@ -20,12 +20,12 @@
20
  "revision": null,
21
  "target_modules": [
22
  "up_proj",
23
- "q_proj",
24
- "down_proj",
25
  "o_proj",
 
 
26
  "k_proj",
27
- "gate_proj",
28
- "v_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
 
10
  "layers_to_transform": null,
11
  "loftq_config": {},
12
  "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
  "megatron_config": null,
15
  "megatron_core": "megatron.core",
16
  "modules_to_save": null,
 
20
  "revision": null,
21
  "target_modules": [
22
  "up_proj",
 
 
23
  "o_proj",
24
+ "v_proj",
25
+ "q_proj",
26
  "k_proj",
27
+ "down_proj",
28
+ "gate_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce3c104d98913b0d1196f95870d59002ba294eb55e9e70d4cc69e72653f71914
3
  size 335605144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c4d086cbb272795476d23287af5b86202bac7bff9ae8f867b0da552176415b7
3
  size 335605144
all_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_loss": 1.1439318656921387,
4
- "eval_runtime": 15.5616,
5
- "eval_samples_per_second": 16.836,
6
- "eval_steps_per_second": 4.241,
7
- "train_loss": 1.0086434495692351,
8
- "train_runtime": 872.7075,
9
- "train_samples_per_second": 5.399,
10
- "train_steps_per_second": 0.337
11
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_loss": 1.1402091979980469,
4
+ "eval_runtime": 15.8109,
5
+ "eval_samples_per_second": 16.571,
6
+ "eval_steps_per_second": 4.174,
7
+ "train_loss": 1.0460824836679057,
8
+ "train_runtime": 890.7293,
9
+ "train_samples_per_second": 5.29,
10
+ "train_steps_per_second": 0.33
11
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_loss": 1.1439318656921387,
4
- "eval_runtime": 15.5616,
5
- "eval_samples_per_second": 16.836,
6
- "eval_steps_per_second": 4.241
7
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_loss": 1.1402091979980469,
4
+ "eval_runtime": 15.8109,
5
+ "eval_samples_per_second": 16.571,
6
+ "eval_steps_per_second": 4.174
7
  }
metrics.json CHANGED
@@ -1 +1 @@
1
- {"run_name": "./output", "train_runtime": 861.2997, "train_samples_per_second": 5.471, "train_steps_per_second": 0.341, "train_loss": 1.0092324691564858, "epoch": 2.0, "eval_loss": 1.1439318656921387, "eval_runtime": 15.5616, "eval_samples_per_second": 16.836, "eval_steps_per_second": 4.241}
 
1
+ {"run_name": "./output", "train_runtime": 872.7075, "train_samples_per_second": 5.399, "train_steps_per_second": 0.337, "train_loss": 1.0086434495692351, "epoch": 2.0, "eval_loss": 1.1402091979980469, "eval_runtime": 15.8109, "eval_samples_per_second": 16.571, "eval_steps_per_second": 4.174}
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 512,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 256,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 1.0086434495692351,
4
- "train_runtime": 872.7075,
5
- "train_samples_per_second": 5.399,
6
- "train_steps_per_second": 0.337
7
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 1.0460824836679057,
4
+ "train_runtime": 890.7293,
5
+ "train_samples_per_second": 5.29,
6
+ "train_steps_per_second": 0.33
7
  }
trainer_state.json CHANGED
@@ -10,226 +10,226 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.07,
13
- "learning_rate": 5e-05,
14
- "loss": 1.1701,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.14,
19
- "learning_rate": 5e-05,
20
- "loss": 1.1349,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.2,
25
- "learning_rate": 5e-05,
26
- "loss": 1.124,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.27,
31
- "learning_rate": 5e-05,
32
- "loss": 1.0961,
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.34,
37
- "learning_rate": 5e-05,
38
- "loss": 1.0956,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.34,
43
- "eval_loss": 1.1443167924880981,
44
- "eval_runtime": 15.8615,
45
- "eval_samples_per_second": 16.518,
46
- "eval_steps_per_second": 4.161,
47
  "step": 50
48
  },
49
  {
50
  "epoch": 0.41,
51
- "learning_rate": 5e-05,
52
- "loss": 1.0875,
53
  "step": 60
54
  },
55
  {
56
  "epoch": 0.48,
57
- "learning_rate": 5e-05,
58
- "loss": 1.0766,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.54,
63
- "learning_rate": 5e-05,
64
- "loss": 1.1265,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.61,
69
- "learning_rate": 5e-05,
70
- "loss": 1.1126,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 0.68,
75
- "learning_rate": 5e-05,
76
- "loss": 1.0635,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.68,
81
- "eval_loss": 1.124241828918457,
82
- "eval_runtime": 15.8746,
83
- "eval_samples_per_second": 16.504,
84
- "eval_steps_per_second": 4.158,
85
  "step": 100
86
  },
87
  {
88
  "epoch": 0.75,
89
- "learning_rate": 5e-05,
90
- "loss": 1.0434,
91
  "step": 110
92
  },
93
  {
94
  "epoch": 0.81,
95
- "learning_rate": 5e-05,
96
- "loss": 1.1071,
97
  "step": 120
98
  },
99
  {
100
  "epoch": 0.88,
101
- "learning_rate": 5e-05,
102
- "loss": 1.0926,
103
  "step": 130
104
  },
105
  {
106
  "epoch": 0.95,
107
- "learning_rate": 5e-05,
108
- "loss": 1.0677,
109
  "step": 140
110
  },
111
  {
112
  "epoch": 1.02,
113
- "learning_rate": 5e-05,
114
- "loss": 1.0208,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.02,
119
- "eval_loss": 1.1208317279815674,
120
- "eval_runtime": 15.9371,
121
- "eval_samples_per_second": 16.44,
122
- "eval_steps_per_second": 4.141,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.09,
127
- "learning_rate": 5e-05,
128
- "loss": 0.9435,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 1.15,
133
- "learning_rate": 5e-05,
134
- "loss": 1.0034,
135
  "step": 170
136
  },
137
  {
138
  "epoch": 1.22,
139
- "learning_rate": 5e-05,
140
- "loss": 0.9165,
141
  "step": 180
142
  },
143
  {
144
  "epoch": 1.29,
145
- "learning_rate": 5e-05,
146
- "loss": 0.8848,
147
  "step": 190
148
  },
149
  {
150
  "epoch": 1.36,
151
- "learning_rate": 5e-05,
152
- "loss": 0.9505,
153
  "step": 200
154
  },
155
  {
156
  "epoch": 1.36,
157
- "eval_loss": 1.1453500986099243,
158
- "eval_runtime": 15.8776,
159
- "eval_samples_per_second": 16.501,
160
- "eval_steps_per_second": 4.157,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 1.43,
165
- "learning_rate": 5e-05,
166
- "loss": 0.9259,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.49,
171
- "learning_rate": 5e-05,
172
- "loss": 0.8383,
173
  "step": 220
174
  },
175
  {
176
  "epoch": 1.56,
177
- "learning_rate": 5e-05,
178
- "loss": 0.9317,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.63,
183
- "learning_rate": 5e-05,
184
- "loss": 0.9336,
185
  "step": 240
186
  },
187
  {
188
  "epoch": 1.7,
189
- "learning_rate": 5e-05,
190
- "loss": 0.9604,
191
  "step": 250
192
  },
193
  {
194
  "epoch": 1.7,
195
- "eval_loss": 1.1406688690185547,
196
- "eval_runtime": 15.8651,
197
- "eval_samples_per_second": 16.514,
198
- "eval_steps_per_second": 4.16,
199
  "step": 250
200
  },
201
  {
202
  "epoch": 1.77,
203
- "learning_rate": 5e-05,
204
- "loss": 0.8884,
205
  "step": 260
206
  },
207
  {
208
  "epoch": 1.83,
209
- "learning_rate": 5e-05,
210
- "loss": 0.9757,
211
  "step": 270
212
  },
213
  {
214
  "epoch": 1.9,
215
- "learning_rate": 5e-05,
216
- "loss": 0.8977,
217
  "step": 280
218
  },
219
  {
220
  "epoch": 1.97,
221
- "learning_rate": 5e-05,
222
- "loss": 0.8645,
223
  "step": 290
224
  },
225
  {
226
  "epoch": 2.0,
227
  "step": 294,
228
- "total_flos": 7.119009482145792e+16,
229
- "train_loss": 1.0086434495692351,
230
- "train_runtime": 872.7075,
231
- "train_samples_per_second": 5.399,
232
- "train_steps_per_second": 0.337
233
  }
234
  ],
235
  "logging_steps": 10,
@@ -237,7 +237,7 @@
237
  "num_input_tokens_seen": 0,
238
  "num_train_epochs": 2,
239
  "save_steps": 250,
240
- "total_flos": 7.119009482145792e+16,
241
  "train_batch_size": 4,
242
  "trial_name": null,
243
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.07,
13
+ "learning_rate": 3e-05,
14
+ "loss": 1.2187,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.14,
19
+ "learning_rate": 3e-05,
20
+ "loss": 1.1481,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.2,
25
+ "learning_rate": 3e-05,
26
+ "loss": 1.1437,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.27,
31
+ "learning_rate": 3e-05,
32
+ "loss": 1.1076,
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.34,
37
+ "learning_rate": 3e-05,
38
+ "loss": 1.1066,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.34,
43
+ "eval_loss": 1.1555328369140625,
44
+ "eval_runtime": 15.6804,
45
+ "eval_samples_per_second": 16.709,
46
+ "eval_steps_per_second": 4.209,
47
  "step": 50
48
  },
49
  {
50
  "epoch": 0.41,
51
+ "learning_rate": 3e-05,
52
+ "loss": 1.0988,
53
  "step": 60
54
  },
55
  {
56
  "epoch": 0.48,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.0917,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.54,
63
+ "learning_rate": 3e-05,
64
+ "loss": 1.1344,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.61,
69
+ "learning_rate": 3e-05,
70
+ "loss": 1.1167,
71
  "step": 90
72
  },
73
  {
74
  "epoch": 0.68,
75
+ "learning_rate": 3e-05,
76
+ "loss": 1.0655,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.68,
81
+ "eval_loss": 1.133277177810669,
82
+ "eval_runtime": 15.7203,
83
+ "eval_samples_per_second": 16.666,
84
+ "eval_steps_per_second": 4.198,
85
  "step": 100
86
  },
87
  {
88
  "epoch": 0.75,
89
+ "learning_rate": 3e-05,
90
+ "loss": 1.0595,
91
  "step": 110
92
  },
93
  {
94
  "epoch": 0.81,
95
+ "learning_rate": 3e-05,
96
+ "loss": 1.1085,
97
  "step": 120
98
  },
99
  {
100
  "epoch": 0.88,
101
+ "learning_rate": 3e-05,
102
+ "loss": 1.0945,
103
  "step": 130
104
  },
105
  {
106
  "epoch": 0.95,
107
+ "learning_rate": 3e-05,
108
+ "loss": 1.0726,
109
  "step": 140
110
  },
111
  {
112
  "epoch": 1.02,
113
+ "learning_rate": 3e-05,
114
+ "loss": 1.039,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.02,
119
+ "eval_loss": 1.1278975009918213,
120
+ "eval_runtime": 15.7982,
121
+ "eval_samples_per_second": 16.584,
122
+ "eval_steps_per_second": 4.178,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.09,
127
+ "learning_rate": 3e-05,
128
+ "loss": 0.9891,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 1.15,
133
+ "learning_rate": 3e-05,
134
+ "loss": 1.0617,
135
  "step": 170
136
  },
137
  {
138
  "epoch": 1.22,
139
+ "learning_rate": 3e-05,
140
+ "loss": 0.98,
141
  "step": 180
142
  },
143
  {
144
  "epoch": 1.29,
145
+ "learning_rate": 3e-05,
146
+ "loss": 0.9531,
147
  "step": 190
148
  },
149
  {
150
  "epoch": 1.36,
151
+ "learning_rate": 3e-05,
152
+ "loss": 1.0166,
153
  "step": 200
154
  },
155
  {
156
  "epoch": 1.36,
157
+ "eval_loss": 1.1301459074020386,
158
+ "eval_runtime": 15.7471,
159
+ "eval_samples_per_second": 16.638,
160
+ "eval_steps_per_second": 4.191,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 1.43,
165
+ "learning_rate": 3e-05,
166
+ "loss": 0.9869,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.49,
171
+ "learning_rate": 3e-05,
172
+ "loss": 0.9066,
173
  "step": 220
174
  },
175
  {
176
  "epoch": 1.56,
177
+ "learning_rate": 3e-05,
178
+ "loss": 0.9884,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.63,
183
+ "learning_rate": 3e-05,
184
+ "loss": 0.9885,
185
  "step": 240
186
  },
187
  {
188
  "epoch": 1.7,
189
+ "learning_rate": 3e-05,
190
+ "loss": 1.0281,
191
  "step": 250
192
  },
193
  {
194
  "epoch": 1.7,
195
+ "eval_loss": 1.1312241554260254,
196
+ "eval_runtime": 15.6611,
197
+ "eval_samples_per_second": 16.729,
198
+ "eval_steps_per_second": 4.214,
199
  "step": 250
200
  },
201
  {
202
  "epoch": 1.77,
203
+ "learning_rate": 3e-05,
204
+ "loss": 0.9723,
205
  "step": 260
206
  },
207
  {
208
  "epoch": 1.83,
209
+ "learning_rate": 3e-05,
210
+ "loss": 1.0428,
211
  "step": 270
212
  },
213
  {
214
  "epoch": 1.9,
215
+ "learning_rate": 3e-05,
216
+ "loss": 0.961,
217
  "step": 280
218
  },
219
  {
220
  "epoch": 1.97,
221
+ "learning_rate": 3e-05,
222
+ "loss": 0.9311,
223
  "step": 290
224
  },
225
  {
226
  "epoch": 2.0,
227
  "step": 294,
228
+ "total_flos": 7.002531209517466e+16,
229
+ "train_loss": 1.0460824836679057,
230
+ "train_runtime": 890.7293,
231
+ "train_samples_per_second": 5.29,
232
+ "train_steps_per_second": 0.33
233
  }
234
  ],
235
  "logging_steps": 10,
 
237
  "num_input_tokens_seen": 0,
238
  "num_train_epochs": 2,
239
  "save_steps": 250,
240
+ "total_flos": 7.002531209517466e+16,
241
  "train_batch_size": 4,
242
  "trial_name": null,
243
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc89040a415c72790d5fd7b4dba8d591833a97e8e25926a0bf37205f0fa84bae
3
- size 6712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7354e725785da8b7b0a039b25e28ac3e5e811514125d390c40da0e60c2ef9472
3
+ size 6648