Andrea Colombo committed on
Commit 1c89313
1 Parent(s): 4793ab9

updated model

README.md CHANGED
@@ -21,9 +21,6 @@ A Mistral-7B-instruct-v0.1 model to extract a title from the text of Italian law
  - **Finetuned from model:** mistralai/Mistral-7B-Instruct-v0.1
 
 
- ## How to Get Started with the Model
-
-
  ## Training Details
 
  ### Training Procedure
@@ -34,5 +31,5 @@ We use the paged Adam optimizer, a learning rate of 0.004, and a cosine learning
 
  ## Evaluation
 
- The best model reported an evaluation loss of 1.241.
+ The best model reported an evaluation loss of 1.0030452013015747
 
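The commit drops the README's "How to Get Started with the Model" section without replacing it. As a stand-in, here is a minimal usage sketch, assuming this repository hosts a PEFT/LoRA adapter (the commit updates adapter_model.safetensors) on top of mistralai/Mistral-7B-Instruct-v0.1; `REPO_ID` and the Italian prompt wording are placeholders, not taken from the model card.

```python
# Hedged sketch: load the base model plus the adapter from this repo and ask for a title.
# REPO_ID is a placeholder for this model's Hub id; the prompt format is illustrative only.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("REPO_ID")
model = PeftModel.from_pretrained(base, "REPO_ID")

prompt = "[INST] Genera un titolo per il seguente testo di legge:\n<testo della legge> [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```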
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2e293ab6a52bb26bb69612ccc6c47d2b1c68f04232cc0dad369b1fc4a9456de0
+ oid sha256:437dddfa09ec4b75291a5a4b67d5e895271aabe35ccd9f94f8d7cd2f0026b8ce
  size 109069176
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:111e9cd1e94b04428204e1f0d3962c18104226f8b74c3852cf0d4fcad2ec7c12
+ oid sha256:69c244805725ec0e2048087095e20b54f816a06e887a2c967137a7874a5cddf3
  size 218211962
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:16efb5e84d31149576cd7f684c4f49c3efb70127b39a234041019d54c6ea9612
+ oid sha256:2273825db7916378144c89e647988b4652a7fa2205b00f782cfe68d02a3a468b
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:81007ec48272bbdc4f9622c046f9c026bf8120ed11d1398fd97bb5168a6f3dda
+ oid sha256:9d082bc4e6445151e7041dd6814dd33a4ee1bada774f792aa86b6581d4dd20f7
  size 1064
tokenizer.json CHANGED
@@ -31,23 +31,13 @@
       "special": true
     }
   ],
-  "normalizer": {
-    "type": "Sequence",
-    "normalizers": [
-      {
-        "type": "Prepend",
-        "prepend": "▁"
-      },
-      {
-        "type": "Replace",
-        "pattern": {
-          "String": " "
-        },
-        "content": "▁"
-      }
-    ]
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Metaspace",
+    "replacement": "▁",
+    "prepend_scheme": "first",
+    "split": false
   },
-  "pre_tokenizer": null,
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
tokenizer_config.json CHANGED
@@ -31,7 +31,7 @@
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
-  "legacy": true,
+  "legacy": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
   "sp_model_kwargs": {},
trainer_state.json CHANGED
@@ -1,128 +1,344 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.008880994671403197,
-  "eval_steps": 20,
-  "global_step": 100,
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 8500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0008880994671403197,
-      "grad_norm": 0.3568251430988312,
+      "epoch": 0.09411764705882353,
+      "grad_norm": 0.30087053775787354,
       "learning_rate": 0.0002,
-      "loss": 1.6891,
-      "step": 10
+      "loss": 1.4482,
+      "step": 200
     },
     {
-      "epoch": 0.0017761989342806395,
-      "grad_norm": 0.29317253828048706,
+      "epoch": 0.18823529411764706,
+      "grad_norm": 0.26450973749160767,
       "learning_rate": 0.0002,
-      "loss": 1.5007,
-      "step": 20
+      "loss": 1.3122,
+      "step": 400
     },
     {
-      "epoch": 0.0017761989342806395,
-      "eval_loss": 1.4433443546295166,
-      "eval_runtime": 2364.2696,
-      "eval_samples_per_second": 4.79,
-      "eval_steps_per_second": 0.599,
-      "step": 20
+      "epoch": 0.2823529411764706,
+      "grad_norm": 0.2747337520122528,
+      "learning_rate": 0.0002,
+      "loss": 1.2564,
+      "step": 600
     },
     {
-      "epoch": 0.0026642984014209592,
-      "grad_norm": 0.2844996154308319,
+      "epoch": 0.3764705882352941,
+      "grad_norm": 0.291003555059433,
       "learning_rate": 0.0002,
-      "loss": 1.4145,
-      "step": 30
+      "loss": 1.2267,
+      "step": 800
     },
     {
-      "epoch": 0.003552397868561279,
-      "grad_norm": 0.26554158329963684,
+      "epoch": 0.47058823529411764,
+      "grad_norm": 0.29030027985572815,
       "learning_rate": 0.0002,
-      "loss": 1.3632,
-      "step": 40
+      "loss": 1.2034,
+      "step": 1000
     },
     {
-      "epoch": 0.003552397868561279,
-      "eval_loss": 1.3292763233184814,
-      "eval_runtime": 2364.4165,
-      "eval_samples_per_second": 4.79,
-      "eval_steps_per_second": 0.599,
-      "step": 40
+      "epoch": 0.5647058823529412,
+      "grad_norm": 0.41770946979522705,
+      "learning_rate": 0.0002,
+      "loss": 1.1856,
+      "step": 1200
     },
     {
-      "epoch": 0.004440497335701598,
-      "grad_norm": 0.2824796140193939,
+      "epoch": 0.6588235294117647,
+      "grad_norm": 0.29782745242118835,
       "learning_rate": 0.0002,
-      "loss": 1.3502,
-      "step": 50
+      "loss": 1.1751,
+      "step": 1400
     },
     {
-      "epoch": 0.0053285968028419185,
-      "grad_norm": 0.2961088716983795,
+      "epoch": 0.7529411764705882,
+      "grad_norm": 0.28214672207832336,
       "learning_rate": 0.0002,
-      "loss": 1.2801,
-      "step": 60
+      "loss": 1.1574,
+      "step": 1600
     },
     {
-      "epoch": 0.0053285968028419185,
-      "eval_loss": 1.2857508659362793,
-      "eval_runtime": 2364.2023,
-      "eval_samples_per_second": 4.79,
-      "eval_steps_per_second": 0.599,
-      "step": 60
+      "epoch": 0.8470588235294118,
+      "grad_norm": 0.29813048243522644,
+      "learning_rate": 0.0002,
+      "loss": 1.1432,
+      "step": 1800
     },
     {
-      "epoch": 0.006216696269982238,
-      "grad_norm": 0.3122478723526001,
+      "epoch": 0.9411764705882353,
+      "grad_norm": 0.3031373918056488,
       "learning_rate": 0.0002,
-      "loss": 1.2985,
-      "step": 70
+      "loss": 1.1341,
+      "step": 2000
     },
     {
-      "epoch": 0.007104795737122558,
-      "grad_norm": 0.2596866488456726,
+      "epoch": 1.0,
+      "eval_loss": 1.1254174709320068,
+      "eval_runtime": 443.4321,
+      "eval_samples_per_second": 4.792,
+      "eval_steps_per_second": 0.6,
+      "step": 2125
+    },
+    {
+      "epoch": 1.035294117647059,
+      "grad_norm": 0.30628809332847595,
       "learning_rate": 0.0002,
-      "loss": 1.3061,
-      "step": 80
+      "loss": 1.111,
+      "step": 2200
     },
     {
-      "epoch": 0.007104795737122558,
-      "eval_loss": 1.2607264518737793,
-      "eval_runtime": 2364.551,
-      "eval_samples_per_second": 4.789,
-      "eval_steps_per_second": 0.599,
-      "step": 80
+      "epoch": 1.1294117647058823,
+      "grad_norm": 0.3264883756637573,
+      "learning_rate": 0.0002,
+      "loss": 1.0876,
+      "step": 2400
+    },
+    {
+      "epoch": 1.223529411764706,
+      "grad_norm": 0.3304358720779419,
+      "learning_rate": 0.0002,
+      "loss": 1.0777,
+      "step": 2600
+    },
+    {
+      "epoch": 1.3176470588235294,
+      "grad_norm": 0.3507118821144104,
+      "learning_rate": 0.0002,
+      "loss": 1.075,
+      "step": 2800
+    },
+    {
+      "epoch": 1.4117647058823528,
+      "grad_norm": 0.34798240661621094,
+      "learning_rate": 0.0002,
+      "loss": 1.0705,
+      "step": 3000
+    },
+    {
+      "epoch": 1.5058823529411764,
+      "grad_norm": 0.33348146080970764,
+      "learning_rate": 0.0002,
+      "loss": 1.0616,
+      "step": 3200
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.3142307698726654,
+      "learning_rate": 0.0002,
+      "loss": 1.0592,
+      "step": 3400
+    },
+    {
+      "epoch": 1.6941176470588235,
+      "grad_norm": 0.33189332485198975,
+      "learning_rate": 0.0002,
+      "loss": 1.0563,
+      "step": 3600
+    },
+    {
+      "epoch": 1.788235294117647,
+      "grad_norm": 0.31737592816352844,
+      "learning_rate": 0.0002,
+      "loss": 1.0508,
+      "step": 3800
+    },
+    {
+      "epoch": 1.8823529411764706,
+      "grad_norm": 0.2998281717300415,
+      "learning_rate": 0.0002,
+      "loss": 1.0468,
+      "step": 4000
+    },
+    {
+      "epoch": 1.9764705882352942,
+      "grad_norm": 0.34619805216789246,
+      "learning_rate": 0.0002,
+      "loss": 1.0422,
+      "step": 4200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.061407208442688,
+      "eval_runtime": 443.4749,
+      "eval_samples_per_second": 4.792,
+      "eval_steps_per_second": 0.6,
+      "step": 4250
+    },
+    {
+      "epoch": 2.070588235294118,
+      "grad_norm": 0.35598576068878174,
+      "learning_rate": 0.0002,
+      "loss": 1.0008,
+      "step": 4400
+    },
+    {
+      "epoch": 2.164705882352941,
+      "grad_norm": 0.3873290419578552,
+      "learning_rate": 0.0002,
+      "loss": 0.9917,
+      "step": 4600
+    },
+    {
+      "epoch": 2.2588235294117647,
+      "grad_norm": 0.3637497127056122,
+      "learning_rate": 0.0002,
+      "loss": 0.9911,
+      "step": 4800
+    },
+    {
+      "epoch": 2.3529411764705883,
+      "grad_norm": 0.35753560066223145,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 5000
+    },
+    {
+      "epoch": 2.447058823529412,
+      "grad_norm": 0.3278402090072632,
+      "learning_rate": 0.0002,
+      "loss": 0.9876,
+      "step": 5200
+    },
+    {
+      "epoch": 2.541176470588235,
+      "grad_norm": 0.3679386377334595,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 5400
+    },
+    {
+      "epoch": 2.635294117647059,
+      "grad_norm": 0.3931664526462555,
+      "learning_rate": 0.0002,
+      "loss": 0.9885,
+      "step": 5600
+    },
+    {
+      "epoch": 2.7294117647058824,
+      "grad_norm": 0.3553083539009094,
+      "learning_rate": 0.0002,
+      "loss": 0.9806,
+      "step": 5800
+    },
+    {
+      "epoch": 2.8235294117647056,
+      "grad_norm": 0.37587428092956543,
+      "learning_rate": 0.0002,
+      "loss": 0.9796,
+      "step": 6000
+    },
+    {
+      "epoch": 2.9176470588235293,
+      "grad_norm": 0.3934173285961151,
+      "learning_rate": 0.0002,
+      "loss": 0.9786,
+      "step": 6200
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.0245047807693481,
+      "eval_runtime": 443.1629,
+      "eval_samples_per_second": 4.795,
+      "eval_steps_per_second": 0.6,
+      "step": 6375
+    },
+    {
+      "epoch": 3.011764705882353,
+      "grad_norm": 0.4304977059364319,
+      "learning_rate": 0.0002,
+      "loss": 0.9719,
+      "step": 6400
     },
     {
-      "epoch": 0.007992895204262877,
-      "grad_norm": 0.28420358896255493,
+      "epoch": 3.1058823529411765,
+      "grad_norm": 0.39775729179382324,
       "learning_rate": 0.0002,
-      "loss": 1.2627,
-      "step": 90
+      "loss": 0.9174,
+      "step": 6600
     },
     {
-      "epoch": 0.008880994671403197,
-      "grad_norm": 0.27427056431770325,
+      "epoch": 3.2,
+      "grad_norm": 0.40233707427978516,
       "learning_rate": 0.0002,
-      "loss": 1.2262,
-      "step": 100
+      "loss": 0.9271,
+      "step": 6800
     },
     {
-      "epoch": 0.008880994671403197,
-      "eval_loss": 1.2411292791366577,
-      "eval_runtime": 2364.3817,
-      "eval_samples_per_second": 4.79,
+      "epoch": 3.2941176470588234,
+      "grad_norm": 0.39777445793151855,
+      "learning_rate": 0.0002,
+      "loss": 0.9155,
+      "step": 7000
+    },
+    {
+      "epoch": 3.388235294117647,
+      "grad_norm": 0.4547841548919678,
+      "learning_rate": 0.0002,
+      "loss": 0.9265,
+      "step": 7200
+    },
+    {
+      "epoch": 3.4823529411764707,
+      "grad_norm": 0.3900696039199829,
+      "learning_rate": 0.0002,
+      "loss": 0.9314,
+      "step": 7400
+    },
+    {
+      "epoch": 3.576470588235294,
+      "grad_norm": 0.5135142207145691,
+      "learning_rate": 0.0002,
+      "loss": 0.9302,
+      "step": 7600
+    },
+    {
+      "epoch": 3.6705882352941175,
+      "grad_norm": 0.40233081579208374,
+      "learning_rate": 0.0002,
+      "loss": 0.9227,
+      "step": 7800
+    },
+    {
+      "epoch": 3.764705882352941,
+      "grad_norm": 0.40172523260116577,
+      "learning_rate": 0.0002,
+      "loss": 0.9268,
+      "step": 8000
+    },
+    {
+      "epoch": 3.8588235294117648,
+      "grad_norm": 0.38751304149627686,
+      "learning_rate": 0.0002,
+      "loss": 0.9243,
+      "step": 8200
+    },
+    {
+      "epoch": 3.9529411764705884,
+      "grad_norm": 0.39530816674232483,
+      "learning_rate": 0.0002,
+      "loss": 0.9293,
+      "step": 8400
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.0030452013015747,
+      "eval_runtime": 443.7686,
+      "eval_samples_per_second": 4.789,
       "eval_steps_per_second": 0.599,
-      "step": 100
+      "step": 8500
     }
   ],
-  "logging_steps": 10,
-  "max_steps": 100,
+  "logging_steps": 200,
+  "max_steps": 8500,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
+  "num_train_epochs": 4,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -136,7 +352,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.50843194834944e+16,
+  "total_flos": 2.981816312902189e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a27525a3487d07abf90399803e3b6b9e91108ad784516779b81492131f72e08b
+ oid sha256:ef090e649e75bdbc5f76322c821cf4080d1354b66700ada68766b6363474c05a
  size 5368