osanseviero HF staff committed on
Commit
dde4984
•
1 Parent(s): c6e1f7a

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -63,8 +63,8 @@ Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter,
63
  | Feature | Description |
64
  | --- | --- |
65
  | **Name** | `zh_core_web_lg` |
66
- | **Version** | `3.1.0` |
67
- | **spaCy** | `>=3.1.0,<3.2.0` |
68
  | **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `ner` |
69
  | **Components** | `tok2vec`, `tagger`, `parser`, `senter`, `attribute_ruler`, `ner` |
70
  | **Vectors** | 500000 keys, 500000 unique vectors (300 dimensions) |
@@ -92,12 +92,15 @@ Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter,
92
  | Type | Score |
93
  | --- | --- |
94
  | `TOKEN_ACC` | 97.88 |
 
 
 
95
  | `TAG_ACC` | 90.37 |
 
 
 
96
  | `DEP_UAS` | 70.69 |
97
  | `DEP_LAS` | 65.55 |
98
  | `ENTS_P` | 73.59 |
99
  | `ENTS_R` | 69.11 |
100
- | `ENTS_F` | 71.28 |
101
- | `SENTS_P` | 78.96 |
102
- | `SENTS_R` | 72.86 |
103
- | `SENTS_F` | 75.79 |
 
63
  | Feature | Description |
64
  | --- | --- |
65
  | **Name** | `zh_core_web_lg` |
66
+ | **Version** | `3.2.0` |
67
+ | **spaCy** | `>=3.2.0,<3.3.0` |
68
  | **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `ner` |
69
  | **Components** | `tok2vec`, `tagger`, `parser`, `senter`, `attribute_ruler`, `ner` |
70
  | **Vectors** | 500000 keys, 500000 unique vectors (300 dimensions) |
 
92
  | Type | Score |
93
  | --- | --- |
94
  | `TOKEN_ACC` | 97.88 |
95
+ | `TOKEN_P` | 94.58 |
96
+ | `TOKEN_R` | 91.36 |
97
+ | `TOKEN_F` | 92.94 |
98
  | `TAG_ACC` | 90.37 |
99
+ | `SENTS_P` | 78.96 |
100
+ | `SENTS_R` | 72.86 |
101
+ | `SENTS_F` | 75.79 |
102
  | `DEP_UAS` | 70.69 |
103
  | `DEP_LAS` | 65.55 |
104
  | `ENTS_P` | 73.59 |
105
  | `ENTS_R` | 69.11 |
106
+ | `ENTS_F` | 71.28 |
 
 
 
accuracy.json CHANGED
@@ -1,15 +1,14 @@
1
  {
2
  "token_acc": 0.9788303388,
 
 
 
3
  "tag_acc": 0.9037457747,
4
- "dep_uas": 0.7069146954,
5
- "dep_las": 0.6555390607,
6
- "ents_p": 0.7358998362,
7
- "ents_r": 0.6910989011,
8
- "ents_f": 0.7127961011,
9
  "sents_p": 0.7896445968,
10
  "sents_r": 0.7286499084,
11
  "sents_f": 0.7579220779,
12
- "speed": 9733.8076235494,
 
13
  "dep_las_per_type": {
14
  "dep": {
15
  "p": 0.4876810512,
@@ -237,6 +236,9 @@
237
  "f": 0.9176470588
238
  }
239
  },
 
 
 
240
  "ents_per_type": {
241
  "DATE": {
242
  "p": 0.7675925926,
@@ -328,5 +330,6 @@
328
  "r": 0.5555555556,
329
  "f": 0.5555555556
330
  }
331
- }
 
332
  }
 
1
  {
2
  "token_acc": 0.9788303388,
3
+ "token_p": 0.9458325855,
4
+ "token_r": 0.9136060443,
5
+ "token_f": 0.9294400505,
6
  "tag_acc": 0.9037457747,
 
 
 
 
 
7
  "sents_p": 0.7896445968,
8
  "sents_r": 0.7286499084,
9
  "sents_f": 0.7579220779,
10
+ "dep_uas": 0.7069146954,
11
+ "dep_las": 0.6555390607,
12
  "dep_las_per_type": {
13
  "dep": {
14
  "p": 0.4876810512,
 
236
  "f": 0.9176470588
237
  }
238
  },
239
+ "ents_p": 0.7358998362,
240
+ "ents_r": 0.6910989011,
241
+ "ents_f": 0.7127961011,
242
  "ents_per_type": {
243
  "DATE": {
244
  "p": 0.7675925926,
 
330
  "r": 0.5555555556,
331
  "f": 0.5555555556
332
  }
333
+ },
334
+ "speed": 7127.6040150529
335
  }
attribute_ruler/patterns CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
 
config.cfg CHANGED
@@ -1,10 +1,8 @@
1
  [paths]
2
- train = "corpus/zh-core-news/train.spacy"
3
- dev = "corpus/zh-core-news/dev.spacy"
4
- vectors = "corpus/zh_vectors"
5
- raw = null
6
  init_tok2vec = null
7
- vocab_data = null
8
 
9
  [system]
10
  gpu_allocator = null
@@ -27,12 +25,14 @@ segmenter = "pkuseg"
27
 
28
  [components.attribute_ruler]
29
  factory = "attribute_ruler"
 
30
  validate = false
31
 
32
  [components.ner]
33
  factory = "ner"
34
  incorrect_spans_key = null
35
  moves = null
 
36
  update_with_oracle_cut_size = 100
37
 
38
  [components.ner.model]
@@ -66,6 +66,7 @@ factory = "parser"
66
  learn_tokens = false
67
  min_action_freq = 30
68
  moves = null
 
69
  update_with_oracle_cut_size = 100
70
 
71
  [components.parser.model]
@@ -84,6 +85,8 @@ upstream = "tok2vec"
84
 
85
  [components.senter]
86
  factory = "senter"
 
 
87
 
88
  [components.senter.model]
89
  @architectures = "spacy.Tagger.v1"
@@ -108,6 +111,8 @@ maxout_pieces = 2
108
 
109
  [components.tagger]
110
  factory = "tagger"
 
 
111
 
112
  [components.tagger.model]
113
  @architectures = "spacy.Tagger.v1"
@@ -142,17 +147,17 @@ maxout_pieces = 3
142
 
143
  [corpora.dev]
144
  @readers = "spacy.Corpus.v1"
145
- limit = 0
146
- max_length = 0
147
- path = ${paths:dev}
148
  gold_preproc = false
 
 
149
  augmenter = null
150
 
151
  [corpora.train]
152
  @readers = "spacy.Corpus.v1"
153
- path = ${paths:train}
154
- max_length = 5000
155
  gold_preproc = false
 
156
  limit = 0
157
  augmenter = null
158
 
@@ -185,9 +190,8 @@ compound = 1.001
185
  t = 0.0
186
 
187
  [training.logger]
188
- @loggers = "spacy.WandbLogger.v1"
189
- project_name = "spacy-v3.0.0a2"
190
- remove_config_values = []
191
 
192
  [training.optimizer]
193
  @optimizers = "Adam.v1"
@@ -201,22 +205,23 @@ eps = 0.00000001
201
  learn_rate = 0.001
202
 
203
  [training.score_weights]
204
- tag_acc = 0.24
205
  dep_uas = 0.0
206
- dep_las = 0.24
207
  dep_las_per_type = null
208
  sents_p = null
209
  sents_r = null
210
- sents_f = 0.03
211
- ents_f = 0.5
212
  ents_p = 0.0
213
  ents_r = 0.0
214
  ents_per_type = null
 
215
 
216
  [pretraining]
217
 
218
  [initialize]
219
- vocab_data = ${paths.vocab_data}
220
  vectors = ${paths.vectors}
221
  init_tok2vec = ${paths.init_tok2vec}
222
  before_init = null
 
1
  [paths]
2
+ train = null
3
+ dev = null
4
+ vectors = null
 
5
  init_tok2vec = null
 
6
 
7
  [system]
8
  gpu_allocator = null
 
25
 
26
  [components.attribute_ruler]
27
  factory = "attribute_ruler"
28
+ scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
29
  validate = false
30
 
31
  [components.ner]
32
  factory = "ner"
33
  incorrect_spans_key = null
34
  moves = null
35
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
36
  update_with_oracle_cut_size = 100
37
 
38
  [components.ner.model]
 
66
  learn_tokens = false
67
  min_action_freq = 30
68
  moves = null
69
+ scorer = {"@scorers":"spacy.parser_scorer.v1"}
70
  update_with_oracle_cut_size = 100
71
 
72
  [components.parser.model]
 
85
 
86
  [components.senter]
87
  factory = "senter"
88
+ overwrite = false
89
+ scorer = {"@scorers":"spacy.senter_scorer.v1"}
90
 
91
  [components.senter.model]
92
  @architectures = "spacy.Tagger.v1"
 
111
 
112
  [components.tagger]
113
  factory = "tagger"
114
+ overwrite = false
115
+ scorer = {"@scorers":"spacy.tagger_scorer.v1"}
116
 
117
  [components.tagger.model]
118
  @architectures = "spacy.Tagger.v1"
 
147
 
148
  [corpora.dev]
149
  @readers = "spacy.Corpus.v1"
150
+ path = ${paths.dev}
 
 
151
  gold_preproc = false
152
+ max_length = 0
153
+ limit = 0
154
  augmenter = null
155
 
156
  [corpora.train]
157
  @readers = "spacy.Corpus.v1"
158
+ path = ${paths.train}
 
159
  gold_preproc = false
160
+ max_length = 0
161
  limit = 0
162
  augmenter = null
163
 
 
190
  t = 0.0
191
 
192
  [training.logger]
193
+ @loggers = "spacy.ConsoleLogger.v1"
194
+ progress_bar = false
 
195
 
196
  [training.optimizer]
197
  @optimizers = "Adam.v1"
 
205
  learn_rate = 0.001
206
 
207
  [training.score_weights]
208
+ tag_acc = 0.32
209
  dep_uas = 0.0
210
+ dep_las = 0.32
211
  dep_las_per_type = null
212
  sents_p = null
213
  sents_r = null
214
+ sents_f = 0.04
215
+ ents_f = 0.32
216
  ents_p = 0.0
217
  ents_r = 0.0
218
  ents_per_type = null
219
+ speed = 0.0
220
 
221
  [pretraining]
222
 
223
  [initialize]
224
+ vocab_data = null
225
  vectors = ${paths.vectors}
226
  init_tok2vec = ${paths.init_tok2vec}
227
  before_init = null
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"zh",
3
  "name":"core_web_lg",
4
- "version":"3.1.0",
5
  "description":"Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.",
6
  "author":"Explosion",
7
  "email":"contact@explosion.ai",
8
  "url":"https://explosion.ai",
9
  "license":"MIT",
10
- "spacy_version":">=3.1.0,<3.2.0",
11
- "spacy_git_version":"caba63b74",
12
  "vectors":{
13
  "width":300,
14
  "vectors":500000,
@@ -152,16 +152,15 @@
152
  ],
153
  "performance":{
154
  "token_acc":0.9788303388,
 
 
 
155
  "tag_acc":0.9037457747,
156
- "dep_uas":0.7069146954,
157
- "dep_las":0.6555390607,
158
- "ents_p":0.7358998362,
159
- "ents_r":0.6910989011,
160
- "ents_f":0.7127961011,
161
  "sents_p":0.7896445968,
162
  "sents_r":0.7286499084,
163
  "sents_f":0.7579220779,
164
- "speed":9733.8076235494,
 
165
  "dep_las_per_type":{
166
  "dep":{
167
  "p":0.4876810512,
@@ -389,6 +388,9 @@
389
  "f":0.9176470588
390
  }
391
  },
 
 
 
392
  "ents_per_type":{
393
  "DATE":{
394
  "p":0.7675925926,
@@ -480,7 +482,8 @@
480
  "r":0.5555555556,
481
  "f":0.5555555556
482
  }
483
- }
 
484
  },
485
  "sources":[
486
  {
 
1
  {
2
  "lang":"zh",
3
  "name":"core_web_lg",
4
+ "version":"3.2.0",
5
  "description":"Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.",
6
  "author":"Explosion",
7
  "email":"contact@explosion.ai",
8
  "url":"https://explosion.ai",
9
  "license":"MIT",
10
+ "spacy_version":">=3.2.0,<3.3.0",
11
+ "spacy_git_version":"bb26550e2",
12
  "vectors":{
13
  "width":300,
14
  "vectors":500000,
 
152
  ],
153
  "performance":{
154
  "token_acc":0.9788303388,
155
+ "token_p":0.9458325855,
156
+ "token_r":0.9136060443,
157
+ "token_f":0.9294400505,
158
  "tag_acc":0.9037457747,
 
 
 
 
 
159
  "sents_p":0.7896445968,
160
  "sents_r":0.7286499084,
161
  "sents_f":0.7579220779,
162
+ "dep_uas":0.7069146954,
163
+ "dep_las":0.6555390607,
164
  "dep_las_per_type":{
165
  "dep":{
166
  "p":0.4876810512,
 
388
  "f":0.9176470588
389
  }
390
  },
391
+ "ents_p":0.7358998362,
392
+ "ents_r":0.6910989011,
393
+ "ents_f":0.7127961011,
394
  "ents_per_type":{
395
  "DATE":{
396
  "p":0.7675925926,
 
482
  "r":0.5555555556,
483
  "f":0.5555555556
484
  }
485
+ },
486
+ "speed":7127.6040150529
487
  },
488
  "sources":[
489
  {
senter/cfg CHANGED
@@ -1,3 +1,3 @@
1
  {
2
-
3
  }
 
1
  {
2
+ "overwrite":false
3
  }
tagger/cfg CHANGED
@@ -36,5 +36,6 @@
36
  "VE",
37
  "VV",
38
  "X"
39
- ]
 
40
  }
 
36
  "VE",
37
  "VV",
38
  "X"
39
+ ],
40
+ "overwrite":false
41
  }
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:401539f9b54cffa79ffd8de96bdd43f4a6caff75dbb63a9cb3655696190fcfb6
3
- size 9845085
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9860bff8f8b50d10c77f43b97e932359ecb16be487fab650fd5e7ae3895101fc
3
+ size 10513704
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }
zh_core_web_lg-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:292a92db6ef0ef5c60756e6de7bc98bb43fdf92655b6def5fb7558e2e8cd8474
3
- size 603784210
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ad7469433d4402b3d24083af28f41c8b1f7da5cd016146a843b7c35efc4745f
3
+ size 603932201