Update spaCy pipeline
- README.md +11 -11
- attribute_ruler/patterns +0 -0
- config.cfg +8 -5
- custom_functions.py +9 -6
- en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl +2 -2
- meta.json +25 -25
- ner/model +0 -0
- parser/model +0 -0
- spancat/cfg +1 -3
- spancat/model +2 -2
- tagger/cfg +0 -1
- tagger/model +0 -0
- tokenizer +0 -0
- trainable_transformer/model +2 -2
- transformer/model +2 -2
- vocab/strings.json +2 -1
README.md
CHANGED
@@ -54,13 +54,13 @@ model-index:
     metrics:
     - name: Sentences F-Score
       type: f_score
-      value: 0.
+      value: 0.9144831558
 ---
 | Feature | Description |
 | --- | --- |
 | **Name** | `en_engagement_spl_RoBERTa_base_attention` |
-| **Version** | `0.0.
-| **spaCy** | `>=3.
+| **Version** | `0.0.2` |
+| **spaCy** | `>=3.4.4,<3.5.0` |
 | **Default Pipeline** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
 | **Components** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -90,16 +90,16 @@ model-index:
 | `DEP_UAS` | 0.00 |
 | `DEP_LAS` | 0.00 |
 | `DEP_LAS_PER_TYPE` | 0.00 |
-| `SENTS_P` |
-| `SENTS_R` |
-| `SENTS_F` |
+| `SENTS_P` | 89.82 |
+| `SENTS_R` | 93.14 |
+| `SENTS_F` | 91.45 |
 | `TAG_ACC` | 0.00 |
 | `ENTS_F` | 0.00 |
 | `ENTS_P` | 0.00 |
 | `ENTS_R` | 0.00 |
 | `LEMMA_ACC` | 0.00 |
-| `SPANS_SC_F` |
-| `SPANS_SC_P` |
-| `SPANS_SC_R` |
-| `TRAINABLE_TRANSFORMER_LOSS` |
-| `SPANCAT_LOSS` |
+| `SPANS_SC_F` | 76.99 |
+| `SPANS_SC_P` | 77.84 |
+| `SPANS_SC_R` | 76.17 |
+| `TRAINABLE_TRANSFORMER_LOSS` | 482.53 |
+| `SPANCAT_LOSS` | 68571.83 |
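Once the wheel shipped in this repository is installed with pip, the updated pipeline loads like any other spaCy package. A minimal usage sketch (the example sentence is illustrative; the `"sc"` key follows the `spans_key` set in `spancat/cfg`):

```python
import spacy

# Load the packaged pipeline (assumes the .whl from this repo has been pip-installed).
nlp = spacy.load("en_engagement_spl_RoBERTa_base_attention")

doc = nlp("Some researchers argue that this claim may not hold in every context.")

# Engagement spans are stored under the "sc" key, per spans_key in spancat/cfg.
for span in doc.spans["sc"]:
    print(span.start, span.end, span.label_, span.text)
```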
attribute_ruler/patterns
CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
config.cfg
CHANGED
@@ -85,11 +85,16 @@ spans_key = ${vars.spans_key}
 threshold = 0.5
 
 [components.spancat.model]
-@architectures = "Attention_SpanCategorizer.
+@architectures = "Attention_SpanCategorizer.v4"
+LSTMdepth = 1
+LSTMdropout = 0.0
+LSTMhidden = 200
 
 [components.spancat.model.reducer]
-@layers = "
+@layers = "mean_max_reducer.v1.5"
 hidden_size = 128
+dropout = 0.0
+depth = 1
 
 [components.spancat.model.scorer]
 @layers = "spacy.LinearLogistic.v1"
@@ -108,7 +113,6 @@ sizes = [1,2,3,4,5,6,7,8,9,10,11,12]
 
 [components.tagger]
 factory = "tagger"
-label_smoothing = 0.0
 neg_prefix = "!"
 overwrite = false
 scorer = {"@scorers":"spacy.tagger_scorer.v1"}
@@ -147,8 +151,8 @@ max_batch_items = 4096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
-name = "roberta-base"
 @architectures = "spacy-transformers.TransformerModel.v3"
+name = "roberta-base"
 mixed_precision = false
 
 [components.transformer.model.get_spans]
@@ -195,7 +199,6 @@ eval_frequency = 200
 frozen_components = ["transformer","parser","tagger","ner","attribute_ruler","lemmatizer"]
 annotating_components = ["parser"]
 before_to_disk = null
-before_update = null
 
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
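`Attention_SpanCategorizer.v4` and `mean_max_reducer.v1.5` are custom registered functions provided by `custom_functions.py`; the architecture change is shown in the next diff, but the reducer itself is not part of this commit's visible hunks. For orientation, a plausible sketch of a reducer registration that would accept the `hidden_size`, `dropout`, and `depth` settings above, modelled on spaCy's built-in `spacy.mean_max_reducer.v1` (name reuse aside, the internals here are assumptions, not the repo's actual code):

```python
from typing import cast

from thinc.api import (Maxout, Model, chain, clone, concatenate,
                       reduce_first, reduce_last, reduce_max, reduce_mean)
from thinc.types import Floats2d, Ragged
from spacy.util import registry


@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer(hidden_size: int,
                           dropout: float = 0.0,
                           depth: int = 1) -> Model[Ragged, Floats2d]:
    """Reduce each span to the concatenation of its first/last/mean/max token
    vectors, then pass it through `depth` Maxout layers of width `hidden_size`."""
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ),
        clone(Maxout(nO=hidden_size, normalize=True, dropout=dropout), depth),
    )
```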
custom_functions.py
CHANGED
@@ -368,7 +368,9 @@ def build_spancat_LSTM_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     reducer: Model[Ragged, Floats2d],
     scorer: Model[Floats2d, Floats2d],
-) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
+    LSTMdepth: int = 1,
+    LSTMdropout: float = 0.0,
+    LSTMhidden: int = 200) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
     """Build a span categorizer model, given a token-to-vector model, a
     reducer model to map the sequence of vectors for each span down to a single
     vector, and a scorer model to map the vectors to probabilities.
@@ -383,17 +385,18 @@ def build_spancat_LSTM_model(
             0,
             chain(
                 tok2vec,
+                PyTorchLSTM(nI=768,
+                            nO=LSTMhidden,
+                            bi=True,
+                            depth=LSTMdepth,
+                            dropout=LSTMdropout),
                 cast(Model[List[Floats2d], Ragged], list2ragged()))))
 
-    attention_layer = chain(
-        ParametricAttention(nO = width),
-        list2ragged())
 
-
     model = chain(
         embedding,
-        attention_layer,
         extract_spans(),
+        ParametricAttention(nO = width),
         reducer,
         scorer,
     )
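Pieced together, the updated `build_spancat_LSTM_model` runs a bidirectional `PyTorchLSTM` over the frozen transformer's token vectors inside the embedding step, and applies `ParametricAttention` to the extracted span vectors just before the reducer rather than to whole documents. A self-contained sketch of the full builder assembled from the hunks above; the imports, the registration decorator, the `set_ref` calls, and the `width = LSTMhidden` line are not visible in this diff and are assumptions (`nI=768` matches the roberta-base hidden width):

```python
from typing import List, Tuple, cast

from thinc.api import (Model, ParametricAttention, PyTorchLSTM, chain,
                       list2ragged, with_getitem)
from thinc.types import Floats2d, Ragged
from spacy.ml.extract_spans import extract_spans
from spacy.tokens import Doc
from spacy.util import registry


@registry.architectures("Attention_SpanCategorizer.v4")
def build_spancat_LSTM_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
    LSTMdepth: int = 1,
    LSTMdropout: float = 0.0,
    LSTMhidden: int = 200) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Embed tokens, re-encode them with a BiLSTM, extract candidate spans,
    weight span tokens with parametric attention, then reduce and score."""
    embedding = cast(
        Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
        with_getitem(
            0,
            chain(
                tok2vec,
                # BiLSTM over the transformer output (768 = roberta-base width).
                PyTorchLSTM(nI=768,
                            nO=LSTMhidden,
                            bi=True,
                            depth=LSTMdepth,
                            dropout=LSTMdropout),
                cast(Model[List[Floats2d], Ragged], list2ragged()))))

    # Assumption: `width` in the diff is the token-vector width that reaches the
    # attention layer after the BiLSTM.
    width = LSTMhidden

    model = chain(
        embedding,
        extract_spans(),
        ParametricAttention(nO=width),
        reducer,
        scorer,
    )
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("reducer", reducer)
    model.set_ref("scorer", scorer)
    return model
```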
en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d645450810f3d7ad0f9567642780fce0ede7d726fcbc5d62f0d48b7f39e80295
+size 903169587
meta.json
CHANGED
@@ -1,14 +1,14 @@
 {
   "lang":"en",
   "name":"engagement_spl_RoBERTa_base_attention",
-  "version":"0.0.
+  "version":"0.0.2",
   "description":"",
   "author":"",
   "email":"",
   "url":"",
   "license":"",
-  "spacy_version":">=3.
-  "spacy_git_version":"
+  "spacy_version":">=3.4.4,<3.5.0",
+  "spacy_git_version":"77833bfef",
   "vectors":{
     "width":0,
     "vectors":0,
@@ -186,9 +186,9 @@
     "dep_uas":0.0,
     "dep_las":0.0,
     "dep_las_per_type":0.0,
-    "sents_p":0.
-    "sents_r":0.
-    "sents_f":0.
+    "sents_p":0.8981900452,
+    "sents_r":0.9313782991,
+    "sents_f":0.9144831558,
     "tag_acc":0.0,
     "ents_f":0.0,
     "ents_p":0.0,
@@ -204,27 +204,22 @@
         "r":0.0,
         "f":0.0
       },
-      "PROCLAIM":{
-        "p":0.0,
-        "r":0.0,
-        "f":0.0
-      },
       "COUNTER":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "PROCLAIM":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "PERSON":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "FAC":{
         "p":0.0,
         "r":0.0,
         "f":0.0
@@ -274,22 +269,27 @@
         "r":0.0,
         "f":0.0
       },
-      "
+      "LAW":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "NORP":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "GPE":{
         "p":0.0,
         "r":0.0,
         "f":0.0
       },
-      "
+      "QUANTITY":{
+        "p":0.0,
+        "r":0.0,
+        "f":0.0
+      },
+      "WORK_OF_ART":{
         "p":0.0,
         "r":0.0,
         "f":0.0
@@ -316,14 +316,14 @@
       }
     },
     "lemma_acc":0.0,
-    "spans_sc_f":0.
-    "spans_sc_p":0.
-    "spans_sc_r":0.
-    "trainable_transformer_loss":
-    "spancat_loss":
+    "spans_sc_f":0.7699386503,
+    "spans_sc_p":0.7783750258,
+    "spans_sc_r":0.7616831883,
+    "trainable_transformer_loss":4.8252773336,
+    "spancat_loss":685.7182957505
   },
   "requirements":[
-    "spacy-transformers>=1.
-    "spacy-experimental>=0.6.
+    "spacy-transformers>=1.1.8,<1.2.0",
+    "spacy-experimental>=0.6.1,<0.7.0"
   ]
 }
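The scores recorded here are also what `spacy.load(...).meta` reports at runtime. A small sketch for reading them straight from the file, assuming the standard spaCy layout where these metrics sit under the top-level `"performance"` key (the path below is a placeholder for wherever the package is unpacked):

```python
import json
from pathlib import Path

# Placeholder path: point this at the unpacked pipeline directory.
meta = json.loads(Path("en_engagement_spl_RoBERTa_base_attention/meta.json").read_text())

perf = meta["performance"]
print("version:", meta["version"])            # 0.0.2
print("sentence F-score:", perf["sents_f"])   # 0.9144831558
print("spancat F/P/R:", perf["spans_sc_f"], perf["spans_sc_p"], perf["spans_sc_r"])
```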
ner/model
CHANGED
Binary files a/ner/model and b/ner/model differ
parser/model
CHANGED
Binary files a/parser/model and b/parser/model differ
spancat/cfg
CHANGED
@@ -13,7 +13,5 @@
   ],
   "spans_key":"sc",
   "threshold":0.5,
-  "max_positive":null,
-  "negative_weight":null,
-  "allow_overlap":true
+  "max_positive":null
 }
spancat/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a619bfcf65a09d7ae3530f875da94ff971cb9cc9541c75a46ba323935c6e99e5
+size 4025694
tagger/cfg
CHANGED
@@ -1,5 +1,4 @@
 {
-  "label_smoothing":0.0,
   "labels":[
     "$",
     "''",
tagger/model
CHANGED
Binary files a/tagger/model and b/tagger/model differ
tokenizer
CHANGED
The diff for this file is too large to render.
trainable_transformer/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0563af895d09415d3a4bb71890938adf4e29777c45ff9d3ebf484677d74f8101
+size 502027816
transformer/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5cb2327f9fbf15d08de679e5cfd9b1ae1fc4c565d3f3438d8e627d85a3c577a3
+size 502027925
vocab/strings.json
CHANGED
@@ -3384,7 +3384,6 @@
   "3500",
   "35087.38",
   "350Ms",
-  "350m",
   "350ms",
   "351",
   "351.2",
@@ -68892,6 +68891,7 @@
   "pennzoil",
   "pens",
   "pensacola",
+  "pensee",
   "pensees",
   "pension",
   "pensions",
@@ -86547,6 +86547,7 @@
   "wolves",
   "womack",
   "woman",
+  "womanize",
   "womanizing",
   "womanly",
   "womans",