|
from spacy.tokens import Doc, Span, Token |
|
from spacy.matcher import PhraseMatcher |
|
from spacy.util import filter_spans |
|
from spacy.language import Language |
|
import re |
|
|
|
|
|
@Language.factory("count_extraction_component")
class CountExtractorComponent:
    """Pipeline component that splits a multiplier ("2x", "x 3") off entities.

    For every entity in ``doc.ents`` the component fills two custom ``Span``
    extensions:

    * ``ent._.text``  -- the stripped entity text with the multiplier removed
    * ``ent._.count`` -- the multiplier as an ``int``

    Entities whose label equals ``label`` are tested against three spellings,
    in this order:

    1. ``<count> x <name>``   e.g. ``"2 x USB"``
    2. ``<name> x <count>``   e.g. ``"USB x 2"``
    3. ``<name> <count> x``   e.g. ``"USB 2x"``

    If none of them matches, the whole stripped text is kept and the count is
    set to 1.  Entities with any other label only receive ``_.text`` (the
    stripped entity text); their ``_.count`` is not written by this component.
    """

    def __init__(self, nlp, name, label="CONNECTION"):
        """Compile the multiplier patterns and register the Span extensions.

        Args:
            nlp: Pipeline object passed in by the factory; not used here.
            name: Component name passed in by the factory; not used here.
            label: Entity label whose spans are parsed for a multiplier.
        """
        self.label = label
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # The name group is lazy here on purpose: a greedy ".+" swallows the
        # leading digits of the count, so "cable 12x" was parsed as
        # name="cable 1", count=2 instead of name="cable", count=12.
        self.reg_right_inverted = re.compile(
            r"^(?P<name>.+?)\s*(?P<count>\d+)\s*[xX]$"
        )

        # force=True lets the component be constructed more than once without
        # raising because the extensions are already registered.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

    def _split_count(self, text):
        """Return ``(name, count)`` parsed from an already stripped *text*.

        The patterns are tried in their documented order and the first match
        wins.  Without any match the text is returned unchanged with count 1.
        """
        patterns = (self.reg_left, self.reg_right, self.reg_right_inverted)
        for pattern in patterns:
            found = pattern.match(text)
            if found is not None:
                return found.group("name").strip(), int(found.group("count"))
        return text, 1

    def __call__(self, doc):
        """Annotate every entity of *doc* in place and return *doc*."""
        for ent in doc.ents:
            text = ent.text.strip()

            if ent.label_ != self.label:
                # Other labels are passed through: only the cleaned text is
                # stored, the count extension is left as it is.
                ent._.text = text
                continue

            cleaned, count = self._split_count(text)
            ent._.text = cleaned
            ent._.count = count

        return doc
|
|