# en_setec_mk_tv / count_extraction_component.py
from spacy.tokens import Span
from spacy.language import Language
import re


# https://spacy.io/usage/processing-pipelines#custom-components
@Language.factory("count_extraction_component")
class CountExtractorComponent:
    """Splits a count off an entity's text, e.g. "2 x HDMI" -> count=2, text="HDMI".

    By default it only extracts counts from CONNECTION entities, but the label
    can be changed via the factory config.
    """

    def __init__(self, nlp, name, label="CONNECTION"):
        self.label = label
        # "2 x HDMI"  -> count=2, name="HDMI"
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        # "HDMI x 2"  -> count=2, name="HDMI"
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # "HDMI 2 x"  -> count=2, name="HDMI"
        self.reg_right_inverted = re.compile(r"^(?P<name>.+)\s*(?P<count>\d+)\s*[xX]$")
        # Register custom extensions on spans: ._.text holds the cleaned name,
        # ._.count holds the extracted multiplier.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

    def __call__(self, doc):
        for ent in doc.ents:
            text = ent.text.strip()
            if ent.label_ != self.label:
                ent._.text = text
                continue
            m = self.reg_left.match(text)
            if m is not None:
                groups = m.groupdict()
                ent._.text = groups["name"].strip()
                ent._.count = int(groups["count"])
                continue
            m = self.reg_right.match(text)
            if m is not None:
                groups = m.groupdict()
                ent._.text = groups["name"].strip()
                ent._.count = int(groups["count"])
                continue
            m = self.reg_right_inverted.match(text)
            if m is not None:
                groups = m.groupdict()
                ent._.text = groups["name"].strip()
                ent._.count = int(groups["count"])
                continue
            # No count pattern matched: keep the entity text and default to 1.
            ent._.text = text
            ent._.count = 1
        return doc
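

# Usage sketch (illustrative, not part of the original module): build a minimal
# blank pipeline with an entity_ruler that emits a CONNECTION entity, then let
# the component above split the count from the name. The example text and the
# ruler pattern are assumptions for demonstration only; in the packaged
# pipeline the entities would come from its own NER component.
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([{"label": "CONNECTION", "pattern": "2 x HDMI"}])
    nlp.add_pipe("count_extraction_component", config={"label": "CONNECTION"})

    doc = nlp("The TV has 2 x HDMI inputs.")
    for ent in doc.ents:
        print(ent.label_, ent._.text, ent._.count)  # CONNECTION HDMI 2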