|
""" |
|
This module contains various rule-based components aiming to improve on baseline lemmatization tools. |
|
""" |
|
|
|
import re |
|
from typing import List, Callable |
|
|
|
from spacy.lang.hu import Hungarian |
|
from spacy.pipeline import Pipe |
|
from spacy.tokens import Token |
|
from spacy.tokens.doc import Doc |
|
|
|
|
|
@Hungarian.component(
    "lemma_case_smoother",
    assigns=["token.lemma"],
    requires=["token.lemma", "token.pos"],
)
def lemma_case_smoother(doc: Doc) -> Doc:
    """Smooth lemma casing by POS.

    Lowercases the lemma of every sentence-initial token that is not a proper
    noun, undoing the sentence-initial capitalization that leaks into lemmas.

    DEPRECATED: This is not needed anymore, as the lemmatizer is now case-insensitive.

    Args:
        doc (Doc): Input document.

    Returns:
        Doc: Output document (modified in place).
    """

    for token in doc:
        # FIX: this check previously read `token.tag_`, although the component
        # declares `requires=["token.pos"]` and "PROPN" is a Universal
        # Dependencies coarse POS value — so the coarse tag (`pos_`) is the
        # attribute that reliably carries it.
        if token.is_sent_start and token.pos_ != "PROPN":
            token.lemma_ = token.lemma_.lower()

    return doc
|
|
|
|
|
class LemmaSmoother(Pipe):
    """Smooths lemma by fixing common errors of the edit-tree lemmatizer."""

    # Matches a number carrying a Hungarian date suffix, e.g. "5-én", "3-ától";
    # group 1 is the bare number.
    _DATE_PATTERN = re.compile(r"(\d+)-j?[éá]?n?a?(t[őó]l)?")
    # Matches a number, optionally with internal separators and a trailing "%".
    _NUMBER_PATTERN = re.compile(r"(\d+([-,/_.:]?(._)?\d+)*%?)")

    @staticmethod
    @Hungarian.factory("lemma_smoother", assigns=["token.lemma"], requires=["token.lemma", "token.pos"])
    def create_lemma_smoother(nlp: Hungarian, name: str) -> "LemmaSmoother":
        """Factory for the `lemma_smoother` pipeline component.

        Args:
            nlp (Hungarian): The pipeline object (unused).
            name (str): The component's name in the pipeline (unused).

        Returns:
            LemmaSmoother: A new component instance.
        """
        return LemmaSmoother()

    def __call__(self, doc: Doc) -> Doc:
        """Applies every smoothing rule to each token of the document.

        Args:
            doc (Doc): Input document.

        Returns:
            Doc: Output document (lemmas corrected in place).
        """
        rules: List[Callable[[Token], None]] = [
            self._remove_exclamation_marks,
            self._remove_question_marks,
            self._remove_date_suffixes,
            self._remove_suffix_after_numbers,
        ]

        for token in doc:
            for rule in rules:
                rule(token)

        return doc

    @classmethod
    def _truncate_lemma_at(cls, token: Token, mark: str) -> None:
        """Truncates the lemma at the first occurrence of `mark`.

        Left untouched when the lemma *is* the mark itself (e.g. a lone "!"
        punctuation token). NOTE(review): when the mark is at index 0 of a
        longer lemma, the lemma becomes "" — preserved from the original
        implementation.

        Args:
            token (Token): The original token.
            mark (str): The single character to truncate at.
        """

        if mark != token.lemma_:
            mark_index = token.lemma_.find(mark)
            if mark_index != -1:
                token.lemma_ = token.lemma_[:mark_index]

    @classmethod
    def _remove_exclamation_marks(cls, token: Token) -> None:
        """Removes exclamation marks (and anything after them) from the lemma.

        Args:
            token (Token): The original token.
        """

        cls._truncate_lemma_at(token, "!")

    @classmethod
    def _remove_question_marks(cls, token: Token) -> None:
        """Removes question marks (and anything after them) from the lemma.

        Args:
            token (Token): The original token.
        """

        cls._truncate_lemma_at(token, "?")

    @classmethod
    def _remove_date_suffixes(cls, token: Token) -> None:
        """Fixes the suffixes of dates: "5-én" style lemmas become "5.".

        Args:
            token (Token): The original token.
        """

        if token.pos_ == "NOUN":
            match = cls._DATE_PATTERN.match(token.lemma_)
            if match is not None:
                token.lemma_ = match.group(1) + "."

    @classmethod
    def _remove_suffix_after_numbers(cls, token: Token) -> None:
        """Removes suffixes after numbers by resetting the lemma to the
        numeric portion of the surface form.

        Args:
            token (Token): The original token.
        """

        if token.pos_ == "NUM":
            # Match against the surface text, not the lemma, so an
            # over-aggressive lemmatizer output is discarded entirely.
            match = cls._NUMBER_PATTERN.match(token.text)
            if match is not None:
                token.lemma_ = match.group(0)
|
|