|
"""
|
|
This module contains various rule-based components aiming to improve on baseline lemmatization tools.
|
|
"""
|
|
|
|
import re
|
|
from typing import List, Callable
|
|
|
|
from spacy.lang.hu import Hungarian
|
|
from spacy.pipeline import Pipe
|
|
from spacy.tokens import Token
|
|
from spacy.tokens.doc import Doc
|
|
|
|
|
|
@Hungarian.component(
    "lemma_case_smoother",
    assigns=["token.lemma"],
    requires=["token.lemma", "token.pos"],
)
def lemma_case_smoother(doc: Doc) -> Doc:
    """Lower-case the lemma of sentence-initial tokens that are not proper nouns.

    DEPRECATED: This is not needed anymore, as the lemmatizer is now case-insensitive.

    Lemmas are rewritten in place on the tokens of ``doc``.

    NOTE(review): the component declares ``token.pos`` as a requirement, but the
    check below reads ``token.tag_`` -- confirm that the tag scheme of the
    pipeline actually uses the value "PROPN".

    Args:
        doc (Doc): Input document.

    Returns:
        Doc: The same document, with the affected lemmas lower-cased.
    """
    for tok in doc:
        # Only sentence-initial tokens are candidates, and proper nouns keep their casing.
        if not tok.is_sent_start or tok.tag_ == "PROPN":
            continue
        tok.lemma_ = tok.lemma_.lower()

    return doc
|
|
|
|
|
|
class LemmaSmoother(Pipe):
    """Smooths lemma by fixing common errors of the edit-tree lemmatizer.

    The component is registered under the factory name ``lemma_smoother``. When called,
    it passes every token of the document through a fixed sequence of rules, each of
    which may rewrite ``token.lemma_`` in place.
    """

    # Digits, a hyphen, then optional suffix letters (j, é/á, n, a, "től"/"tól").
    # Used with ``match``, so it is anchored at the start of the lemma only and every
    # suffix part is optional: any lemma beginning with "<digits>-" matches.
    _DATE_PATTERN = re.compile(r"(\d+)-j?[éá]?n?a?(t[őó]l)?")

    # A leading run of digits, optionally continued by further digit groups joined with
    # separators such as "-", ",", "/", "_", "." or ":", and an optional trailing "%".
    _NUMBER_PATTERN = re.compile(r"(\d+([-,/_.:]?(._)?\d+)*%?)")

    @staticmethod
    @Hungarian.factory("lemma_smoother", assigns=["token.lemma"], requires=["token.lemma", "token.pos"])
    def create_lemma_smoother(nlp: Hungarian, name: str) -> "LemmaSmoother":
        """Factory function registered as ``lemma_smoother``.

        Args:
            nlp (Hungarian): The language object (unused).
            name (str): The component name (unused).

        Returns:
            LemmaSmoother: A new component instance.
        """
        return LemmaSmoother()

    def __call__(self, doc: Doc) -> Doc:
        """Applies every smoothing rule to every token of the document.

        Args:
            doc (Doc): Input document; its tokens' lemmas are modified in place.

        Returns:
            Doc: The same document.
        """
        # Order matters: each rule sees the lemma left behind by the previous ones.
        rules: List[Callable[[Token], None]] = [
            self._remove_exclamation_marks,
            self._remove_question_marks,
            self._remove_date_suffixes,
            self._remove_suffix_after_numbers,
        ]

        for token in doc:
            for rule in rules:
                rule(token)

        return doc

    @staticmethod
    def _truncate_at_mark(token: Token, mark: str) -> None:
        """Cuts the lemma right before the first occurrence of ``mark``.

        The lemma is left untouched when it does not contain ``mark`` or when it starts
        with it (this covers the bare punctuation token as well as lemmas such as "!?"),
        because truncating at position 0 would leave the token with an empty lemma.

        Args:
            token (Token): The token whose lemma is fixed in place.
            mark (str): The punctuation mark to cut at.
        """
        lemma = token.lemma_
        cut_at = lemma.find(mark)
        # find() gives -1 when absent and 0 when the lemma starts with the mark;
        # in both cases there is no leading word to keep.
        if cut_at > 0:
            token.lemma_ = lemma[:cut_at]

    @classmethod
    def _remove_exclamation_marks(cls, token: Token) -> None:
        """Removes exclamation marks (and everything after them) from the lemma.

        Lemmas that start with an exclamation mark are not changed.

        Args:
            token (Token): The original token.
        """
        cls._truncate_at_mark(token, "!")

    @classmethod
    def _remove_question_marks(cls, token: Token) -> None:
        """Removes question marks (and everything after them) from the lemma.

        Lemmas that start with a question mark are not changed.

        Args:
            token (Token): The original token.
        """
        cls._truncate_at_mark(token, "?")

    @classmethod
    def _remove_date_suffixes(cls, token: Token) -> None:
        """Fixes the suffixes of dates.

        For tokens tagged as NOUN whose lemma starts with digits followed by a hyphen,
        the lemma is replaced by those digits plus a trailing period.

        Args:
            token (Token): The original token.
        """
        if token.pos_ != "NOUN":
            return

        match = cls._DATE_PATTERN.match(token.lemma_)
        if match is not None:
            token.lemma_ = match.group(1) + "."

    @classmethod
    def _remove_suffix_after_numbers(cls, token: Token) -> None:
        """Removes suffixes after numbers.

        For tokens tagged as NUM whose surface form starts with a number, the lemma is
        set to that leading numeric part of the token text.

        Args:
            token (Token): The original token.
        """
        if token.pos_ != "NUM":
            return

        # The pattern is matched against the surface text, not the predicted lemma.
        match = cls._NUMBER_PATTERN.match(token.text)
        if match is not None:
            token.lemma_ = match.group(0)
|
|
|