import json
import os
from pathlib import Path
from typing import Dict, Iterable, List, Literal, Optional, Union

from typing_extensions import NotRequired, TypedDict

from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.pipeline.lemmatizer import lemmatizer_score
from spacy.tokens import Doc, Token
from spacy.util import ensure_path

# Universal POS tag and morphological features matched against the frequency
# table, ordered from most to least important.
MATCH_ORDER = [
    "upos",
    "Tense",
    "VerbForm",
    "Voice",
    "Case",
    "Gender",
    "Number",
    "Degree",
    "Mood",
    "Person",
    "Aspect",
    "Definite",
    "PronType",
    "Polarity",
    "Poss",
    "Reflex",
]


class TableEntry(TypedDict):
    form: str
    lemma: str
    upos: str
    frequency: int
    Tense: NotRequired[str]
    VerbForm: NotRequired[str]
    Voice: NotRequired[str]
    Case: NotRequired[str]
    Gender: NotRequired[str]
    Number: NotRequired[str]
    Degree: NotRequired[str]
    Mood: NotRequired[str]
    Person: NotRequired[str]
    Aspect: NotRequired[str]
    Definite: NotRequired[str]
    PronType: NotRequired[str]
    Polarity: NotRequired[str]
    Poss: NotRequired[str]
    Reflex: NotRequired[str]


# Maps each surface form to all table entries observed for that form.
FrequencyTable = Dict[str, List[TableEntry]]
# Simple backoff table mapping lowercased surface forms to lemmas.
LookupTable = Dict[str, str]


@Language.factory(
    "frequency_lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "overwrite": True,
        "fallback_priority": "lookup",
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    name: str,
    overwrite: bool,
    fallback_priority: Literal["lemma", "lookup"],
):
    return FrequencyLemmatizer(
        nlp=nlp,
        name=name,
        overwrite=overwrite,
        fallback_priority=fallback_priority,
    )  # type: ignore


def max_freq_lemma(entries: List[TableEntry]) -> str:
    """Returns the lemma with the highest frequency among the given entries."""
    max_index = 0
    for index in range(1, len(entries)):
        if entries[index]["frequency"] > entries[max_index]["frequency"]:
            max_index = index
    return entries[max_index]["lemma"]


def match_lemma(token_entry: TableEntry, table: FrequencyTable) -> Optional[str]:
    """Returns a lemma for a token if its form can be found in the frequency
    table, otherwise None.
    """
    # Collect all entries associated with the token's surface form.
    match = table.get(token_entry["form"], [])
    if not match:
        return None
    # Narrow down the candidates one feature at a time. As soon as a feature
    # would eliminate every candidate, return the most frequent remaining one.
    for match_property in MATCH_ORDER:
        match_new = [
            entry
            for entry in match
            if entry.get(match_property, "") == token_entry.get(match_property, "")
        ]
        if not match_new:
            return max_freq_lemma(entries=match)
        match = match_new
    return max_freq_lemma(entries=match)


def read_json(path: Union[str, Path]) -> Dict:
    with open(path) as file:
        return json.load(file)


def write_json(obj: Dict, path: Union[str, Path]) -> None:
    with open(path, "w") as file:
        json.dump(obj, file)


class FrequencyLemmatizer(Pipe):
    """Rule-based lemmatizer that is sensitive to part of speech, morphology
    and corpus frequency.

    Parameters
    ----------
    overwrite: bool, default True
        Specifies whether the frequency lemmatizer should overwrite already
        assigned lemmas.
    fallback_priority: 'lemma' or 'lookup', default 'lookup'
        Specifies which fallback should have higher priority if the lemma is
        not found in the primary table.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "freq_lemmatizer",
        *,
        overwrite: bool = True,
        fallback_priority: Literal["lemma", "lookup"] = "lookup",
    ):
        self.name = name
        self.overwrite = overwrite
        self.scorer = lemmatizer_score
        self.fallback_priority = fallback_priority
        # The tables are populated by initialize() or from_disk().
        self.table: Optional[FrequencyTable] = None
        self.lookup: Optional[LookupTable] = None

    def initialize(
        self,
        get_examples=None,
        *,
        nlp=None,
        table: Optional[FrequencyTable] = None,
        lookup: Optional[LookupTable] = None,
    ) -> None:
        """Initializes the frequency lemmatizer from the given lemma table
        and lookup table.

        Parameters
        ----------
        table: dict of str to list of entries or None, default None
            Frequency table mapping each surface form to its entries, each
            carrying a lemma, a POS tag, morphological features and a
            corpus frequency.
        lookup: dict of str to str or None, default None
            Backoff lookup table for simple token-to-lemma lookup.
        """
        self.table = table
        self.lookup = lookup

    def backoff(self, token: Token) -> str:
        """Gets the backoff lemma for a token based on the fallback priority."""
        orth = token.orth_.lower()
        lookup = self.lookup
        in_lookup = (lookup is not None) and (orth in lookup)
        has_lemma = (token.lemma != 0) and (token.lemma_ != token.orth_)
        # Consult the two fallbacks in the configured priority order and
        # return the surface form if neither of them applies.
        if in_lookup and self.fallback_priority == "lookup":
            return lookup[orth]  # type: ignore
        if has_lemma:
            return token.lemma_
        if in_lookup:
            return lookup[orth]  # type: ignore
        return token.orth_

    def lemmatize(self, token: Token) -> str:
        """Lemmatizes a single token."""
        backoff = self.backoff(token)
        orth = token.orth_.lower()
        # If no frequency table is loaded, fall back immediately.
        if self.table is None:
            return backoff
        # The dummy frequency is only added for type compatibility.
        token_entry: TableEntry = TableEntry(
            form=orth, upos=token.pos_, frequency=-1, **token.morph.to_dict()
        )
        lemma = match_lemma(token_entry=token_entry, table=self.table)
        return backoff if lemma is None else lemma

    def __call__(self, doc: Doc) -> Doc:
        """Applies lemmatization to a document."""
        error_handler = self.get_error_handler()
        try:
            for token in doc:
                if self.overwrite or token.lemma == 0:
                    token.lemma_ = self.lemmatize(token)
            return doc
        except Exception as e:
            error_handler(self.name, self, [doc], e)

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ):
        """Saves the frequency lemmatizer's data to a directory."""
        path = ensure_path(path)
        path.mkdir(parents=True, exist_ok=True)
        config = dict(
            overwrite=self.overwrite,
            fallback_priority=self.fallback_priority,
        )
        write_json(config, path=os.path.join(path, "config.json"))
        if self.table is not None:
            write_json(self.table, path=os.path.join(path, "table.json"))
        if self.lookup is not None:
            write_json(self.lookup, path=os.path.join(path, "lookup.json"))

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ) -> "FrequencyLemmatizer":
        """Loads the component's data from a directory."""
        path = ensure_path(path)
        config = read_json(os.path.join(path, "config.json"))
        self.overwrite = config.get("overwrite", self.overwrite)
        self.fallback_priority = config.get(
            "fallback_priority", self.fallback_priority
        )
        try:
            table: Optional[FrequencyTable] = read_json(
                os.path.join(path, "table.json")
            )
        except FileNotFoundError:
            table = None
        try:
            lookup: Optional[LookupTable] = read_json(
                os.path.join(path, "lookup.json")
            )
        except FileNotFoundError:
            lookup = None
        self.initialize(table=table, lookup=lookup)
        return self
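

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the component itself). The pipe name comes
# from the factory registered above, but the language code, the toy frequency
# table and the toy lookup below are made-up example data, not shipped
# resources; a real table would be built from a lemma-annotated corpus.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("en")
    lemmatizer = nlp.add_pipe("frequency_lemmatizer")

    # Hypothetical table: each surface form maps to all entries seen for it.
    toy_table: FrequencyTable = {
        "running": [
            {"form": "running", "lemma": "run", "upos": "VERB", "frequency": 120},
            {"form": "running", "lemma": "running", "upos": "NOUN", "frequency": 30},
        ],
    }
    # Hypothetical backoff lookup used when a form is missing from the table.
    toy_lookup: LookupTable = {"ran": "run"}
    lemmatizer.initialize(table=toy_table, lookup=toy_lookup)

    doc = nlp("running ran")
    # "running" is resolved through the frequency table, "ran" through the
    # backoff lookup.
    print([(token.orth_, token.lemma_) for token in doc])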