Update spaCy pipeline

7771474 over 1 year ago

8.07 kB

	import os
	import json
	from pathlib import Path
	from typing import Dict, List, Literal, Optional, Union, Iterable
	from typing_extensions import TypedDict, NotRequired

	from spacy.language import Language
	from spacy.pipeline import Pipe
	from spacy.pipeline.lemmatizer import lemmatizer_score
	from spacy.util import ensure_path
	from spacy.tokens import Doc, Token

	# Token attributes compared, in this order, when `match_lemma` narrows
	# down the candidate entries recorded for a form: the part-of-speech tag
	# ("upos") comes first, followed by individual morphological features.
	# Earlier attributes therefore take precedence over later ones.
	MATCH_ORDER = [
	"upos",
	"Tense",
	"VerbForm",
	"Voice",
	"Case",
	"Gender",
	"Number",
	"Degree",
	"Mood",
	"Person",
	"Aspect",
	"Definite",
	"PronType",
	"Polarity",
	"Poss",
	"Reflex",
	]


	class TableEntry(TypedDict):
	form: str
	lemma: str
	upos: str
	frequency: int
	Tense: NotRequired[str]
	VerbForm: NotRequired[str]
	Voice: NotRequired[str]
	Case: NotRequired[str]
	Gender: NotRequired[str]
	Number: NotRequired[str]
	Degree: NotRequired[str]
	Mood: NotRequired[str]
	Person: NotRequired[str]
	Aspect: NotRequired[str]
	Definite: NotRequired[str]
	PronType: NotRequired[str]
	Polarity: NotRequired[str]
	Poss: NotRequired[str]
	Reflex: NotRequired[str]


	# Maps a surface form to every table entry recorded for that form
	# (`match_lemma` looks entries up by the token's "form").
	FrequencyTable = Dict[str, List[TableEntry]]

	# Maps a surface form directly to a single lemma; used as a backoff.
	# The lemmatizer queries it with the lowercased token text.
	LookupTable = Dict[str, str]


	@Language.factory(
	    "frequency_lemmatizer",
	    assigns=["token.lemma"],
	    default_config={"overwrite": True, "fallback_priority": "lookup"},
	    default_score_weights={"lemma_acc": 1.0},
	)
	def make_lemmatizer(
	    nlp: Language,
	    name: str,
	    overwrite: bool,
	    fallback_priority: Literal["lemma", "lookup"],
	):
	    """Build a `FrequencyLemmatizer` for the "frequency_lemmatizer" factory.

	    The arguments are passed straight through to the component's
	    constructor.
	    """
	    component = FrequencyLemmatizer(
	        nlp,
	        name,
	        overwrite=overwrite,
	        fallback_priority=fallback_priority,
	    )  # type: ignore
	    return component


	def max_freq_lemma(entries: List[TableEntry]) -> str:
	    """Return the lemma of the entry with the highest frequency.

	    Parameters
	    ----------
	    entries: list of TableEntry
	        Candidate entries; must contain at least one element.

	    Returns
	    -------
	    str
	        Lemma of the most frequent entry. When several entries share the
	        highest frequency, the one that comes first in the list wins.

	    Raises
	    ------
	    IndexError
	        If `entries` is empty.
	    """
	    if not entries:
	        raise IndexError("max_freq_lemma() requires at least one entry")
	    # max() returns the first maximal element, so ties are resolved in
	    # favour of the earliest entry.
	    most_frequent = max(entries, key=lambda entry: entry["frequency"])
	    return most_frequent["lemma"]


	def match_lemma(
	    token_entry: TableEntry, table: FrequencyTable
	) -> Optional[str]:
	    """Look up the best lemma for a token in the frequency table.

	    The entries stored under the token's form are filtered one property
	    at a time, following MATCH_ORDER. As soon as a property would leave
	    no candidates, filtering stops and the most frequent of the
	    remaining candidates is used.

	    Returns None when the table has no entries for the token's form.
	    """
	    candidates = table.get(token_entry["form"], [])
	    if not candidates:
	        return None
	    for feature in MATCH_ORDER:
	        # A missing feature is treated as an empty value on both sides.
	        wanted = token_entry.get(feature, "")
	        narrowed = [
	            candidate
	            for candidate in candidates
	            if candidate.get(feature, "") == wanted
	        ]
	        if not narrowed:
	            # Nothing agrees on this feature: keep the previous candidates.
	            break
	        candidates = narrowed
	    return max_freq_lemma(entries=candidates)


	def read_json(path: str) -> Dict:
	    """Load and return the JSON document stored at `path`.

	    The file is always decoded as UTF-8 so that tables containing
	    non-ASCII forms load the same way regardless of the platform's
	    default encoding.

	    Raises
	    ------
	    FileNotFoundError
	        If `path` does not exist.
	    """
	    with open(path, encoding="utf-8") as file:
	        return json.load(file)


	def write_json(object: Dict, path: str) -> None:
	    """Serialize `object` as JSON into the file at `path`.

	    An existing file is overwritten.
	    """
	    with open(path, mode="w") as handle:
	        json.dump(object, fp=handle)


	class FrequencyLemmatizer(Pipe):
	    """
	    Part-of-speech, morphology and frequency
	    sensitive rule-based lemmatizer.

	    Parameters
	    ----------
	    nlp: Language
	        Pipeline the component is created for (not stored).
	    name: str, default 'freq_lemmatizer'
	        Name of the component instance.
	    overwrite: bool, default True
	        Specifies whether the frequency lemmatizer should overwrite
	        already assigned lemmas.
	    fallback_priority: 'lemma' or 'lookup', default 'lookup'
	        Specifies which fallback should have higher priority
	        if the lemma is not found in the primary table:
	        the lemma already assigned to the token or the lookup table.
	        The other source is still used when the preferred one
	        has nothing to offer.
	    """

	    def __init__(
	        self,
	        nlp: Language,
	        name: str = "freq_lemmatizer",
	        *,
	        overwrite: bool = True,
	        fallback_priority: Literal["lemma", "lookup"] = "lookup",
	    ):
	        self.name = name
	        self.overwrite = overwrite
	        self.scorer = lemmatizer_score
	        self.fallback_priority = fallback_priority
	        # Start without any data so that the component can be called or
	        # saved before initialize()/from_disk() instead of failing with
	        # an AttributeError; it then relies purely on the backoff.
	        self.table: Optional[FrequencyTable] = None
	        self.lookup: Optional[LookupTable] = None

	    def initialize(
	        self,
	        get_examples=None,
	        *,
	        nlp=None,
	        table: Optional[FrequencyTable] = None,
	        lookup: Optional[LookupTable] = None,
	    ) -> None:
	        """Initializes the frequency lemmatizer from given lemma table and lookup.

	        Any previously set table and lookup are replaced, also when the
	        new value is None.

	        Parameters
	        ----------
	        get_examples: optional
	            Unused by this component.
	        nlp: optional
	            Unused by this component.
	        table: dict of str to list of entries or None, default None
	            Lemma table mapping each form to its entries
	            with pos tags, morph features and frequencies.
	        lookup: dict of str to str or None, default None
	            Backoff lookup table for simple token-lemma lookup.
	        """
	        self.table = table
	        self.lookup = lookup

	    def backoff(self, token: Token) -> str:
	        """Gets the backoff lemma for a token based on priority.

	        Two sources are considered: the lookup table (queried with the
	        lowercased token text) and the lemma already set on the token.
	        An existing lemma only counts when it is set and differs from
	        the token text. The source named by `fallback_priority` is
	        tried first, then the other one; if neither yields anything
	        the token text itself is returned.
	        """
	        orth = token.orth_.lower()
	        lookup = self.lookup
	        in_lookup = (lookup is not None) and (orth in lookup)
	        has_lemma = (token.lemma != 0) and (token.lemma_ != token.orth_)
	        # The lookup wins when it has priority, and also when it is the
	        # only source that actually has something for this token.
	        if in_lookup and (self.fallback_priority == "lookup" or not has_lemma):
	            return lookup[orth]  # type: ignore
	        if has_lemma:
	            return token.lemma_
	        return token.orth_

	    def lemmatize(self, token: Token) -> str:
	        """Lemmatizes token.

	        The frequency table is consulted first, using the lowercased
	        token text, the part-of-speech tag and the morphological
	        features of the token. The backoff is only computed when no
	        table is available or the table has no match.
	        """
	        if self.table is None:
	            return self.backoff(token)
	        # Frequency is only added for type compatibility
	        token_entry: TableEntry = TableEntry(
	            form=token.orth_.lower(),
	            upos=token.pos_,
	            frequency=-1,
	            **token.morph.to_dict(),
	        )
	        lemma = match_lemma(token_entry=token_entry, table=self.table)
	        if lemma is None:
	            return self.backoff(token)
	        return lemma

	    def __call__(self, doc: Doc) -> Doc:
	        """Apply the lemmatization to a document.

	        Tokens that already have a lemma are only touched when
	        `overwrite` is enabled. Any exception is passed on to the
	        component's error handler.
	        """
	        error_handler = self.get_error_handler()
	        try:
	            for token in doc:
	                if self.overwrite or token.lemma == 0:
	                    token.lemma_ = self.lemmatize(token)
	            return doc
	        except Exception as e:
	            error_handler(self.name, self, [doc], e)

	    def to_disk(
	        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
	    ):
	        """Save frequency lemmatizer data to a directory.

	        Writes `config.json` and, when present, `table.json` and
	        `lookup.json`. The directory is created if needed.
	        `exclude` is accepted but currently ignored.
	        """
	        path = ensure_path(path)
	        Path(path).mkdir(parents=True, exist_ok=True)
	        config = {
	            "overwrite": self.overwrite,
	            "fallback_priority": self.fallback_priority,
	        }
	        write_json(config, path=os.path.join(path, "config.json"))
	        if self.table is not None:
	            write_json(self.table, path=os.path.join(path, "table.json"))
	        if self.lookup is not None:
	            write_json(self.lookup, path=os.path.join(path, "lookup.json"))

	    def from_disk(
	        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
	    ) -> "FrequencyLemmatizer":
	        """Load component from disk.

	        `config.json` is required; settings missing from it keep their
	        current value. `table.json` and `lookup.json` are optional and
	        result in None when absent. `exclude` is accepted but
	        currently ignored.
	        """
	        path = ensure_path(path)
	        config = read_json(os.path.join(path, "config.json"))
	        self.overwrite = config.get("overwrite", self.overwrite)
	        self.fallback_priority = config.get(
	            "fallback_priority", self.fallback_priority
	        )
	        try:
	            table: Optional[FrequencyTable] = read_json(
	                os.path.join(path, "table.json")
	            )
	        except FileNotFoundError:
	            table = None
	        try:
	            lookup: Optional[LookupTable] = read_json(
	                os.path.join(path, "lookup.json")
	            )
	        except FileNotFoundError:
	            lookup = None
	        self.initialize(table=table, lookup=lookup)
	        return self