Spaces:

m3hrdadfi
/

zabanshenas

Runtime error

App Files Files Community

zabanshenas / libs /normalizer.py

m3hrdadfi

Update sync_streamlit_to_space.yml

7a6f591 almost 4 years ago

raw

history blame contribute delete

2.49 kB

	import re
	import regex
	import sys
	import textwrap
	from typing import Any, Dict, Optional

	# Punctuation marks that are kept by the default whitelist and padded with
	# spaces by ``Normalizer.text_level_normalizer``. Every entry is a single
	# character; the hyphen is escaped via ``re.escape`` wherever the list is
	# placed inside a character class.
	punctuations = [
	    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
	    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
	    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
	]


	class Normalizer:
	    """A general normalizer for every language.

	    Calling an instance runs, in order: dictionary-based replacement
	    (``chars_to_map``), whitelist filtering (``chars_to_preserve``),
	    punctuation spacing (``text_level_normalizer``), whitespace collapsing
	    and optional lowercasing.

	    Attributes:
	        whitelist: Pattern (for the ``regex`` module) describing the runs of
	            characters that are kept.
	        dictionary: Mapping from a source string to its replacement.
	    """

	    # Default whitelist: Unicode numbers, letters and marks plus the
	    # punctuation list. Raw strings keep the property escapes intact for the
	    # ``regex`` module.
	    _whitelist = r"[" + r"\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + r"]+"
	    # Default (empty) replacement table; used when no dictionary is supplied.
	    _dictionary: Dict[str, str] = {}

	    def __init__(
	        self,
	        whitelist: Optional[str] = None,
	        dictionary: Optional[Dict[str, str]] = None,
	    ) -> None:
	        """Create a normalizer.

	        Args:
	            whitelist: Pattern of characters to keep. Falls back to the class
	                default when it is empty or not a string.
	            dictionary: Replacement table. Falls back to the class default
	                when it is empty or not a dict.
	        """
	        if whitelist and isinstance(whitelist, str):
	            self.whitelist = whitelist
	        else:
	            self.whitelist = self._whitelist

	        if dictionary and isinstance(dictionary, dict):
	            self.dictionary = dictionary
	        else:
	            self.dictionary = self._dictionary

	    def chars_to_map(self, sentence: str) -> str:
	        """Maps every character, word, and phrase into a proper one.

	        Args:
	            sentence (str): A piece of text.

	        Returns:
	            The text with every occurrence of a dictionary key replaced by
	            its value. When the dictionary is empty the input is returned
	            unchanged.
	        """
	        if not self.dictionary:
	            return sentence

	        # Build a real alternation of the escaped keys. The separator must be
	        # a bare "|"; an escaped pipe would make the whole pattern one literal
	        # string. Alternatives are tried in dictionary iteration order.
	        pattern = "|".join(map(re.escape, self.dictionary))
	        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

	    def chars_to_preserve(
	        self,
	        sentence: str,
	    ) -> str:
	        """Keeps specified characters from sentence.

	        Args:
	            sentence (str): A piece of text.

	        Returns:
	            The runs of text matching ``self.whitelist`` joined by a single
	            space.

	        Raises:
	            Exception: Whatever ``regex.findall`` raises (for example on an
	                invalid whitelist pattern); a diagnostic is printed first and
	                the error is re-raised unchanged.
	        """
	        try:
	            tokenized = regex.findall(self.whitelist, sentence)
	            return " ".join(tokenized)
	        except Exception as error:
	            print(
	                textwrap.dedent(
	                    f"""
	                    Bad characters range {self.whitelist},
	                    {error}
	                    """
	                )
	            )
	            raise

	    def text_level_normalizer(self, text: str) -> str:
	        """A text level of normalization.

	        Surrounds every character from ``punctuations`` with spaces and strips
	        leading and trailing whitespace from the result.

	        Args:
	            text (str): A piece of text.
	        """
	        text = regex.sub(r"([" + re.escape("".join(punctuations)) + r"])", r" \1 ", text)
	        return text.strip()

	    def __call__(
	        self,
	        text: str,
	        do_lowercase: Optional[bool] = False
	    ) -> str:
	        """Normalization caller.

	        Args:
	            text (str): A piece of text.
	            do_lowercase: When truthy, the result is lowercased.

	        Returns:
	            The normalized text with whitespace runs collapsed to one space.
	        """
	        text = self.chars_to_map(text)
	        text = self.chars_to_preserve(text)
	        text = self.text_level_normalizer(text)
	        # Padding punctuation can create double spaces; collapse them here.
	        text = re.sub(r"\s+", " ", text)

	        if do_lowercase:
	            text = text.lower()

	        return text