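# NOTE: the original first three lines ("Spaces:" / "Runtime error" / "Runtime error")
# were hosting-page header text captured together with the file, not Python code.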
import re
import sys
import textwrap
from typing import Any, Dict, Optional

import regex
# Punctuation characters that are kept by the default whitelist and that
# `Normalizer.text_level_normalizer` pads with spaces.
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language.

    Calling an instance runs, in order: dictionary-based replacement
    (`chars_to_map`), whitelist filtering (`chars_to_preserve`), punctuation
    spacing (`text_level_normalizer`), whitespace collapsing and optional
    lowercasing.
    """

    # Default whitelist: runs of Unicode numbers, letters, marks and the
    # characters in `punctuations`. Written as a raw string so that `\p` is
    # not treated as an (invalid) Python string escape. The `\p{..}` classes
    # need the third-party `regex` module; the stdlib `re` does not know them.
    _whitelist = r"[\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + "]+"
    # Default replacement table (empty: `chars_to_map` is then a no-op).
    _dictionary: Dict[str, str] = {}

    def __init__(
        self,
        whitelist: Optional[str] = None,
        dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        """Create a normalizer.

        Args:
            whitelist (str): Pattern of the character runs to keep. Falls back
                to the class default when empty or not a string.
            dictionary (dict): Mapping of substrings to their replacements.
                Falls back to the class default when empty or not a dict.
        """
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        if dictionary and isinstance(dictionary, dict):
            self.dictionary = dictionary
        else:
            # Copy so instances never share (and accidentally mutate) the
            # class-level default table.
            self.dictionary = dict(self._dictionary)

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, word, and phrase into a proper one.

        Every occurrence of a key of `self.dictionary` is replaced by the
        corresponding value. When several keys match at the same position the
        longest key wins.

        Args:
            sentence (str): A piece of text.

        Returns:
            The text with replacements applied; `sentence` itself, untouched,
            when there is nothing to replace with.
        """
        if not self.dictionary:
            return sentence

        # Regex alternation picks the first alternative that matches, so the
        # longest keys must come first or a short key would shadow every
        # longer key it is a prefix of. Empty keys are skipped: an empty
        # alternative matches at every position.
        keys = [key for key in sorted(self.dictionary, key=len, reverse=True) if key]
        if not keys:
            return sentence

        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps specified characters from sentence.

        Collects every match of `self.whitelist` in the sentence and joins
        the matches with single spaces; everything between matches is dropped.

        Args:
            sentence (str): A piece of text.

        Returns:
            The whitelisted pieces joined by single spaces.

        Raises:
            Exception: Whatever matching or joining raised; it is reported on
                stdout together with the offending whitelist and re-raised.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """A text level of normalization.

        Surrounds every character listed in `punctuations` with spaces and
        strips leading/trailing whitespace from the result.
        """
        text = regex.sub(r"([" + re.escape("".join(punctuations)) + "])", r" \1 ", text)
        text = text.strip()
        return text

    def __call__(
        self,
        text: str,
        do_lowercase: Optional[bool] = False,
    ) -> str:
        """Normalization caller.

        Args:
            text (str): A piece of text.
            do_lowercase (bool): Lowercase the result when true.

        Returns:
            The normalized text with every whitespace run collapsed to a
            single space.
        """
        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text