"""Compiled regexes and a punctuation lookup table for text normalization."""

import re
from typing import Dict

# Code points 0-31 and 127-159. None of these characters has a special
# meaning inside a regex character class, so they can be joined unescaped.
_NON_PRINTING_CODE_POINTS = (*range(0, 32), *range(127, 160))

# Matches any single character whose code point is listed above.
non_printing_characters_re: re.Pattern = re.compile(
    "[" + "".join(map(chr, _NON_PRINTING_CODE_POINTS)) + "]"
)

# Matches a single digit character.
digits_re: re.Pattern = re.compile(r"\d")

# Maps one non-ASCII punctuation character to its ASCII replacement string.
#
# Every key is written as an explicit \u escape on purpose: several keys are
# full-width forms that look identical to ASCII once a file passes through
# compatibility normalization (NFKC) or a lossy re-encoding.  If they degrade
# to ASCII, the table silently gains identity entries plus harmful ones
# ("." -> ". " and "1" -> '"').  Escapes keep the keys unambiguous.
unicode_punctuation: Dict[str, str] = {
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "\u201e": '"',  # DOUBLE LOW-9 QUOTATION MARK
    "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
    "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
    "\u00ab": '"',  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\u00bb": '"',  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    # NOTE(review): a digit mapped to a quote looks unintentional; the entry
    # is kept (as the full-width digit, never ASCII "1") - confirm or drop.
    "\uff11": '"',  # FULLWIDTH DIGIT ONE
    "\u300d": '"',  # RIGHT CORNER BRACKET
    "\u300c": '"',  # LEFT CORNER BRACKET
    "\u300a": '"',  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": '"',  # RIGHT DOUBLE ANGLE BRACKET
    "\u00b4": "'",  # ACUTE ACCENT
    "\u2236": ":",  # RATIO
    "\uff1a": ":",  # FULLWIDTH COLON
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff08": "(",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ")",  # FULLWIDTH RIGHT PARENTHESIS
    "\uff1b": ";",  # FULLWIDTH SEMICOLON
    "\u2013": "-",  # EN DASH
    "\u2014": " - ",  # EM DASH
    "\uff0e": ". ",  # FULLWIDTH FULL STOP
    "\uff5e": "~",  # FULLWIDTH TILDE
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u2501": "-",  # BOX DRAWINGS HEAVY HORIZONTAL
    "\u3008": "<",  # LEFT ANGLE BRACKET
    "\u3009": ">",  # RIGHT ANGLE BRACKET
    "\u3010": "[",  # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "]",  # RIGHT BLACK LENTICULAR BRACKET
    "\uff05": "%",  # FULLWIDTH PERCENT SIGN
    "\u25ba": "-",  # BLACK RIGHT-POINTING POINTER
}

# Registry exposing the objects above under their own names.
normalization = {
    "non_printing_characters_re": non_printing_characters_re,
    "digits_re": digits_re,
    "unicode_punctuation": unicode_punctuation,
}