File size: 2,487 Bytes
7a6f591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import regex
import sys
import textwrap
from typing import Any, Dict, Optional

# Punctuation marks recognised by the normalizer: the ASCII punctuation set
# plus guillemets and curly double quotes. Order is kept stable because the
# list is joined into regex character classes elsewhere in this module.
punctuations = list(
    "!\"#$%&'()*+,."
    "/:;<=>?@[\\]^_"
    "`{|}~»«“”-"
)


class Normalizer:
    """A general normalizer for every language.

    Calling an instance runs, in order: dictionary substitution
    (``chars_to_map``), whitelist filtering (``chars_to_preserve``),
    punctuation spacing (``text_level_normalizer``), whitespace collapsing
    and, optionally, lowercasing.
    """

    # Default whitelist: runs of Unicode numbers, letters, combining marks and
    # the module-level punctuation marks. Written as a raw string so that
    # "\p" is not parsed as an (invalid) Python escape sequence. The \p{..}
    # property syntax is why the third-party ``regex`` module is used to
    # apply it.
    _whitelist = r"[\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + "]+"

    # Default (empty) substitution table. Copied per instance in __init__ so
    # that instances never share one mutable dict.
    _dictionary: Dict[str, str] = {}

    # Matches any single punctuation mark; compiled once because it never
    # changes between calls.
    _punctuation_pattern = regex.compile(
        r"([" + re.escape("".join(punctuations)) + "])"
    )

    def __init__(
            self,
            whitelist: Optional[str] = None,
            dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        """Configure the normalizer.

        Args:
            whitelist: Regex pattern of character runs to keep. Falls back to
                the class default when empty or not a string.
            dictionary: Mapping of substrings to their replacements. Falls
                back to a copy of the class default when empty or not a dict.
        """
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        # Copy the class-level default so mutating one instance's dictionary
        # cannot leak into other instances.
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else dict(self._dictionary)

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, word, and phrase into a proper one.

        Args:
            sentence (str): A piece of text.

        Returns:
            The text with every occurrence of a dictionary key replaced by
            its value, or ``sentence`` unchanged when there is nothing to map.
        """
        # Regex alternation is ordered, so try longer keys first; otherwise a
        # short key that is a prefix of a longer one would always win.
        # Empty keys are skipped because they would match at every position.
        keys = sorted((key for key in self.dictionary if key), key=len, reverse=True)
        if not keys:
            return sentence

        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
            self,
            sentence: str,
    ) -> str:
        """Keeps specified characters from sentence

        Args:
            sentence (str): A piece of text.

        Returns:
            The whitelist matches found in ``sentence``, joined by single
            spaces.

        Raises:
            Exception: Any error raised while applying the whitelist is
                reported on stdout and then re-raised unchanged.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """A text level of normalization.

        Surrounds every punctuation mark with spaces and strips leading and
        trailing whitespace from the result.

        Args:
            text (str): A piece of text.
        """
        return self._punctuation_pattern.sub(r" \1 ", text).strip()

    def __call__(
            self,
            text: str,
            do_lowercase: Optional[bool] = False
    ) -> str:
        """Normalization caller.

        Args:
            text (str): A piece of text.
            do_lowercase: When truthy, lowercase the normalized text.

        Returns:
            The normalized text with whitespace runs collapsed to one space.
        """
        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        # Punctuation spacing can create runs of spaces; collapse them.
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text