import re
import string


chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
]
# Escape each character so that regex metacharacters such as "-" are matched
# literally inside the character class (an unescaped join lets "!-;" act as a
# character range, which would also strip digits).
chars_to_ignore_regex = f"[{re.escape(''.join(chars_to_ignore))}]"

# Zero-width characters and direction marks are replaced with a plain space.
dictionary_mapping = {
    "\u200c": " ",  # zero-width non-joiner
    "\u200d": " ",  # zero-width joiner
    "\u200e": " ",  # left-to-right mark
    "\u200f": " ",  # right-to-left mark
    "\ufeff": " ",  # zero-width no-break space / BOM
    "\u0307": " ",  # combining dot above
}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of chars_to_mapping with its value."""
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))


def remove_special_characters(text, chars_to_ignore_regex):
    """Strip the ignored punctuation, lowercase, and append a trailing space."""
    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
    return text


def normalizer_at_word_level(text):
    """Hook for word-level normalization; currently a pass-through."""
    words = text.split()
    _text = []

    for word in words:
        # Per-word rules would be applied here; for now every word is kept unchanged.
        _text.append(word)

    return " ".join(_text) + " "

def normalizer(batch, return_dict=True, filter_trivials=False, remove_extra_space=False):
    text = batch["sentence"].lower().strip()

    # Dictionary mapping: replace zero-width characters and direction marks with spaces
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove special characters
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)

    # Normalizer at word level
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        text = text.strip() + " "

    # Drop sentences that are too short to be useful
    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
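

# Minimal usage sketch (the sample sentence below is illustrative, not from the
# original file): normalizer expects a dict with a "sentence" key, the
# per-example format a Common Voice-style dataset would provide, e.g. when
# passed through datasets.Dataset.map(normalizer). It returns the cleaned dict,
# or just the string when return_dict=False.
if __name__ == "__main__":
    sample = {"sentence": "سلام، دنیا!  چطوری؟"}
    print(normalizer(dict(sample)))  # -> {'sentence': 'سلام دنیا چطوری '}
    print(normalizer(dict(sample), return_dict=False, remove_extra_space=True))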