manifoldix committed on
Commit febb7f1
1 Parent(s): e5a848e

Upload normalizer.py

Files changed (1)
  1. normalizer.py +205 -0
normalizer.py ADDED
@@ -0,0 +1,205 @@
from parsivar import Normalizer

import num2fawords
import re
import string

from dictionary import dictionary_mapping, fixator_dictionary

_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)

# Punctuation, Latin letters/digits, and stray diacritics to strip from sentences.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
]
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape every character so that "-" and similar cannot form accidental
# ranges inside the regex character class.
chars_to_ignore = f"[{re.escape(''.join(chars_to_ignore))}]"
zwnj = "\u200c"  # zero-width non-joiner (half-space)
# Letters that do not join to a following letter, plus half-space and space.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]


def multiple_replace(text, chars_to_mapping):
    # Build a single alternation pattern from all mapping keys and replace
    # every occurrence in one pass.
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

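# A minimal sketch of the behavior (the mapping below is illustrative, not
# part of dictionary_mapping):
#   multiple_replace("a-b", {"a": "x", "-": " "})  # -> "x b"
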

def remove_special_characters(text, chars_to_ignore_regex):
    # Strip ignorable characters, lowercase, and pad with a trailing space.
    text = re.sub(chars_to_ignore_regex, "", text).lower() + " "
    return text


def convert_word_nums_to_text(word):
    # Spell out purely numeric tokens as Persian words; any token that is
    # not an integer passes through unchanged.
    try:
        word = num2fawords.words(int(word))
    except ValueError:
        pass

    return word

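# For example (spelling follows num2fawords' defaults):
#   convert_word_nums_to_text("12")    # -> "دوازده"
#   convert_word_nums_to_text("کتاب")  # -> "کتاب" (unchanged)
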

def normalizer_at_word_level(text):
    words = text.split()
    _text = []

    for word in words:
        word = convert_word_nums_to_text(word)
        # Apply per-word corrections from the lookup table, if present.
        word = fixator_dictionary.get(word, word)

        _text.append(word)

    return " ".join(_text) + " "


def finder(ss, s, starter=False):
    # Locate every occurrence of pattern ss in s: start offsets only when
    # starter is True, otherwise (start, end) spans.
    found = []
    for m in re.finditer(ss, s):
        if starter:
            found.append(m.start())
        else:
            found.append((m.start(), m.end()))

    return found

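# For example:
#   finder("ab", "ab ab")        # -> [(0, 2), (3, 5)]
#   finder("ab", "ab ab", True)  # -> [0, 3]
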

def substring_replace(ss, s, start, end, stripped=True):
    # Replace s[start:end] with ss. When stripped is True, whitespace just
    # before the slice is stripped as well, and counter reports whether a
    # space was dropped (used by the caller for offset bookkeeping).
    s_start = s[:start]
    s_end = s[end:]

    counter = 0
    if stripped:
        counter = 1 if s_start.endswith(" ") else counter
        s_start = s_start.rstrip()

    return s_start + ss + s_end, counter


def normalizer(
    batch,
    is_normalize=True,
    return_dict=True,
    filter_trivials=False,
    remove_extra_space=False
):
    text = batch["sentence"].lower().strip()

    # Parsivar normalizer
    if is_normalize:
        text = _normalizer.normalize(text)

    # Dictionary mapping
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove special characters
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Insert a half-space before a connected آ, unless the preceding
    # character does not join to the left anyway.
    special, pointer = "آ", 0
    for f in sorted(finder(special, text, True)):
        index = f + pointer - 1
        if 0 <= index < len(text):
            if text[index] not in silent_chars:
                text, extra_pointer = substring_replace(
                    f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
                pointer += 1 - extra_pointer

    # Detach connected plural/possessive suffixes built on ها with a half-space.
    special_list = [
        # "ام", "ای", "است", "ایم", "اید", "اند",
        "هایمان", "هایم", "هایت", "هایش",
        "هایتان", "هایشان", "هام", "هات",
        "هاتان", "هامون", "هامان", "هاش",
        "هاتون", "هاشان", "هاشون",
        "هایی", "های", "هاس", "ها"
    ]
    for special in special_list:
        pointer = 0
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                # Rewrite only when the suffix ends the text or is followed
                # by a space.
                if len(text) == (end + 1) or text[end + 1] == " ":
                    text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    pointer += 1 - extra_pointer

    # Detach the افزار suffix (e.g. نرم‌افزار).
    special, pointer = "افزار", 0
    for f in sorted(finder(special, text, False)):
        start, end = f[0] + pointer - 1, f[1] + pointer - 1

        if len(text) >= (end + 1):
            text, extra_pointer = substring_replace(
                f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
            pointer += 1 - extra_pointer

    # Detach the comparative/superlative suffixes تر and ترین.
    special_list = [
        "ترین", "تر"
    ]
    for special in special_list:
        pointer = 0
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                if len(text) == (end + 1) or text[end + 1] == " ":
                    text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    pointer += 1 - extra_pointer

    # Word-level normalization: numbers to words, fixator lookups.
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        text = text.strip() + " "

    # Optionally drop sentences that are too short to be useful.
    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch

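# A minimal usage sketch, assuming the local `dictionary` module (providing
# dictionary_mapping and fixator_dictionary) plus the parsivar and
# num2fawords packages are available; the sample sentence is illustrative.
if __name__ == "__main__":
    sample = {"sentence": "این یک متن نمونه است"}
    print(normalizer(sample, return_dict=True))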