Upload normalizer.py
Browse files- normalizer.py +203 -0
normalizer.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from parsivar import Normalizer
|
2 |
+
|
3 |
+
import num2fawords
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
|
7 |
+
|
8 |
+
# Parsivar normalizer instance, configured to use ZWNJ (U+200C) as the
# half-space character and with statistical space correction enabled.
_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)
# Characters to strip from transcripts: Latin and Persian punctuation,
# quote variants, stray diacritics, and replacement characters.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
]
# Also drop all ASCII lowercase letters and digits.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Collapse the list into a single regex character class, e.g. "[,?.!...]".
# NOTE(review): members are not re.escape'd, so a "-" in the middle of the
# class forms a character range rather than a literal hyphen — confirm this
# over-matching is acceptable for the intended cleanup.
chars_to_ignore = f"""[{"".join(chars_to_ignore)}]"""
# Zero-width non-joiner ("half-space") used throughout the suffix repair.
zwnj = "\u200c"
# Characters that do not join to a following letter in Persian script
# (plus ZWNJ and space); normalizer() checks the character before an "آ"
# against this list to decide whether a ZWNJ must be inserted.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
|
19 |
+
|
20 |
+
|
21 |
+
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of the mapping's keys in *text* in one pass.

    Keys are matched literally (they are ``re.escape``'d before being joined
    into an alternation), so regex metacharacters in keys are safe.

    Args:
        text: Input value; coerced to ``str`` before substitution.
        chars_to_mapping: Dict mapping source substrings to replacements.

    Returns:
        The text with all mapped substrings replaced.
    """
    # Guard: an empty mapping would produce an empty alternation pattern
    # that matches at every position and then raises KeyError inside the
    # replacement callback.  Return the (stringified) input unchanged.
    if not chars_to_mapping:
        return str(text)
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
|
24 |
+
|
25 |
+
|
26 |
+
def remove_special_characters(text, chars_to_ignore_regex):
    """Delete every character matched by *chars_to_ignore_regex*, lowercase
    the remainder, and append a single trailing space.

    Args:
        text: Input string.
        chars_to_ignore_regex: Regex (typically a character class) whose
            matches are removed.

    Returns:
        The cleaned, lowercased text with one trailing space.
    """
    cleaned = re.sub(chars_to_ignore_regex, "", text)
    return f"{cleaned.lower()} "
|
29 |
+
|
30 |
+
|
31 |
+
def convert_word_nums_to_text(word):
|
32 |
+
try:
|
33 |
+
word = int(word)
|
34 |
+
word = num2fawords.words(word)
|
35 |
+
except:
|
36 |
+
word = word
|
37 |
+
|
38 |
+
return word
|
39 |
+
|
40 |
+
|
41 |
+
def normalizer_at_word_level(text):
    """Apply per-token fixes: spell out integer tokens as Persian words and
    remap tokens found in the module-level ``fixator_dictionary``.

    Args:
        text: Whitespace-separated text.

    Returns:
        The fixed tokens joined by single spaces, with one trailing space.
    """
    fixed_tokens = []
    for token in text.split():
        token = convert_word_nums_to_text(token)
        # Fall back to the token itself when it has no dictionary entry.
        fixed_tokens.append(fixator_dictionary.get(token, token))
    return " ".join(fixed_tokens) + " "
|
52 |
+
|
53 |
+
|
54 |
+
def finder(ss, s, starter=False):
    """Find all occurrences of the literal substring *ss* in *s*.

    The needle is ``re.escape``'d so regex metacharacters are matched
    literally (previously "." or "?" in the needle would have been
    interpreted as regex syntax).

    Args:
        ss: Substring to search for (treated literally).
        s: Text to search in.
        starter: If True, return only match start offsets; otherwise
            return ``(start, end)`` spans.

    Returns:
        A list of ints (``starter=True``) or ``(start, end)`` tuples,
        in left-to-right order.
    """
    pattern = re.escape(ss)
    if starter:
        return [m.start() for m in re.finditer(pattern, s)]
    return [(m.start(), m.end()) for m in re.finditer(pattern, s)]
|
63 |
+
|
64 |
+
|
65 |
+
def substring_replace(ss, s, start, end, stripped=True):
    """Splice *ss* into *s* in place of the half-open span ``[start, end)``.

    When *stripped* is True, trailing whitespace immediately before the
    span is removed, and the returned counter is 1 if the prefix ended in
    a space (0 otherwise) so callers can adjust their running offsets.

    Args:
        ss: Replacement string.
        s: Original string.
        start: Start index of the span to replace.
        end: End index (exclusive) of the span to replace.
        stripped: Whether to strip whitespace off the end of the prefix.

    Returns:
        Tuple of (new string, number of removed-space adjustments).
    """
    head = s[:start]
    tail = s[end:]

    removed = 0
    if stripped:
        if head.endswith(" "):
            removed = 1
        head = head.rstrip()

    return head + ss + tail, removed
|
75 |
+
|
76 |
+
|
77 |
+
def normalizer(
    batch,
    is_normalize=True,
    return_dict=True,
    filter_trivials=False,
    remove_extra_space=False
):
    """Full normalization pipeline for one sample's "sentence" field.

    Steps: lowercase/strip -> optional Parsivar normalization -> character
    dictionary mapping -> removal of ignored characters -> ZWNJ
    (half-space) repair around connected "آ", "ها…" suffixes, "افزار"
    compounds and "تر/ترین" suffixes -> word-level fixes (numbers to
    Persian words, fixator dictionary) -> final whitespace handling.

    Args:
        batch: Dict with a "sentence" key holding the raw text.
        is_normalize: Run the Parsivar normalizer first.
        return_dict: If True, write the result into ``batch["sentence"]``
            and return the batch; otherwise return the text alone.
        filter_trivials: Replace results of length <= 2 with ``None``.
        remove_extra_space: Strip the result instead of keeping one
            trailing space.

    Returns:
        The updated batch dict, or the normalized text (possibly ``None``)
        when ``return_dict`` is False.
    """
    text = batch["sentence"].lower().strip()

    # Parsivar normalizer (half-space / statistical space correction).
    if is_normalize:
        text = _normalizer.normalize(text)

    # Dictionary mapping (module-level `dictionary_mapping`, defined
    # elsewhere in this file), then collapse runs of spaces.
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove ignored characters (punctuation, ASCII letters/digits, ...).
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Replace connected آ: for each occurrence of "آ", look at the character
    # just before it and, unless it is non-joining (silent_chars), rewrite
    # that character as itself + ZWNJ.  `pointer` tracks the net length
    # change from earlier splices so later match offsets stay valid.
    special, pointer = "آ", int("0")
    for f in sorted(finder(special, text, True)):
        index = f + pointer - 1
        # NOTE(review): this guard allows index == len(text) (which would
        # raise IndexError below) and index == -1 when the match is at
        # position 0 (which silently inspects the LAST character) — confirm
        # both edge cases are intended.
        if len(text) >= index:
            if text[index] not in silent_chars:
                new_text, extra_pointer = substring_replace(
                    f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
                text = new_text
                # Net change: +1 for the inserted ZWNJ, minus 1 when a
                # trailing space was stripped (1 + 1 - 1 == 1).
                pointer += 1 + 1 - 1 - extra_pointer

    # Replace connected ها: detach plural/possessive suffixes built on "ها"
    # by inserting a ZWNJ before the suffix.  Each suffix form in the list
    # gets its own pass over the text, with its own offset counter.
    pointer = int("0")
    special_list = [
        # "ام", "ای", "است", "ایم", "اید", "اند",
        "هایمان", "هایم", "هایت", "هایش",
        "هایتان", "هایشان", "هام", "هات",
        "هاتان", "هامون", "هامان", "هاش",
        "هاتون", "هاشان", "هاشون",
        "هایی", "های", "هاس", "ها"
    ]
    for special in special_list:
        pointer = 0
        text = text  # no-op, kept as-is
        for f in sorted(finder(special, text, False)):
            # Shift the recorded match span by the net length change so far.
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                if len(text) == (end + 1):
                    # Suffix sits at the very end of the text.
                    new_text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    text = new_text
                    pointer += 1 + 1 - 1 - extra_pointer
                else:
                    # Mid-text: only treat as a suffix when a space follows,
                    # i.e. the match ends a word.
                    if text[end + 1] == " ":
                        new_text, extra_pointer = substring_replace(
                            f"{zwnj}{special}",
                            text,
                            start + 1,
                            end + 1,
                            stripped=True)
                        text = new_text
                        pointer += 1 + 1 - 1 - extra_pointer

    # Re-attach "افزار" compounds (e.g. نرم افزار -> نرم‌افزار) with a ZWNJ;
    # unlike the suffix passes, no trailing-space check is applied here.
    special, pointer = "افزار", int("0")
    for f in sorted(finder(special, text, False)):
        start, end = f[0] + pointer - 1, f[1] + pointer - 1

        if len(text) >= (end + 1):
            new_text, extra_pointer = substring_replace(f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
            text = new_text
            pointer += 1 + 1 - 1 - extra_pointer

    # Same suffix-detachment pass for comparative/superlative "ترین"/"تر".
    pointer = int("0")
    special_list = [
        "ترین", "تر"
    ]
    for special in special_list:
        pointer = 0
        text = text  # no-op, kept as-is
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                if len(text) == (end + 1):
                    # Suffix at the very end of the text.
                    new_text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    text = new_text
                    pointer += 1 + 1 - 1 - extra_pointer
                else:
                    # Mid-text: require a following space.
                    if text[end + 1] == " ":
                        new_text, extra_pointer = substring_replace(
                            f"{zwnj}{special}",
                            text,
                            start + 1,
                            end + 1,
                            stripped=True)
                        text = new_text
                        pointer += 1 + 1 - 1 - extra_pointer

    # Normalizer at word level: numbers -> Persian words, fixator mapping.
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        text = text.strip() + " "

    # Optionally drop near-empty results (length <= 2, incl. the trailing
    # space added above).
    if filter_trivials:
        if not len(text) > 2:
            text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
|
203 |
+
|