Abdul-Ib committed
Commit 28a38bd
1 Parent(s): 58662dd

Create normalizer.py

Files changed (1): normalizer.py +468 -0
normalizer.py ADDED
@@ -0,0 +1,468 @@
+ import asyncio
+ import re
+ import string
+
+ import pandas as pd
+ from aiogoogletrans import Translator
+ from nltk.tokenize import RegexpTokenizer
+ from spellchecker import SpellChecker
+
+
+ class Normalizer:
+     """
+     A class for text normalization tasks such as converting to lowercase,
+     removing whitespace, punctuation, HTML tags, emojis, etc.
+     """
+
+     def __init__(self):
+         """
+         Initializes the Normalizer object.
+         """
+
+         # Letter variations dictionary: each key groups a letter with its
+         # accented variants, each value is the replacement to use
+         self._letter_variations = {
+             "aàáâãäåāăą": "a",
+             "cçćĉċč": "c",
+             "eèéêëēĕėęě": "e",
+             "gğ": "g",
+             "hħĥ": "h",
+             "iìíîïīĭįı": "i",
+             "jĵ": "j",
+             "nñńņň": "n",
+             "oòóôõöøōŏő": "o",
+             "ś": "s",
+             "ß": "ss",
+             "uùúûüūŭůűų": "u",
+             "yýÿŷ": "y",
+             "æ": "ae",
+             "œ": "oe",
+         }
+
+         # Generate a regex pattern matching any single character variant
+         pattern_parts = []
+         for variation in self._letter_variations.keys():
+             for char in variation:
+                 pattern_parts.append(re.escape(char))
+
+         self._pattern = "|".join(pattern_parts)
+
+         # RegexpTokenizer
+         self._regexp = RegexpTokenizer(r"[\w']+")
+
+         # Dictionary of acronyms
+         acronyms_url = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json"
+         self._acronyms_dict = pd.read_json(acronyms_url, typ="series")
+         self._acronyms_list = list(self._acronyms_dict.keys())
+
+         # Dictionary of contractions
+         contractions_url = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json"
+         self._contractions_dict = pd.read_json(contractions_url, typ="series")
+         self._contractions_list = list(self._contractions_dict.keys())
+
+         # Initialize translator for language detection and translation
+         self._translator = Translator()
+
+     # Converting to lowercase
+     def _convert_to_lowercase(self, text):
+         """
+         Convert the input text to lowercase.
+
+         Args:
+             text (str): The input text to be converted.
+
+         Returns:
+             str: The input text converted to lowercase.
+         """
+         try:
+             return text.lower()
+         except Exception as e:
+             print(f"An error occurred during lowercase conversion: {e}")
+             return text
+
+     # Removing whitespace
+     def _remove_whitespace(self, text):
+         """
+         Remove leading and trailing whitespace from the input text.
+
+         Args:
+             text (str): The input text to be processed.
+
+         Returns:
+             str: The input text with leading and trailing whitespace removed.
+         """
+         try:
+             return text.strip()
+         except Exception as e:
+             print(f"An error occurred during whitespace removal: {e}")
+             return text
+
+     # Removing punctuation
+     def _remove_punctuation(self, text):
+         """
+         Remove punctuation marks from the input text, except for apostrophes and percent signs.
+
+         Args:
+             text (str): The input text to be processed.
+
+         Returns:
+             str: The input text with punctuation marks removed.
+         """
+         try:
+             punct_str = string.punctuation
+             # Keep apostrophes (so contractions stay intact) and percent signs
+             punct_str = punct_str.replace("'", "").replace("%", "")
+             return text.translate(str.maketrans("", "", punct_str))
+         except Exception as e:
+             print(f"An error occurred during punctuation removal: {e}")
+             return text
+
+     # Removing HTML tags
+     def _remove_html(self, text):
+         """
+         Remove HTML tags from the input text.
+
+         Args:
+             text (str): The input text containing HTML tags.
+
+         Returns:
+             str: The input text with HTML tags removed.
+         """
+         try:
+             html = re.compile(r"<.*?>")
+             return html.sub(r"", text)
+         except Exception as e:
+             print(f"An error occurred during HTML tag removal: {e}")
+             return text
+
+     # Removing emojis
+     def _remove_emoji(self, text):
+         """
+         Remove emojis from the input text.
+
+         Args:
+             text (str): The input text containing emojis.
+
+         Returns:
+             str: The input text with emojis removed.
+         """
+         try:
+             emoji_pattern = re.compile(
+                 "["
+                 "\U0001F600-\U0001F64F"  # emoticons
+                 "\U0001F300-\U0001F5FF"  # symbols & pictographs
+                 "\U0001F680-\U0001F6FF"  # transport & map symbols
+                 "\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                 "\U00002702-\U000027B0"
+                 "\U000024C2-\U0001F251"
+                 "]+",
+                 flags=re.UNICODE,
+             )
+             return emoji_pattern.sub(r"", text)
+         except Exception as e:
+             print(f"An error occurred during emoji removal: {e}")
+             return text
+
+     # Removing HTTP links
+     def _remove_http(self, text):
+         """
+         Remove HTTP links from the input text.
+
+         Args:
+             text (str): The input text containing HTTP links.
+
+         Returns:
+             str: The input text with HTTP links removed.
+         """
+         try:
+             # Match URLs beginning with http(s):// or www.
+             http = r"https?://\S+|www\.\S+"
+             pattern = r"({})".format(http)
+             return re.sub(pattern, "", text)
+         except Exception as e:
+             print(f"An error occurred during HTTP link removal: {e}")
+             return text
+
+     # Function to convert acronyms in a text
+     def _convert_acronyms(self, text):
+         """
+         Convert acronyms in the text.
+
+         Example of acronyms dictionary:
+         {"LOL": "laugh out loud", "BRB": "be right back", "IDK": "I don't know"}
+
+         Args:
+             text (str): The input text containing acronyms.
+
+         Returns:
+             str: The input text with acronyms expanded.
+         """
+         try:
+             words = []
+             for word in self._regexp.tokenize(text):
+                 if word in self._acronyms_list:
+                     words = words + self._acronyms_dict[word].split()
+                 else:
+                     words = words + word.split()
+
+             text_converted = " ".join(words)
+             return text_converted
+         except Exception as e:
+             print(f"An error occurred during acronym conversion: {e}")
+             return text
+
+     # Function to convert contractions in a text
+     def _convert_contractions(self, text):
+         """
+         Convert contractions in the text.
+
+         Example of contractions dictionary:
+         {"I'm": "I am", "he's": "he is", "won't": "will not"}
+
+         Args:
+             text (str): The input text containing contractions.
+
+         Returns:
+             str: The input text with contractions expanded.
+         """
+         try:
+             words = []
+             for word in self._regexp.tokenize(text):
+                 if word in self._contractions_list:
+                     words = words + self._contractions_dict[word].split()
+                 else:
+                     words = words + word.split()
+
+             text_converted = " ".join(words)
+             return text_converted
+         except Exception as e:
+             print(f"An error occurred during contraction conversion: {e}")
+             return text
+
+     def _fix_letter_variations(self, query):
+         """
+         Replace variations of letters with their original counterparts.
+
+         Args:
+             query (str): The input query containing variations of letters.
+
+         Returns:
+             str: The normalized query with variations replaced by their original counterparts.
+         """
+
+         def replace_variation(match):
+             """
+             Helper function to replace variations with their original counterparts.
+
+             Args:
+                 match (re.Match): The match object representing the found variation.
+
+             Returns:
+                 str: The replacement letter if the matched character appears in a
+                 letter_variations key, otherwise the matched character unchanged.
+             """
+             for key in self._letter_variations.keys():
+                 if match.group(0) in key:
+                     return self._letter_variations[key]
+             return match.group(0)
+
+         try:
+             # Fixing the query
+             normalized_query = re.sub(self._pattern, replace_variation, query)
+             return normalized_query
+         except Exception as e:
+             print(f"An error occurred during letter variation fixing: {e}")
+             return query
+
+     def _normalize_query(self, word: str):
+         """
+         Clean the input text by performing the following steps:
+         1. Remove characters other than Latin letters, Arabic letters, digits,
+            whitespace, and a few allowed symbols (- % * . $ and German umlauts/ß).
+         2. Remove non-alphabetic characters sandwiched between letters.
+         3. Collapse runs of a repeated character down to two occurrences.
+         4. Strip Arabic letters attached to Latin words and vice versa.
+         5. Add a space between numbers and letters (e.g. 123phone -> 123 phone).
+         6. Remove extra spaces.
+
+         Args:
+             word (str): The input text to be cleaned.
+
+         Returns:
+             str: The cleaned text.
+         """
+         try:
+             # Remove all characters except Latin letters, digits, whitespace, dashes,
+             # asterisks, %, ., $, Arabic characters, and German umlauts/ß
+             word = re.sub(
+                 r"[^A-Za-z\s\-%*.$\u0621-\u064A0-9\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df]",
+                 "",
+                 word,
+                 flags=re.UNICODE,
+             )
+
+             # Remove characters that are neither Latin nor Arabic letters between Latin letters
+             clean_text = re.sub(
+                 r"(?<=[a-zA-Z])([^A-Za-z\u0621-\u064A\s]+)(?=[a-zA-Z])", "", word
+             )
+             # Remove any remaining non-Latin characters between Latin letters
+             clean_text = re.sub(r"(?<=[a-zA-Z])([^A-Za-z\s]+)(?=[a-zA-Z])", "", clean_text)
+             # Remove non-Arabic characters between Arabic characters
+             clean_text = re.sub(
+                 r"(?<=[\u0621-\u064A])([^\u0621-\u064A\s]+)(?=[\u0621-\u064A])",
+                 "",
+                 clean_text,
+             )
+
+             # Collapse repeated characters down to two occurrences
+             clean_text = re.sub(r"(.)(\1+)", r"\1\1", clean_text)
+
+             # Drop Arabic letters preceding a Latin word (e.g. صصphone -> phone)
+             clean_text = re.sub(r"([\u0621-\u064A]+)([a-zA-Z]+)", r"\2", clean_text)
+             # Drop Arabic letters following a Latin word (e.g. phoneصص -> phone)
+             clean_text = re.sub(r"([a-zA-Z]+)([\u0621-\u064A]+)", r"\1", clean_text)
+
+             # Drop Latin letters preceding an Arabic word
+             clean_text = re.sub(r"([a-zA-Z]+)([\u0621-\u064A]+)", r"\2", clean_text)
+             # Drop Latin letters following an Arabic word
+             clean_text = re.sub(r"([\u0621-\u064A]+)([a-zA-Z]+)", r"\1", clean_text)
+
+             # Add a space between leading numbers and letters (e.g. 123phone -> 123 phone)
+             clean_text = re.sub(r"(\d+)([a-zA-Z\u0621-\u064A]+)", r"\1 \2", clean_text)
+             # Add a space between letters and trailing numbers (e.g. phone123 -> phone 123)
+             clean_text = re.sub(r"([a-zA-Z\u0621-\u064A]+)(\d+)", r"\1 \2", clean_text)
+
+             # Remove extra spaces
+             clean_text = re.sub(r"\s+", " ", clean_text)
+
+             return clean_text.strip()
+         except Exception as e:
+             print(f"An error occurred during query normalization: {e}")
+             return word
+
+     def keep_one_char(self, word: str) -> str:
+         """
+         Keep only one occurrence of consecutive repeated characters in the input word.
+
+         Args:
+             word (str): The input word to modify.
+
+         Returns:
+             str: The modified word with only one occurrence of consecutive repeated characters.
+         """
+         try:
+             return re.sub(r"(.)(\1+)", r"\1", word)
+         except Exception as e:
+             print(f"An error occurred during character repetition removal: {e}")
+             return word
+
+     def translate_text(self, text: str) -> str:
+         """
+         Translate the given text to English and return the translated text.
+
+         Args:
+             text (str): The text to translate.
+
+         Returns:
+             str: The translated text, lowercased and stripped; if translation fails,
+             the original text lowercased and stripped.
+         """
+         try:
+             # aiogoogletrans is asynchronous, so run the coroutine to completion here
+             loop = asyncio.get_event_loop()
+             translated_text = (
+                 loop.run_until_complete(self._translator.translate(text))
+                 .text.lower()
+                 .strip()
+             )
+         except Exception as e:
+             print(f"Text translation failed: {e}")
+             # Use the original text if translation fails
+             translated_text = text.lower().strip()
+         return translated_text
+
+     def check_spelling(self, query: str) -> str:
+         """
+         Check the spelling of the input query and return the corrected version.
+
+         Args:
+             query (str): The input query to check its spelling.
+
+         Returns:
+             str: The corrected query.
+         """
+         try:
+             # Detect the language of the input query using Google Translate API
+             # input_language = self._translator.detect(query)
+             # Simple heuristic: treat purely ASCII queries as English, otherwise Arabic
+             input_language = "en" if query.isascii() else "ar"
+
+             # Initialize SpellChecker with the detected language, falling back to English
+             try:
+                 spell_checker = SpellChecker(language=input_language)
+             except Exception:
+                 spell_checker = SpellChecker(language="en")
+
+             # Initialize an empty string to store the corrected query
+             result_query = ""
+
+             # Iterate through each word in the query
+             for word in query.split(" "):
+                 # Get the corrected version of the word
+                 corrected_word = spell_checker.correction(word)
+
+                 # If no correction is found, retry after collapsing repeated characters
+                 if corrected_word is None:
+                     corrected_word = spell_checker.correction(self.keep_one_char(word))
+
+                     # If still not found, keep the original word
+                     if corrected_word is None:
+                         result_query += word + " "
+                     else:
+                         result_query += corrected_word + " "
+                 else:
+                     result_query += corrected_word + " "
+
+             # Remove trailing whitespace and return the corrected query
+             return result_query.strip()
+         except Exception as e:
+             print(f"An error occurred during spelling check: {e}")
+             return query
+
+     def clean_text(self, text):
+         """
+         Normalize the input text.
+
+         Args:
+             text (str): The input text to be normalized.
+
+         Returns:
+             str: The normalized text.
+         """
+         try:
+             # Convert text to lowercase
+             text = self._convert_to_lowercase(text)
+
+             # Remove leading and trailing whitespace
+             text = self._remove_whitespace(text)
+
+             # Convert text to one line
+             text = re.sub("\n", " ", text)
+
+             # Remove text enclosed in square brackets
+             text = re.sub(r"\[.*?\]", "", text)
+
+             # Remove HTTP links
+             text = self._remove_http(text)
+
+             # Remove HTML tags
+             text = self._remove_html(text)
+
+             # Remove emojis
+             text = self._remove_emoji(text)
+
+             # Fix letter variations
+             text = self._fix_letter_variations(text)
+
+             # Normalize the query
+             text = self._normalize_query(text)
+
+             return text
+         except Exception as e:
+             print(f"An error occurred during text cleaning: {e}")
+             return text
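
For reference, a minimal usage sketch of the class committed above. It assumes the file is importable as normalizer.py, that pandas, nltk, aiogoogletrans, and pyspellchecker are installed, and that the acronym/contraction JSON files are reachable over the network when Normalizer() is constructed; the printed outputs are indicative, not guaranteed.

# Minimal usage sketch (not part of the commit); names and outputs are illustrative.
from normalizer import Normalizer

normalizer = Normalizer()  # downloads the acronym/contraction dictionaries

# Full cleaning pipeline: lowercasing, link/HTML/emoji removal,
# letter-variation fixing, and query normalization.
print(normalizer.clean_text("Chéck out https://example.com <b>NOW</b>!!! 😀"))
# -> e.g. "check out now"

# Spell correction using the ASCII-based language heuristic.
print(normalizer.check_spelling("chek speling"))
# -> e.g. "check spelling"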