gte-ecommerce

Sleeping

App Files Files Community

gte-ecommerce / normalizer.py

Abdul-Ib

Update normalizer.py

a05a809 verified 9 months ago

raw

history blame

15.3 kB

	import asyncio
	import string, re
	import pandas as pd
	from spellchecker import SpellChecker
	from nltk.tokenize import RegexpTokenizer


	class Normalizer:
	    """
	    Text normalizer for search queries and product text.

	    Provides small, independent cleaning steps (lowercasing, stripping
	    punctuation, HTML tags, emojis and links, folding accented letters,
	    expanding acronyms/contractions, spell checking) plus `clean_text`,
	    which chains the steps used for query normalization.

	    Every step is best-effort: on an unexpected error it prints a message
	    and returns its input instead of raising.
	    """

	    # Default locations of the acronym / contraction lookup tables.
	    _ACRONYMS_URL = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json"
	    _CONTRACTIONS_URL = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json"

	    # Each key lists the spellings that are folded into the mapped value.
	    _LETTER_VARIATIONS = {
	        "aàáâãäåāăą": "a",
	        "cçćĉċč": "c",
	        "eèéêëēĕėęě": "e",
	        "gğ": "g",
	        "hħĥ": "h",
	        "iìíîïīĭįı": "i",
	        "jĵ": "j",
	        "nñńņň": "n",
	        "oòóôõöøōŏő": "o",
	        "ś": "s",
	        "ß": "ss",
	        "uùúûüūŭůűų": "u",
	        "yýÿŷ": "y",
	        "æ": "ae",
	        "œ": "oe",
	    }

	    # Punctuation to strip; apostrophes (contractions) and percent signs stay.
	    _PUNCT_TABLE = str.maketrans(
	        "", "", string.punctuation.replace("'", "").replace("%", "")
	    )

	    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
	    # covers non-emoji scripts such as CJK; kept unchanged to preserve output.
	    _EMOJI_RE = re.compile(
	        "["
	        "\U0001F600-\U0001F64F"  # emoticons
	        "\U0001F300-\U0001F5FF"  # symbols & pictographs
	        "\U0001F680-\U0001F6FF"  # transport & map symbols
	        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
	        "\U00002702-\U000027B0"
	        "\U000024C2-\U0001F251"
	        "]+",
	        flags=re.UNICODE,
	    )

	    _HTML_TAG_RE = re.compile(r"<.*?>")

	    # Alternation must be a bare "|": an escaped "\|" matches a literal pipe.
	    _LINK_RE = re.compile(r"(https?://\S+|www\.\S+)")

	    _ARABIC_LETTER_RE = re.compile(r"[\u0621-\u064A]")

	    def __init__(
	        self,
	        acronyms_url: str = _ACRONYMS_URL,
	        contractions_url: str = _CONTRACTIONS_URL,
	    ):
	        """
	        Initializes the Normalizer object.

	        Loads the acronym and contraction tables with `pandas.read_json`,
	        so constructing an instance reads from the given locations.

	        Args:
	            acronyms_url (str): Location of the acronyms JSON object.
	            contractions_url (str): Location of the contractions JSON object.
	        """
	        # Letter variations dictionary and the regex matching every variant
	        self._letter_variations = dict(self._LETTER_VARIATIONS)
	        self._pattern = self._build_variation_pattern(self._letter_variations)

	        # Tokenizer keeping word characters and apostrophes together
	        self._regexp = RegexpTokenizer(r"[\w']+")

	        # Dictionary of acronyms
	        self._acronyms_dict = pd.read_json(acronyms_url, typ="series")
	        self._acronyms_list = list(self._acronyms_dict.keys())

	        # Dictionary of contractions
	        self._contractions_dict = pd.read_json(contractions_url, typ="series")
	        self._contractions_list = list(self._contractions_dict.keys())

	    @staticmethod
	    def _build_variation_pattern(letter_variations) -> str:
	        """
	        Build a regex character class matching every variant letter.

	        Args:
	            letter_variations (dict): Maps a string of variant spellings to
	                the replacement text.

	        Returns:
	            str: A regex pattern matching any single variant character.
	        """
	        variants = []
	        for spellings, replacement in letter_variations.items():
	            for char in spellings:
	                # Replacing a letter with itself would be a no-op.
	                if char != replacement and char not in variants:
	                    variants.append(char)
	        if not variants:
	            return r"(?!)"  # never matches
	        return "[" + "".join(re.escape(char) for char in variants) + "]"

	    # Converting to lowercase
	    def _convert_to_lowercase(self, text):
	        """
	        Convert the input text to lowercase.

	        Args:
	            text (str): The input text to be converted.

	        Returns:
	            str: The input text converted to lowercase.
	        """
	        try:
	            return text.lower()
	        except Exception as e:
	            print(f"An error occurred during lowercase conversion: {e}")
	            return text

	    # Removing whitespaces
	    def _remove_whitespace(self, text):
	        """
	        Remove leading and trailing whitespaces from the input text.

	        Args:
	            text (str): The input text to be processed.

	        Returns:
	            str: The input text with leading and trailing whitespaces removed.
	        """
	        try:
	            return text.strip()
	        except Exception as e:
	            print(f"An error occurred during whitespace removal: {e}")
	            return text

	    # Removing punctuations
	    def _remove_punctuation(self, text):
	        """
	        Remove punctuation marks from the input text, except for apostrophes
	        and percent signs.

	        Args:
	            text (str): The input text to be processed.

	        Returns:
	            str: The input text with punctuation marks removed.
	        """
	        try:
	            return text.translate(self._PUNCT_TABLE)
	        except Exception as e:
	            print(f"An error occurred during punctuation removal: {e}")
	            return text

	    # Removing HTML tags
	    def _remove_html(self, text):
	        """
	        Remove HTML tags from the input text.

	        Args:
	            text (str): The input text containing HTML tags.

	        Returns:
	            str: The input text with HTML tags removed.
	        """
	        try:
	            return self._HTML_TAG_RE.sub("", text)
	        except Exception as e:
	            print(f"An error occurred during HTML tag removal: {e}")
	            return text

	    # Removing emojis
	    def _remove_emoji(self, text):
	        """
	        Remove emojis from the input text.

	        Args:
	            text (str): The input text containing emojis.

	        Returns:
	            str: The input text with emojis removed.
	        """
	        try:
	            return self._EMOJI_RE.sub("", text)
	        except Exception as e:
	            print(f"An error occurred during emoji removal: {e}")
	            return text

	    # Removing links
	    def _remove_http(self, text):
	        """
	        Remove HTTP links from the input text.

	        Matches anything starting with "http://", "https://" or "www." up to
	        the next whitespace.

	        Args:
	            text (str): The input text containing HTTP links.

	        Returns:
	            str: The input text with HTTP links removed.
	        """
	        try:
	            return self._LINK_RE.sub("", text)
	        except Exception as e:
	            print(f"An error occurred during HTTP link removal: {e}")
	            return text

	    def _expand_tokens(self, text, mapping):
	        """
	        Tokenize the text and replace every token found in `mapping`.

	        Args:
	            text (str): The input text.
	            mapping: Lookup table from token to its expansion.

	        Returns:
	            str: The tokens, expanded where possible, joined by single spaces.
	        """
	        words = []
	        for word in self._regexp.tokenize(text):
	            if word in mapping:
	                words.extend(mapping[word].split())
	            else:
	                words.extend(word.split())
	        return " ".join(words)

	    # Function to convert acronyms in a text
	    def _convert_acronyms(self, text):
	        """
	        Convert acronyms in the text.

	        Example of acronyms dictionary:
	        {"LOL": "laugh out loud", "BRB": "be right back", "IDK": "I don't know"}

	        Args:
	            text (str): The input text containing acronyms.

	        Returns:
	            str: The input text with acronyms expanded.
	        """
	        try:
	            return self._expand_tokens(text, self._acronyms_dict)
	        except Exception as e:
	            print(f"An error occurred during acronym conversion: {e}")
	            return text

	    # Function to convert contractions in a text
	    def _convert_contractions(self, text):
	        """
	        Convert contractions in the text.

	        Example of contractions dictionary:
	        {"I'm": "I am", "he's": "he is", "won't": "will not"}

	        Args:
	            text (str): The input text containing contractions.

	        Returns:
	            str: The input text with contractions expanded.
	        """
	        try:
	            return self._expand_tokens(text, self._contractions_dict)
	        except Exception as e:
	            print(f"An error occurred during contraction conversion: {e}")
	            return text

	    def _fix_letter_variations(self, query):
	        """
	        Replace variations of letters with their original counterparts.

	        Args:
	            query (str): The input query containing variations of letters.

	        Returns:
	            str: The normalized query with variations replaced by their
	            original counterparts.
	        """

	        def replace_variation(match):
	            """
	            Return the replacement for the matched variant, or the matched
	            text itself when no entry of the table contains it.
	            """
	            found = match.group(0)
	            for spellings, replacement in self._letter_variations.items():
	                if found in spellings:
	                    return replacement
	            return found

	        try:
	            return re.sub(self._pattern, replace_variation, query)
	        except Exception as e:
	            print(f"An error occurred during letter variation fixing: {e}")
	            return query

	    def _normalize_query(self, word: str):
	        """
	        Clean the input text by performing the following steps:
	        1. Drop every character that is not a Latin letter, digit, whitespace,
	           Arabic letter, German umlaut/sharp s, or one of "-%*.$".
	        2. Remove non-letter characters sitting between two Latin letters or
	           between two Arabic letters.
	        3. Collapse runs of a repeated character to two occurrences.
	        4. Drop Arabic letters glued to a Latin word (e.g. صصphone -> phone).
	        5. Add a space between numbers and letters (e.g. 123phone -> 123 phone).
	        6. Collapse whitespace and strip the result.

	        Args:
	            word (str): The input text to be cleaned.

	        Returns:
	            str: The cleaned text.
	        """
	        try:
	            # Keep only the whitelisted characters
	            word = re.sub(
	                r"[^A-Za-z\s\-%*.$\u0621-\u064A0-9\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df]",
	                "",
	                word,
	                flags=re.UNICODE,
	            )

	            # Remove anything that is not a Latin letter or whitespace between
	            # two Latin letters (this also covers the narrower variant of the
	            # same rule that excluded Arabic letters from removal)
	            clean_text = re.sub(r"(?<=[a-zA-Z])([^A-Za-z\s]+)(?=[a-zA-Z])", "", word)
	            # Remove non-Arabic characters between Arabic characters
	            clean_text = re.sub(
	                r"(?<=[\u0621-\u064A])([^\u0621-\u064A\s]+)(?=[\u0621-\u064A])",
	                "",
	                clean_text,
	            )

	            # Collapse repeated characters down to two occurrences
	            # NOTE(review): this also shortens digit runs ("1000" -> "100").
	            clean_text = re.sub(r"(.)(\1+)", r"\1\1", clean_text)

	            # Drop Arabic letters directly before a Latin word (صصphone -> phone)
	            clean_text = re.sub(r"([\u0621-\u064A]+)([a-zA-Z]+)", r"\2", clean_text)
	            # Drop Arabic letters directly after a Latin word (phoneصص -> phone)
	            clean_text = re.sub(r"([a-zA-Z]+)([\u0621-\u064A]+)", r"\1", clean_text)
	            # After these two passes no Latin/Arabic adjacency remains, so the
	            # mirrored substitutions would never match and are omitted.

	            # Add space between numbers and following letters (123phone -> 123 phone)
	            clean_text = re.sub(r"(\d+)([a-zA-Z\u0621-\u064A]+)", r"\1 \2", clean_text)
	            # Add space between letters and following numbers (phone123 -> phone 123)
	            clean_text = re.sub(r"([a-zA-Z\u0621-\u064A]+)(\d+)", r"\1 \2", clean_text)

	            # Remove extra spaces
	            clean_text = re.sub(r"\s+", " ", clean_text)

	            return clean_text.strip()
	        except Exception as e:
	            print(f"An error occurred during query normalization: {e}")
	            return word

	    def keep_one_char(self, word: str) -> str:
	        """
	        Keep only one occurrence of consecutive repeated characters in the
	        input word.

	        Args:
	            - word (str): The input word to modify.

	        Returns:
	            - str: The modified word with only one occurrence of consecutive
	              repeated characters.
	        """
	        try:
	            return re.sub(r"(.)(\1+)", r"\1", word)
	        except Exception as e:
	            print(f"An error occurred during character repetition removal: {e}")
	            return word

	    def check_spelling(self, query: str) -> str:
	        """
	        Check the spelling of the input query and return the corrected version.

	        The query is treated as Arabic when it contains at least one Arabic
	        letter and as English otherwise. Words the spell checker cannot
	        correct are retried with repeated characters collapsed and, failing
	        that, kept unchanged.

	        Args:
	            - query (str): The input query to check its spelling.

	        Returns:
	            - str: The corrected query.
	        """
	        try:
	            # Testing the encoded bytes with isalpha() fails on any space or
	            # digit, so look for Arabic letters instead.
	            input_language = "ar" if self._ARABIC_LETTER_RE.search(query) else "en"

	            # Fall back to English if the dictionary cannot be loaded
	            try:
	                spell_checker = SpellChecker(language=input_language)
	            except Exception:
	                spell_checker = SpellChecker(language="en")

	            corrected_words = []
	            for word in query.split(" "):
	                corrected_word = spell_checker.correction(word)

	                # No suggestion: retry with repeated characters collapsed
	                if corrected_word is None:
	                    corrected_word = spell_checker.correction(self.keep_one_char(word))

	                # Still nothing: keep the original word
	                corrected_words.append(word if corrected_word is None else corrected_word)

	            return " ".join(corrected_words).strip()
	        except Exception as e:
	            print(f"An error occurred during spelling check: {e}")
	            return query

	    def clean_text(self, text):
	        """
	        Normalize the input text.

	        Applies, in order: lowercasing, trimming, newline and "[...]" removal,
	        link removal, HTML tag removal, emoji removal, letter-variation
	        folding and query normalization.

	        Args:
	            text (str): The input text to be normalized.

	        Returns:
	            str: The normalized text.
	        """
	        try:
	            # Convert text to lowercase
	            text = self._convert_to_lowercase(text)

	            # Remove leading and trailing whitespace
	            text = self._remove_whitespace(text)

	            # Convert text to one line
	            text = re.sub("\n", " ", text)

	            # Remove square brackets together with their content
	            text = re.sub(r"\[.*?\]", "", text)

	            # Remove HTTP links
	            text = self._remove_http(text)

	            # Remove HTML tags
	            text = self._remove_html(text)

	            # Remove emojis
	            text = self._remove_emoji(text)

	            # Fix letter variations
	            text = self._fix_letter_variations(text)

	            # Normalize queries
	            text = self._normalize_query(text)

	            return text
	        except Exception as e:
	            print(f"An error occurred during text cleaning: {e}")
	            return text