# gte-ecommerce / clean_data.py
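"""Text-cleaning utilities for e-commerce product text.

Small, composable normalization steps (lowercasing, whitespace/punctuation/HTML/
emoji/URL removal, acronym and contraction expansion, stopword removal, spaCy
lemmatization) plus a text_normalizer() pipeline that chains them together.
"""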
import re
import string

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# from num2words import num2words
# from spellchecker import SpellChecker
# from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# RegexpTokenizer
regexp = RegexpTokenizer(r"[\w']+")
# Converting to lowercase
def convert_to_lowercase(text):
return text.lower()
# Removing whitespaces
def remove_whitespace(text):
return text.strip()
# Removing punctuations
def remove_punctuation(text):
punct_str = string.punctuation
    punct_str = punct_str.replace("'", "").replace("%", "")  # keep apostrophes (contractions) and percent signs intact
return text.translate(str.maketrans("", "", punct_str))
# Removing HTML tags
def remove_html(text):
html = re.compile(r'<.*?>')
return html.sub(r'', text)
# Removing emojis
def remove_emoji(text):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags = re.UNICODE)
return emoji_pattern.sub(r'', text)
# Removing URLs
def remove_http(text):
    http = r"https?://\S+|www\.\S+"  # matching URLs beginning with http(s) or www
    pattern = r"({})".format(http)  # creating pattern
    return re.sub(pattern, "", text)
# Dictionary of acronyms
acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
acronyms_dict = pd.read_json(acronyms_url, typ = 'series')
acronyms_list = list(acronyms_dict.keys())
# Function to convert acronyms in a text
def convert_acronyms(text):
words = []
for word in regexp.tokenize(text):
if word in acronyms_list:
words = words + acronyms_dict[word].split()
else:
words = words + word.split()
text_converted = " ".join(words)
return text_converted
# Dictionary of contractions
contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ = 'series')
contractions_list = list(contractions_dict.keys())
# Function to convert contractions in a text
def convert_contractions(text):
words = []
for word in regexp.tokenize(text):
if word in contractions_list:
words = words + contractions_dict[word].split()
else:
words = words + word.split()
text_converted = " ".join(words)
return text_converted
# Stopwords
stops = stopwords.words("english") # stopwords
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
allstops = stops + addstops
# Function to remove stopwords from a list of texts
def remove_stopwords(text):
return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
# pyspellchecker
# spell = SpellChecker()
# def pyspellchecker(text):
# word_list = regexp.tokenize(text)
# word_list_corrected = []
# for word in word_list:
# if word in spell.unknown(word_list):
# word_corrected = spell.correction(word)
# if word_corrected == None:
# word_list_corrected.append(word)
# else:
# word_list_corrected.append(word_corrected)
# else:
# word_list_corrected.append(word)
# text_corrected = " ".join(word_list_corrected)
# return text_corrected
# Lemmatization
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
def text_lemmatizer(text):
text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
return text_spacy
def keep_pos(text):
tokens = regexp.tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)
#keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
    keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'CD']
keep_words = [x[0] for x in tokens_tagged if x[1] in keep_tags]
return " ".join(keep_words)
# Additional stopwords
alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "yet"]
correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"] # stray mis-encoded tokens to filter out
additional_stops = prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others
def remove_additional_stopwords(text):
return " ".join([word for word in regexp.tokenize(text) if word not in additional_stops])
def text_normalizer(text):
text = convert_to_lowercase(text)
text = remove_whitespace(text)
    text = re.sub('\n', ' ', text) # collapsing newlines into a single line
    text = re.sub(r'\[.*?\]', '', text) # removing square brackets and their contents
text = remove_http(text)
text = remove_punctuation(text)
text = remove_html(text)
text = remove_emoji(text)
text = convert_acronyms(text)
text = convert_contractions(text)
text = remove_stopwords(text)
# if include_spellchecker:
# text = pyspellchecker(text)
text = text_lemmatizer(text) # text = text_stemmer(text)
# text = keep_pos(text)
text = remove_additional_stopwords(text)
return text
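# Example usage (illustrative sketch, not part of the original script): cleaning a
# single raw product description. The sample text and column name below are
# assumptions for demonstration only; the script already requires the spaCy model
# en_core_web_sm and an internet connection for the acronym/contraction JSON files.
if __name__ == "__main__":
    sample = pd.DataFrame({
        "description": ["<b>NEW!</b> Running shoes, 50% off. Order ASAP at https://example.com :)"]
    })
    sample["clean_description"] = sample["description"].apply(text_normalizer)
    print(sample["clean_description"].iloc[0])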