Abdul-Ib committed on
Commit
f3669bf
1 Parent(s): 050fb16

Upload clean_data.py

Files changed (1)
  1. clean_data.py +162 -0
clean_data.py ADDED
@@ -0,0 +1,162 @@
+ import string, re, nltk
+ nltk.download('stopwords')
+ nltk.download('averaged_perceptron_tagger')
+ from string import punctuation
+ from nltk.tokenize import word_tokenize, RegexpTokenizer
+ from nltk.corpus import stopwords
+ # from num2words import num2words
+ # from spellchecker import SpellChecker
+ # from nltk.stem.porter import PorterStemmer
+ import spacy
+ from nltk.stem import WordNetLemmatizer
+ import pandas as pd
+
+ # RegexpTokenizer keeping word characters and apostrophes, so contractions stay intact
+ regexp = RegexpTokenizer(r"[\w']+")
+
+ # Converting to lowercase
+ def convert_to_lowercase(text):
+     return text.lower()
+
+ # Removing leading/trailing whitespace
+ def remove_whitespace(text):
+     return text.strip()
+
+ # Removing punctuation
+ def remove_punctuation(text):
+     punct_str = string.punctuation
+     punct_str = punct_str.replace("'", "").replace("%", "") # keep apostrophes (contractions) and percent signs
+     return text.translate(str.maketrans("", "", punct_str))
+
+ # Removing HTML tags
+ def remove_html(text):
+     html = re.compile(r'<.*?>')
+     return html.sub(r'', text)
+
+ # Removing emojis
+ def remove_emoji(text):
+     emoji_pattern = re.compile("["
+                                u"\U0001F600-\U0001F64F"  # emoticons
+                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                                u"\U00002702-\U000027B0"
+                                u"\U000024C2-\U0001F251"
+                                "]+", flags = re.UNICODE)
+     return emoji_pattern.sub(r'', text)
+
+ # Removing URLs
+ def remove_http(text):
+     http = r"https?://\S+|www\.\S+" # matches http://, https://, and www. URLs
+     pattern = r"({})".format(http)
+     return re.sub(pattern, "", text)
+
+ # Dictionary of acronyms
+ acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
+ acronyms_dict = pd.read_json(acronyms_url, typ = 'series')
+ acronyms_list = list(acronyms_dict.keys())
+
+ # Function to expand acronyms in a text
+ def convert_acronyms(text):
+     words = []
+     for word in regexp.tokenize(text):
+         if word in acronyms_list:
+             words = words + acronyms_dict[word].split()
+         else:
+             words = words + word.split()
+
+     text_converted = " ".join(words)
+     return text_converted
+
+ # Dictionary of contractions
+ contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
+ contractions_dict = pd.read_json(contractions_url, typ = 'series')
+ contractions_list = list(contractions_dict.keys())
+
+ # Function to expand contractions in a text
+ def convert_contractions(text):
+     words = []
+     for word in regexp.tokenize(text):
+         if word in contractions_list:
+             words = words + contractions_dict[word].split()
+         else:
+             words = words + word.split()
+
+     text_converted = " ".join(words)
+     return text_converted
+
+ # Stopwords
+ stops = stopwords.words("english") # NLTK English stopwords
+ addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
+ allstops = stops + addstops
+
+ # Function to remove stopwords from a text
+ def remove_stopwords(text):
+     return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
+
+ # pyspellchecker
+ # spell = SpellChecker()
+
+ # def pyspellchecker(text):
+ #     word_list = regexp.tokenize(text)
+ #     word_list_corrected = []
+ #     for word in word_list:
+ #         if word in spell.unknown(word_list):
+ #             word_corrected = spell.correction(word)
+ #             if word_corrected is None:
+ #                 word_list_corrected.append(word)
+ #             else:
+ #                 word_list_corrected.append(word_corrected)
+ #         else:
+ #             word_list_corrected.append(word)
+ #     text_corrected = " ".join(word_list_corrected)
+ #     return text_corrected
+
+ # Lemmatization (spaCy pipeline with parser and NER disabled for speed)
+ spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
+
+ def text_lemmatizer(text):
+     text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
+     return text_spacy
+
+ def keep_pos(text):
+     tokens = regexp.tokenize(text)
+     tokens_tagged = nltk.pos_tag(tokens)
+     # keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
+     keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'CD'] # Penn Treebank tags to keep
+     keep_words = [x[0] for x in tokens_tagged if x[1] in keep_tags]
+     return " ".join(keep_words)
+
+ # Additional stopwords
+
+ alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
+ prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
+ prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
+ coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "yet"]
+ correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
+ subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
+ others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"] # garbled (mojibake) tokens to strip
+ additional_stops = prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others
+
+
+ def remove_additional_stopwords(text):
+     return " ".join([word for word in regexp.tokenize(text) if word not in additional_stops])
+
+ def text_normalizer(text):
+     text = convert_to_lowercase(text)
+     text = remove_whitespace(text)
+     text = re.sub(r'\n', ' ', text) # converting text to one line
+     text = re.sub(r'\[.*?\]', '', text) # removing square brackets
+     text = remove_http(text)
+     text = remove_punctuation(text)
+     text = remove_html(text)
+     text = remove_emoji(text)
+     text = convert_acronyms(text)
+     text = convert_contractions(text)
+     text = remove_stopwords(text)
+     # if include_spellchecker:
+     #     text = pyspellchecker(text)
+     text = text_lemmatizer(text) # text = text_stemmer(text)
+     # text = keep_pos(text)
+     text = remove_additional_stopwords(text)
+     return text
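
The file defines text_normalizer but does not call it. As a minimal usage sketch (not part of this commit; the DataFrame and its "text" column are hypothetical examples), the pipeline would typically be applied to a pandas column like this:

import pandas as pd
from clean_data import text_normalizer  # assumes clean_data.py is importable from the working directory

# hypothetical example data
df = pd.DataFrame({"text": ["I'm LOVING this <b>phone</b>!!! See https://example.com :)"]})
df["text_clean"] = df["text"].apply(text_normalizer)
print(df["text_clean"].iloc[0])

Note that importing the module downloads the acronym and contraction JSON files and the NLTK corpora, and requires the spaCy en_core_web_sm model to be installed.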