Abdul-Ib committed on
Commit
f3669bf
1 Parent(s): 050fb16

Upload clean_data.py

Files changed (1)
  1. clean_data.py +162 -0
clean_data.py ADDED
@@ -0,0 +1,162 @@
+ import string, re, nltk
+ nltk.download('stopwords')
+ nltk.download('averaged_perceptron_tagger')
+ from string import punctuation
+ from nltk.tokenize import word_tokenize, RegexpTokenizer
+ from nltk.corpus import stopwords
+ # from num2words import num2words
+ # from spellchecker import SpellChecker
+ # from nltk.stem.porter import PorterStemmer
+ import spacy
+ from nltk.stem import WordNetLemmatizer
+ import pandas as pd
+
+ # RegexpTokenizer keeping word characters and apostrophes, so contractions stay intact
+ regexp = RegexpTokenizer(r"[\w']+")
+
+ # Converting to lowercase
+ def convert_to_lowercase(text):
+     return text.lower()
+
+ # Removing leading/trailing whitespace
+ def remove_whitespace(text):
+     return text.strip()
+
+ # Removing punctuation
+ def remove_punctuation(text):
+     punct_str = string.punctuation
+     punct_str = punct_str.replace("'", "").replace("%", "") # keep apostrophes (contractions) and percent signs
+     return text.translate(str.maketrans("", "", punct_str))
+
+ # Removing HTML tags
+ def remove_html(text):
+     html = re.compile(r'<.*?>')
+     return html.sub(r'', text)
+
+ # Removing emojis
+ def remove_emoji(text):
+     emoji_pattern = re.compile("["
+                                u"\U0001F600-\U0001F64F"  # emoticons
+                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                                u"\U00002702-\U000027B0"
+                                u"\U000024C2-\U0001F251"
+                                "]+", flags = re.UNICODE)
+     return emoji_pattern.sub(r'', text)
+
+ # Removing URLs
+ def remove_http(text):
+     http = r"https?://\S+|www\.\S+" # matches http://, https://, and www. URLs
+     pattern = r"({})".format(http)
+     return re.sub(pattern, "", text)
+
+ # Dictionary of acronyms
+ acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
+ acronyms_dict = pd.read_json(acronyms_url, typ = 'series')
+ acronyms_list = list(acronyms_dict.keys())
+
+ # Function to expand acronyms in a text
+ def convert_acronyms(text):
+     words = []
+     for word in regexp.tokenize(text):
+         if word in acronyms_list:
+             words = words + acronyms_dict[word].split()
+         else:
+             words = words + word.split()
+
+     text_converted = " ".join(words)
+     return text_converted
+
+ # Dictionary of contractions
+ contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
+ contractions_dict = pd.read_json(contractions_url, typ = 'series')
+ contractions_list = list(contractions_dict.keys())
+
+ # Function to expand contractions in a text
+ def convert_contractions(text):
+     words = []
+     for word in regexp.tokenize(text):
+         if word in contractions_list:
+             words = words + contractions_dict[word].split()
+         else:
+             words = words + word.split()
+
+     text_converted = " ".join(words)
+     return text_converted
+
+ # Stopwords
+ stops = stopwords.words("english") # NLTK English stopwords
+ addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
+ allstops = stops + addstops
+
+ # Function to remove stopwords from a text
+ def remove_stopwords(text):
+     return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
+
+ # pyspellchecker
+ # spell = SpellChecker()
+
+ # def pyspellchecker(text):
+ #     word_list = regexp.tokenize(text)
+ #     word_list_corrected = []
+ #     for word in word_list:
+ #         if word in spell.unknown(word_list):
+ #             word_corrected = spell.correction(word)
+ #             if word_corrected is None:
+ #                 word_list_corrected.append(word)
+ #             else:
+ #                 word_list_corrected.append(word_corrected)
+ #         else:
+ #             word_list_corrected.append(word)
+ #     text_corrected = " ".join(word_list_corrected)
+ #     return text_corrected
+
+ # Lemmatization (spaCy pipeline with parser and NER disabled for speed)
+ spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
+
+ def text_lemmatizer(text):
+     text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
+     return text_spacy
+
+ def keep_pos(text):
+     tokens = regexp.tokenize(text)
+     tokens_tagged = nltk.pos_tag(tokens)
+     # keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
+     keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'CD'] # Penn Treebank tags to keep
+     keep_words = [x[0] for x in tokens_tagged if x[1] in keep_tags]
+     return " ".join(keep_words)
+
+ # Additional stopwords
+
+ alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
+ prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
+ prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
+ coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "yet"]
+ correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
+ subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
+ others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"] # garbled (mojibake) tokens to strip
+ additional_stops = prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others
+
+
+ def remove_additional_stopwords(text):
+     return " ".join([word for word in regexp.tokenize(text) if word not in additional_stops])
+
+ def text_normalizer(text):
+     text = convert_to_lowercase(text)
+     text = remove_whitespace(text)
+     text = re.sub(r'\n', ' ', text) # converting text to one line
+     text = re.sub(r'\[.*?\]', '', text) # removing square brackets
+     text = remove_http(text)
+     text = remove_punctuation(text)
+     text = remove_html(text)
+     text = remove_emoji(text)
+     text = convert_acronyms(text)
+     text = convert_contractions(text)
+     text = remove_stopwords(text)
+     # if include_spellchecker:
+     #     text = pyspellchecker(text)
+     text = text_lemmatizer(text) # text = text_stemmer(text)
+     # text = keep_pos(text)
+     text = remove_additional_stopwords(text)
+     return text
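
The file defines text_normalizer but does not call it. As a minimal usage sketch (not part of this commit; the DataFrame and its "text" column are hypothetical examples), the pipeline would typically be applied to a pandas column like this:

import pandas as pd
from clean_data import text_normalizer  # assumes clean_data.py is importable from the working directory

# hypothetical example data
df = pd.DataFrame({"text": ["I'm LOVING this <b>phone</b>!!! See https://example.com :)"]})
df["text_clean"] = df["text"].apply(text_normalizer)
print(df["text_clean"].iloc[0])

Note that importing the module downloads the acronym and contraction JSON files and the NLTK corpora, and requires the spaCy en_core_web_sm model to be installed.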