__author__ = "Baishali Dutta"
__copyright__ = "Copyright (C) 2021 Baishali Dutta"
__license__ = "Apache License 2.0"
__version__ = "0.1"

# -------------------------------------------------------------------------
# Import Libraries
# -------------------------------------------------------------------------
import re

import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# -------------------------------------------------------------------------
# One-shot Instance Creation
# -------------------------------------------------------------------------
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
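# Depending on the installed NLTK version, WordNet lemmatization may also need
# the Open Multilingual WordNet corpus; uncomment the line below if
# Word.lemmatize() raises a LookupError.
# nltk.download('omw-1.4')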

# -------------------------------------------------------------------------
# Data Cleaning
# -------------------------------------------------------------------------
def convert_to_lower_case_on_string(text):
    """
    Converts the specified text to lower case
    :param text: the text to convert
    :return: the lower cased text
    """
    return " ".join(word.lower() for word in text.split())


def convert_to_lower_case(text_column):
    """
    Converts the text in the specified column to lower case
    :param text_column: the text column whose content needs to be converted
    :return: the text column containing the lower cased text
    """
    return text_column.apply(convert_to_lower_case_on_string)

def apply_contraction_mapping_on_string(text):
    """
    Applies the contraction mapping to the specified text
    :param text: the text on which the contraction will be mapped
    :return: the text after the application of contraction mapping
    """
    contraction_mapping = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
        "could've": "could have", "couldn't": "could not", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not",
        "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
        "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
        "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
        "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would",
        "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
        "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
        "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us",
        "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
        "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
        "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
        "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
        "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
        "she'll've": "she will have", "she's": "she is", "should've": "should have",
        "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
        "so's": "so as", "this's": "this is", "that'd": "that would",
        "that'd've": "that would have", "that's": "that is", "there'd": "there would",
        "there'd've": "there would have", "there's": "there is", "here's": "here is",
        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
        "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
        "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
        "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
        "when've": "when have", "where'd": "where did", "where's": "where is",
        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
        "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
        "will've": "will have", "won't": "will not", "won't've": "will not have",
        "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
        "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
        "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
        "you're": "you are", "you've": "you have"}
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])
    return text

def apply_contraction_mapping(text_column):
    """
    Applies the contraction mapping to the text in the specified column
    :param text_column: the text column on which the contraction will be mapped
    :return: the text column after the application of contraction mapping
    """
    return text_column.apply(apply_contraction_mapping_on_string)
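
# Illustrative check of the contraction mapping (curly apostrophes are first
# normalized to straight quotes before the dictionary lookup):
# >>> apply_contraction_mapping_on_string("I’ll go, but she won't")
# 'I will go, but she will not'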

def fix_misspelled_words_on_string2(text):
    """
    Fixes the misspelled words in the specified text (uses a predefined misspelling dictionary)
    :param text: the text to be fixed
    :return: the fixed text
    """
    misspelled_dict = {
        'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
        'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled',
        'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2',
        'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary',
        'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are',
        'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do',
        'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does',
        'mastrubation': 'masturbation', 'mastrubate': 'masturbate',
        'mastrubating': 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum',
        'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018',
        'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess',
        'whst': 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
        'demonitization': 'demonetization', 'demonetisation': 'demonetization',
        ' ur ': ' your ', ' u r ': ' you are '}
    for word in misspelled_dict.keys():
        text = text.replace(word, misspelled_dict[word])
    return text

def fix_misspelled_words_on_string(text):
    """
    Fixes the misspelled words in the specified text (uses the TextBlob model)
    :param text: the text to be fixed
    :return: the fixed text
    """
    b = TextBlob(text)
    return str(b.correct())

def fix_misspelled_words(text_column):
    """
    Fixes the misspelled words in the text of the specified column (uses the dictionary-based variant)
    :param text_column: the text column to be fixed
    :return: the text column containing the fixed text
    """
    return text_column.apply(fix_misspelled_words_on_string2)
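
# Illustrative check of the dictionary-based spelling fix; only the listed
# patterns are rewritten, unlike the slower, general TextBlob-based variant:
# >>> fix_misspelled_words_on_string2("what is demonitisation")
# 'what is demonetization'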

def remove_punctuations_on_string(text):
    """
    Removes all punctuations from the specified text
    :param text: the text whose punctuations need to be removed
    :return: the text after removing the punctuations
    """
    return re.sub(r'[^\w\s]', '', text)


def remove_punctuations(text_column):
    """
    Removes all punctuations from the text of the specified text column
    :param text_column: the text column whose punctuations need to be removed
    :return: the text column after removing the punctuations
    """
    return text_column.apply(remove_punctuations_on_string)
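
# Illustrative check of punctuation removal (underscores count as word
# characters under \w and are therefore kept):
# >>> remove_punctuations_on_string("hello, world!")
# 'hello world'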

def remove_emojis_on_string(text):
    """
    Removes emojis from the specified text
    :param text: the text whose emojis need to be removed
    :return: the text after removing the emojis
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_emojis(text_column):
    """
    Removes emojis from the text of the specified column
    :param text_column: the text column whose emojis need to be removed
    :return: the text column after removing the emojis
    """
    return text_column.apply(remove_emojis_on_string)

def remove_stopwords_on_string(text):
    """
    Removes all stop words from the specified text
    :param text: the text whose stop words need to be removed
    :return: the text after removing the stop words
    """
    return " ".join(word for word in text.split() if word not in stop_words)


def remove_stopwords(text_column):
    """
    Removes all stop words from the text of the specified column
    :param text_column: the text column whose stop words need to be removed
    :return: the text column after removing the stop words
    """
    return text_column.apply(remove_stopwords_on_string)

def lemmatize_on_string(text):
    """
    Lemmatizes the specified text
    :param text: the text which needs to be lemmatized
    :return: the lemmatized text
    """
    blob = TextBlob(text).split()
    result = []
    for word in blob:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize()
        result.append(expected_str)
    return " ".join(result)


def lemmatize(text_column):
    """
    Lemmatizes the text of the specified text column
    :param text_column: the text column which needs to be lemmatized
    :return: the lemmatized text column
    """
    return text_column.apply(lemmatize_on_string)
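
# Illustrative check of lemmatization; Word.lemmatize() defaults to noun
# lemmas, so plural nouns are singularized while other forms are left as-is:
# >>> lemmatize_on_string("apples and oranges")
# 'apple and orange'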

def clean_text_column(text_column):
    """
    Cleans the data specified in the text column
    The cleaning procedure is as follows:
    1. Convert the content to lower case
    2. Apply contraction mapping, which expands shortened English forms
    3. Fix misspelled words
    4. Remove all punctuations
    5. Remove all emojis
    6. Remove all stop words
    7. Apply lemmatization
    :param text_column: the text column to be cleaned
    :return: the text column with the cleaned data
    """
    text_column = convert_to_lower_case(text_column)
    text_column = apply_contraction_mapping(text_column)
    text_column = fix_misspelled_words(text_column)
    text_column = remove_punctuations(text_column)
    text_column = remove_emojis(text_column)
    text_column = remove_stopwords(text_column)
    text_column = lemmatize(text_column)
    return text_column
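
# Minimal usage sketch for column-level cleaning, assuming the column is a
# pandas Series of strings (pandas itself is not imported by this module):
#
#   import pandas as pd
#   reviews = pd.Series(["I can't believe it's not butter!!"])
#   cleaned = clean_text_column(reviews)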

def clean_text(text):
    """
    Cleans the specified text
    The cleaning procedure is as follows:
    1. Convert the content to lower case
    2. Apply contraction mapping, which expands shortened English forms
    3. Fix misspelled words
    4. Remove all punctuations
    5. Remove all emojis
    6. Remove all stop words
    7. Apply lemmatization
    :param text: the text to be cleaned
    :return: the cleaned text
    """
    text = convert_to_lower_case_on_string(text)
    text = apply_contraction_mapping_on_string(text)
    text = fix_misspelled_words_on_string(text)
    text = remove_punctuations_on_string(text)
    text = remove_emojis_on_string(text)
    text = remove_stopwords_on_string(text)
    text = lemmatize_on_string(text)
    return text
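
# Minimal sketch of the string-level pipeline; the exact output depends on the
# installed TextBlob/NLTK data, so this is illustrative only.
if __name__ == "__main__":
    sample = "I can't stop watching these AMAZING movies!! 😍"
    print(clean_text(sample))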