"""Text-cleaning helpers for English-language NLP preprocessing.

Each helper takes a string and returns a cleaned string;
``preprocess_text`` chains them into a single pipeline.
"""

import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Patterns are compiled once at import time instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^<]+?>')
_URL_RE = re.compile(r'http[s]?://\S+|www\.\S+')

# ASCII punctuation; every one of these characters is mapped to a space.
_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION, ' '))

# NOTE: the widely copied version of this pattern contains the range
# U+24C2-U+1F251, which spans almost the whole Basic Multilingual Plane
# above U+24C2 and therefore deletes CJK, kana, Hangul, full-width forms,
# etc.  Only the individual emoji code points from that span are kept here.
_EMOJI_RE = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes (emoticons,
                             # pictographs, transport symbols, flags, ...)
    "\u2500-\u2BEF"          # box drawing through misc. symbols and arrows
                             # (includes dingbats and U+2600-U+2B55)
    "\u200d"                 # zero-width joiner used in emoji sequences
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u24c2"                 # circled latin capital letter M
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph congratulation
    "\u3299"                 # circled ideograph secret
    "]+",
    re.UNICODE,
)


def lowercase_text(text: str) -> str:
    """Return *text* converted to lower case."""
    return text.lower()


def remove_html(text: str) -> str:
    """Return *text* with anything shaped like an ``<...>`` tag removed.

    The match is purely lexical (``<``, one or more non-``<`` characters,
    ``>``); the text is not parsed as HTML.
    """
    return _HTML_TAG_RE.sub('', text)


def remove_url(text: str) -> str:
    """Return *text* with ``http://``, ``https://`` and ``www.`` URLs removed.

    A URL is taken to run up to the next whitespace character.
    """
    return _URL_RE.sub('', text)


def remove_punctuations(text: str) -> str:
    """Return *text* with every ASCII punctuation character replaced by a space.

    Punctuation is replaced rather than deleted so that words joined only by
    punctuation (``"a,b"``) stay separate tokens.
    """
    # One C-level pass instead of one ``str.replace`` per punctuation hit.
    return text.translate(_PUNCTUATION_TABLE)


def remove_emojis(text: str) -> str:
    """Return *text* with emoji and related pictographic symbols removed.

    All supplementary-plane characters (U+10000 and above) are removed,
    together with the BMP symbol block U+2500-U+2BEF and a handful of
    individual emoji code points.  Ordinary BMP text, including CJK,
    is left untouched.
    """
    return _EMOJI_RE.sub('', text)


@lru_cache(maxsize=1)
def _english_stop_words() -> frozenset:
    """Load the NLTK English stop-word list once and return it as a set."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(text: str) -> str:
    """Return *text* with English stop words dropped.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed and leading/trailing whitespace is lost.
    Comparison is case-sensitive; lower-case the text first.

    Raises:
        LookupError: raised by NLTK when the ``stopwords`` corpus is not
            installed; call ``get_stopwords()`` once beforehand.
    """
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)


def stem_words(text: str) -> str:
    """Return *text* with every whitespace-separated word Porter-stemmed.

    Words are re-joined with single spaces and no trailing space is left,
    matching the output shape of ``remove_stop_words``.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())


def get_stopwords() -> None:
    """Download the NLTK ``stopwords`` corpus.

    Despite the name, nothing is returned; this only makes the corpus
    available to ``remove_stop_words``.
    """
    nltk.download('stopwords')


def preprocess_text(text: str) -> str:
    """Run the full cleaning pipeline on *text* and return the result.

    Steps, in order: lower-casing, HTML-tag removal, URL removal,
    punctuation replacement, emoji removal, stop-word removal, stemming.
    HTML tags and URLs are removed before punctuation because their
    patterns rely on ``<``, ``>``, ``:``, ``/`` and ``.`` still being
    present.

    Raises:
        LookupError: propagated from ``remove_stop_words`` when the NLTK
            ``stopwords`` corpus has not been downloaded.
    """
    text = lowercase_text(text)
    text = remove_html(text)
    text = remove_url(text)
    text = remove_punctuations(text)
    text = remove_emojis(text)
    text = remove_stop_words(text)
    text = stem_words(text)
    return text


if __name__ == "__main__":
    pass