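"""Text-cleaning helpers: lowercasing, HTML/URL/punctuation/emoji removal,
stop-word filtering and Porter stemming, chained together by ``preprocess_text``.
"""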
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem import PorterStemmer | |
def lowercase_text(text):
    """Return a lowercase copy of ``text``.

    Args:
        text: The string to convert.

    Returns:
        ``text`` with every cased character lowered via ``str.lower``.
    """
    lowered = text.lower()
    return lowered
def remove_html(text):
    """Delete HTML-style tags from ``text``.

    A tag is ``<`` followed by one or more characters other than ``<`` and
    then the nearest closing ``>``. The text between tags is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matching ``<...>`` span removed.
    """
    tag_pattern = re.compile(r'<[^<]+?>')
    return tag_pattern.sub('', text)
def remove_url(text):
    """Delete URLs from ``text``.

    Matches anything starting with ``http://`` or ``https://``, or with
    ``www.``, up to the next whitespace character.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL removed (surrounding whitespace is kept).
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub('', text)
# ASCII punctuation characters that are replaced by a single space each.
_PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Translation table built once at import time: punctuation code point -> ' '.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(_PUNCTUATION_CHARS, ' '))


def remove_punctuations(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Uses a single ``str.translate`` pass instead of calling ``str.replace``
    once per punctuation character encountered, so the cost is linear in the
    length of ``text``.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length as ``text`` in which each punctuation
        character has been replaced by ``' '``.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)
# Compiled once at import time. The original character class contained the
# range U+24C2..U+1F251, which also swallowed CJK, Hangul, full-width forms and
# the rest of the upper BMP; only its two end points are kept here (U+1F251 is
# covered by the supplementary-plane range).
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # every supplementary-plane code point (emoticons, pictographs, flags, ...)
    "\u2500-\u2BEF"  # BMP symbol area; includes the dingbat and misc-symbol ranges listed originally
    "\u24C2"  # circled latin capital letter M
    "\u200D"  # zero width joiner used in emoji sequences
    "\u23CF"
    "\u23E9"
    "\u231A"
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "\u3030"
    "]+"
)


def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    Removes all code points above U+FFFF, the U+2500..U+2BEF symbol area and
    a handful of individual BMP characters used in emoji sequences. Ordinary
    BMP text, including CJK and other non-Latin scripts, is preserved.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched character removed.
    """
    return _EMOJI_PATTERN.sub('', text)
def remove_stop_words(text):
    """Drop English stop words from ``text``.

    ``text`` is split on whitespace and every token that appears in NLTK's
    English stop-word list is discarded. The comparison is exact, so callers
    should lowercase the text beforehand if case-insensitive matching is wanted.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # A frozenset gives O(1) membership tests instead of scanning the list
    # returned by NLTK once per token.
    stop_words = frozenset(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)
def stem_words(text):
    """Apply Porter stemming to every whitespace-separated token in ``text``.

    Args:
        text: The string whose tokens should be stemmed.

    Returns:
        The stemmed tokens joined by single spaces. The result carries no
        trailing space (the previous implementation appended one after the
        last token).
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())
def get_stopwords():
    """Download the NLTK ``stopwords`` resource via ``nltk.download``.

    Returns nothing; the call is made purely for its side effect.
    """
    resource_name = 'stopwords'
    nltk.download(resource_name)
def preprocess_text(text):
    """Run the full cleaning pipeline over ``text``.

    The steps are applied in a fixed order: lowercase, strip HTML tags, strip
    URLs, replace punctuation, strip emojis, drop stop words, stem.

    Args:
        text: The raw input string.

    Returns:
        The string produced by the final (stemming) step.
    """
    pipeline = (
        lowercase_text,
        remove_html,
        remove_url,
        remove_punctuations,
        remove_emojis,
        remove_stop_words,
        stem_words,
    )
    for step in pipeline:
        text = step(text)
    return text
# Running this file directly does nothing; the guard is a placeholder.
if __name__ == "__main__":
    pass