IMDB_Reviews / preprocess_data.py
danielcd99's picture
feat:added main files
14536de
raw
history blame
No virus
2.17 kB
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def lowercase_text(text):
    """Return a lowercase copy of *text*."""
    lowered = text.lower()
    return lowered
def remove_html(text):
    """Delete HTML-style tags from *text*.

    A tag is anything that starts with ``<``, is followed by at least one
    character that is not ``<``, and ends at the nearest ``>``. The text
    between tags is left in place.
    """
    tag_pattern = re.compile(r'<[^<]+?>')
    return tag_pattern.sub('', text)
def remove_url(text):
    """Delete URLs from *text*.

    Removes ``http://`` / ``https://`` links as well as bare ``www.``
    addresses; each match runs up to the next whitespace character. The
    match is case-sensitive.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub('', text)
# ASCII punctuation characters that are replaced by a space.
_PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Translation table built once at import time: each punctuation character
# maps to a single space.
_PUNCTUATION_TO_SPACE = str.maketrans(
    _PUNCTUATION_CHARS, ' ' * len(_PUNCTUATION_CHARS)
)


def remove_punctuations(text):
    """Replace every ASCII punctuation character in *text* with a space.

    Punctuation is replaced rather than deleted so that words joined by a
    symbol (``"10/10"``, ``"it's"``) are split apart instead of fused.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length with each punctuation character
        swapped for ``' '``.
    """
    # One pass over the string instead of a full-string ``replace`` for
    # every punctuation character encountered.
    return text.translate(_PUNCTUATION_TO_SPACE)
# Character class of code points treated as emoji / pictographic symbols.
# Compiled once at import time.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    # U+24C2 and U+1F251 are listed individually. Writing them as one range
    # (U+24C2..U+1F251) would also swallow CJK ideographs, kana, Hangul,
    # full-width forms, etc., deleting ordinary non-English text.
    "\U000024C2"
    "\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph "congratulation"
    "\u3299"  # circled ideograph "secret"
    "]+"
)


def remove_emojis(text):
    """Delete emoji and pictographic symbols from *text*.

    Every run of characters matched by the module-level emoji character
    class is removed (replaced by the empty string). Besides the listed
    emoji blocks, the class covers all supplementary-plane characters
    (U+10000 and above). Basic-plane letters such as CJK ideographs are
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        *text* with all matched characters removed.
    """
    return _EMOJI_PATTERN.sub('', text)
def remove_stop_words(text):
    """Drop English stop words from *text*.

    The text is split on whitespace, and every token that appears in NLTK's
    English stop-word list is discarded. The comparison is exact, so callers
    should lowercase the text first if case-insensitive removal is wanted.

    The NLTK ``stopwords`` corpus must already be available locally
    (``get_stopwords`` downloads it).

    Args:
        text: Whitespace-separated words.

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. An empty string if nothing remains.
    """
    # A frozenset gives O(1) membership tests; the corpus reader returns a
    # list, which would be scanned linearly for every word.
    stop_words = frozenset(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)
def stem_words(text):
    """Reduce every word in *text* to its Porter stem.

    The text is split on whitespace, and each token is passed through
    NLTK's ``PorterStemmer``.

    Args:
        text: Whitespace-separated words.

    Returns:
        The stemmed words joined by single spaces, with no leading or
        trailing whitespace. An empty string for empty or whitespace-only
        input.
    """
    stemmer = PorterStemmer()
    # Joining (rather than appending "<stem> " in a loop) avoids repeated
    # string concatenation and does not leave a stray trailing space.
    return ' '.join(stemmer.stem(word) for word in text.split())
def get_stopwords():
    """Download the NLTK ``stopwords`` corpus.

    Despite the name, nothing is returned: the function only asks NLTK to
    fetch the corpus that ``remove_stop_words`` reads from.
    """
    corpus_id = 'stopwords'
    nltk.download(corpus_id)
def preprocess_text(text):
    """Run the full cleaning pipeline on a single piece of text.

    The steps are applied in this order, each one receiving the output of
    the previous one:

    1. ``lowercase_text``
    2. ``remove_html``
    3. ``remove_url``
    4. ``remove_punctuations``
    5. ``remove_emojis``
    6. ``remove_stop_words``
    7. ``stem_words``

    Returns whatever the final step (``stem_words``) produces.
    """
    # Order matters: tags and URLs are stripped before punctuation removal
    # breaks them apart, and lowercasing happens first so that the
    # stop-word comparison sees lowercase tokens.
    pipeline = (
        lowercase_text,
        remove_html,
        remove_url,
        remove_punctuations,
        remove_emojis,
        remove_stop_words,
        stem_words,
    )
    for step in pipeline:
        text = step(text)
    return text
if __name__ == "__main__":
    # Running this file directly is a no-op; it only defines the
    # preprocessing functions above.
    pass