Spaces:
Sleeping
Sleeping
File size: 2,174 Bytes
14536de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def lowercase_text(text):
    """Return ``text`` converted to lowercase via ``text.lower()``."""
    lowered = text.lower()
    return lowered
def remove_html(text):
    """Strip HTML-like tags from ``text``.

    A tag is a ``<``, followed by one or more characters other than ``<``
    (matched as few as possible), followed by ``>``. Each match is deleted;
    the text between tags is left as it is.
    """
    tag_pattern = re.compile(r'<[^<]+?>')
    return tag_pattern.sub('', text)
def remove_url(text):
    """Delete URLs from ``text``.

    Two forms are recognised: ``http://`` or ``https://`` followed by a run
    of non-whitespace characters, and ``www.`` followed by a run of
    non-whitespace characters. Surrounding whitespace is not collapsed.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    without_urls = url_pattern.sub('', text)
    return without_urls
def remove_punctuations(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Each punctuation character is substituted one-for-one, so the length of
    the string is preserved and runs of punctuation become runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with each punctuation character turned into ``' '``.
    """
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # A single translate() pass replaces the former per-character
    # str.replace() calls, each of which rescanned the whole string.
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return text.translate(to_space)
# Compiled once at import time; the character class lists the code points
# treated as emoji or emoji-adjacent symbols.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # all supplementary-plane code points
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    # The former range U+24C2..U+1F251 also spanned CJK, Hangul, kana and
    # fullwidth forms, erasing ordinary text. Only the emoji code points
    # from that span that no other entry covers are listed here.
    "\u24C2"
    "\u303D"
    "\u3297"
    "\u3299"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "]+"
)


def remove_emojis(text):
    """Delete emoji and related symbol characters from ``text``.

    Every run of characters matched by ``_EMOJI_PATTERN`` is removed. Text in
    Basic Multilingual Plane scripts such as CJK ideographs, kana and Hangul
    is left intact. Any code point at or above U+10000 is still removed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matched characters removed; surrounding whitespace
        is not collapsed.
    """
    return _EMOJI_PATTERN.sub('', text)
def remove_stop_words(text, stop_words=None):
    """Drop stop words from whitespace-separated ``text``.

    Args:
        text: The string to filter. It is split on whitespace and compared
            word-by-word, case-sensitively, against the stop-word collection.
        stop_words: Optional iterable of words to remove. When omitted, the
            list returned by ``stopwords.words('english')`` is used.
            NOTE(review): that call presumably requires the NLTK stopwords
            resource to be available locally (see ``get_stopwords``) - confirm.

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. An empty string if nothing remains.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every word in the text.
    excluded = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in excluded)
def stem_words(text, stemmer=None):
    """Stem every whitespace-separated word in ``text``.

    Args:
        text: The string to process; it is split on whitespace.
        stemmer: Optional object exposing a ``stem(word)`` method. When
            omitted, a new ``PorterStemmer`` is created for the call.

    Returns:
        The stemmed words joined by single spaces. Unlike the previous
        implementation, the result carries no trailing space, matching the
        output convention of ``remove_stop_words``.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())
def get_stopwords():
    """Request the ``'stopwords'`` resource through ``nltk.download``.

    Despite the name, nothing is returned; the function only triggers the
    download call.
    """
    resource_name = 'stopwords'
    nltk.download(resource_name)
def preprocess_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The steps are applied in this fixed order: lowercasing, HTML tag removal,
    URL removal, punctuation removal, emoji removal, stop-word removal, and
    stemming. Each step receives the output of the previous one.
    """
    pipeline = (
        lowercase_text,
        remove_html,
        remove_url,
        remove_punctuations,
        remove_emojis,
        remove_stop_words,
        stem_words,
    )
    for step in pipeline:
        text = step(text)
    return text
if __name__ == "__main__":
    # Running the module as a script performs no action; the functions above
    # are meant to be imported. The stray "|" that followed ``pass`` was an
    # extraction artifact and made the statement a syntax error.
    pass