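# MindfulMedia_Mentor / clean_text_model.py
#
# Web file-viewer residue kept for provenance (not part of the module logic):
#   user:    jaelin215
#   commit:  35de82b (verified) - "added downloading nltk stop words"
#   viewer:  raw / history / blame
#   status:  No virus
#   size:    995 Bytes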
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk
class TextCleaner:
    """Turn raw (possibly HTML) text into a string of unique, lemmatized words.

    The pipeline applied by :meth:`cleaning_text` is:

    1. strip HTML markup with BeautifulSoup,
    2. lowercase, then replace URLs, ``www.`` links, ``@mentions``,
       ``#hashtags`` and every character that is not an ASCII letter
       with a space,
    3. drop one-character tokens and English stop words,
    4. lemmatize the remaining tokens with ``WordNetLemmatizer``,
    5. remove duplicate tokens, keeping the first occurrence of each.
    """

    # NLTK corpora this class depends on: ``stopwords`` feeds the filter
    # list, ``wordnet`` backs ``WordNetLemmatizer``. Without ``wordnet`` the
    # first ``lemmatize()`` call raises ``LookupError`` on a fresh install.
    _NLTK_RESOURCES = ("stopwords", "wordnet")

    # Compiled once for reuse across calls. Alternatives, in order: http(s)
    # URLs, www links, @mentions, #hashtags, and finally any single
    # character outside a-z / A-Z.
    _NOISE_RE = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Download the required NLTK corpora and build the cleaning helpers.

        Side effects: installs a process-wide "ignore" filter for all
        warnings and calls ``nltk.download`` for each required corpus.
        """
        # NOTE(review): this silences *every* warning for the whole process,
        # not just those raised while cleaning - presumably meant to hide
        # noisy parser warnings; confirm before narrowing it.
        warnings.filterwarnings("ignore")
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return the cleaned form of *text*.

        Args:
            text: The raw input. Anything that is falsy or not a ``str``
                (``None``, ``''``, numbers, lists, ...) is treated as empty.

        Returns:
            A single-space-separated string of unique lemmas in order of
            first appearance, or ``''`` when the input is empty or not a
            string.
        """
        if not text or not isinstance(text, str):
            return ''

        plain = BeautifulSoup(text, "html.parser").get_text()
        # Lowercase first so the stop-word lookup below is case-insensitive.
        scrubbed = self._NOISE_RE.sub(' ', plain.lower())
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in scrubbed.split()
            if len(word) > 1 and word not in self.stop_words
        ]
        # dict.fromkeys de-duplicates while preserving insertion order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Demo when run as a script: clean one sample sentence and print it.
    sample_sentence = "I feel bullied online."
    demo_result = TextCleaner().cleaning_text(sample_sentence)
    print(demo_result)