# MindfulMedia_Mentor / clean_text_model.py
# jaelin215's picture
# added downloading WordNet
# ba852a7 verified
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk
class TextCleaner:
    """Turn raw text into a space-separated string of unique lemmas.

    The pipeline strips HTML markup, lower-cases the text, removes URLs,
    @mentions, #hashtags and every non-letter character, drops English stop
    words and one-character tokens, lemmatizes what is left, and finally
    removes repeated words (so word-frequency information is not kept).
    """

    # URLs, bare "www." links, @mentions, #hashtags and any remaining
    # non-letter character are each replaced by a single space.
    _NOISE_RE = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Load the stop-word set and lemmatizer, fetching NLTK data if absent."""
        # NOTE(review): this silences *every* warning for the whole process,
        # not only those raised while cleaning text. Kept as-is because
        # callers may rely on it; consider narrowing the filter.
        warnings.filterwarnings("ignore")
        self._ensure_corpus('stopwords')
        self._ensure_corpus('wordnet')
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def _ensure_corpus(name):
        """Download the NLTK corpus *name* only when it is not installed."""
        try:
            # Local lookup first: avoids a network round trip (and a failure
            # when offline) on every instantiation once the data is present.
            nltk.data.find('corpora/' + name)
        except LookupError:
            nltk.download(name)

    def cleaning_text(self, text):
        """Return the cleaned form of *text*.

        Args:
            text: The raw input. Anything that is falsy or not a ``str``
                is treated as having no content.

        Returns:
            A string of lower-case lemmas separated by single spaces, each
            word appearing once in first-seen order; ``''`` when *text* is
            empty or not a string.
        """
        if not text or not isinstance(text, str):
            return ''

        # Strip HTML tags first so tag and attribute names never reach the
        # tokenizer as if they were words.
        plain = BeautifulSoup(text, "html.parser").get_text()
        plain = self._NOISE_RE.sub(' ', plain.lower())

        # Stop words are matched on the surface form, before lemmatization.
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in plain.split()
            if len(word) > 1 and word not in self.stop_words
        ]

        # dict.fromkeys drops duplicates while keeping first-seen order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Demo run: build a cleaner, process one sample sentence, and print
    # the cleaned result.
    demo_cleaner = TextCleaner()
    cleaned_sample = demo_cleaner.cleaning_text("I feel bullied online.")
    print(cleaned_sample)