# MindfulMedia_Mentor / clean_text_model.py
# jaelin215's picture
# added downloading word net
# ba852a7 verified
# raw
# history blame
# 1.03 kB
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk
class TextCleaner:
    """Reduce raw (possibly HTML) text to a space-separated string of unique lemmas.

    Construction downloads the NLTK ``stopwords`` and ``wordnet`` resources,
    loads the English stop-word list into ``self.stop_words`` and creates a
    ``WordNetLemmatizer`` in ``self.lemmatizer``.
    """

    # Single pattern that blanks out URLs, @mentions, #hashtags and every
    # character that is not an ASCII letter. Compiled once for all calls.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Fetch the required NLTK data and build the stop-word set and lemmatizer."""
        # NOTE(review): this silences *all* warnings for the whole process,
        # not just this class. Presumably added to hide BeautifulSoup's
        # "markup resembles a locator" warning on plain-text input; kept
        # as-is for backward compatibility -- confirm before narrowing.
        warnings.filterwarnings("ignore")
        # nltk.download is invoked on every construction.
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return *text* cleaned for downstream modelling.

        Steps, in order:
          1. strip HTML markup,
          2. lowercase and replace URLs, @mentions, #hashtags and any
             non-letter character with a space,
          3. drop one-character tokens and stop words, lemmatize the rest,
          4. remove repeated words, keeping the first occurrence's position.

        Args:
            text: The raw input. Anything that is not a non-empty ``str``
                yields an empty result.

        Returns:
            The cleaned, space-joined string, or ``''`` for empty or
            non-string input.
        """
        if not text or not isinstance(text, str):
            return ''

        # separator=" " keeps words from adjacent tags apart; without it
        # "<p>foo</p><p>bar</p>" collapses to "foobar". Any extra whitespace
        # this introduces is swallowed by the split() below.
        plain = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        letters_only = self._NOISE_PATTERN.sub(' ', plain.lower())

        lemmas = [
            self.lemmatizer.lemmatize(token)
            for token in letters_only.split()
            if len(token) > 1 and token not in self.stop_words
        ]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Quick demonstration: build a cleaner and print the cleaned form of a
    # sample sentence.
    sample_sentence = "I feel bullied online."
    text_cleaner = TextCleaner()
    cleaned_sentence = text_cleaner.cleaning_text(sample_sentence)
    print(cleaned_sentence)