MindfulMedia_Mentor / clean_text_model.py
jaelin215's picture
Upload 14 files
bd9870c verified
raw history blame
No virus
947 Bytes
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
class TextCleaner:
    """Reduce raw text to a de-duplicated string of lemmatized keywords.

    The pipeline strips HTML markup, lower-cases the text, blanks out URLs,
    @mentions, #hashtags and every non-letter character, drops English stop
    words and single-character tokens, lemmatizes the remaining words, and
    finally removes repeated words while keeping first-occurrence order.
    """

    # Matches, in priority order: http(s) URLs, www.* URLs, @mentions,
    # #hashtags (the tag word is removed too), and any single character that
    # is not an ASCII letter. Every match is replaced by one space.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Load the NLTK English stop-word list and a WordNet lemmatizer."""
        # NOTE(review): this installs a global "ignore" filter, so every
        # warning in the process is silenced once an instance is created.
        # Kept as-is because callers may rely on it; consider scoping it
        # with warnings.catch_warnings() around the parser call instead.
        warnings.filterwarnings("ignore")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return the cleaned, space-separated keywords found in *text*.

        Args:
            text: Raw input, possibly containing HTML, URLs, mentions and
                hashtags.

        Returns:
            A string of unique lemmatized words in first-occurrence order,
            or ``''`` when *text* is empty or not a ``str``.
        """
        if not text or not isinstance(text, str):
            return ''

        # Use an explicit separator: without it the text of adjacent
        # elements is glued together ("<p>a</p><p>b</p>" -> "ab"), which
        # fuses unrelated words into a single bogus token.
        plain = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        plain = self._NOISE_PATTERN.sub(' ', plain.lower())

        # Stop words are filtered on the surface form, before lemmatizing.
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in plain.split()
            if len(word) > 1 and word not in self.stop_words
        ]

        # dict.fromkeys de-duplicates while preserving insertion order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Demo: run one sample sentence through the cleaner and print the result.
    sample_sentence = "I feel bullied online."
    print(TextCleaner().cleaning_text(sample_sentence))