import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time setup: nltk.download("stopwords"), nltk.download("wordnet"),
# and nltk.download("punkt") must have been run for the imports below to work.


class TextCleaner:
    """Normalizes raw text: lowercasing, stopword/punctuation removal, lemmatization."""

    def __init__(self, raw_text: str):
        # English stopwords plus single-character punctuation tokens.
        self.stopwords_set = set(stopwords.words("english") + list(string.punctuation))
        self.lemmatizer = WordNetLemmatizer()
        self.raw_input_text = raw_text

    def clean_text(self) -> str:
        # Lowercase before tokenizing so stopword matching is case-insensitive.
        tokens = word_tokenize(self.raw_input_text.lower())
        # Drop stopwords and punctuation tokens.
        tokens = [token for token in tokens if token not in self.stopwords_set]
        # Reduce each token to its base form (WordNet defaults to noun lemmas).
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        return " ".join(tokens)
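
# A minimal usage sketch, assuming the NLTK corpora above are installed;
# the sample sentence and expected output are illustrative.
if __name__ == "__main__":
    cleaner = TextCleaner("The striped bats were hanging on their feet.")
    print(cleaner.clean_text())  # e.g. "striped bat hanging foot"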