import re

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class Preprocessor:
    """
    Provides methods for text preprocessing, including basic text cleaning,
    tokenization, stopword removal, and lemmatization.
    """

    def __init__(self, nltk_resource="all", spacy_model="en_core_web_sm", language="english"):
        """
        Download the required NLTK resource and spaCy model if they are not
        already installed, then load them.

        :param nltk_resource: name of the NLTK resource to download
        :param spacy_model: name of the spaCy model to load
        :param language: language of the stopword list
        """
        # Download the NLTK resource only if it cannot already be found.
        # Note: "all" is approximated by checking that a corpora directory
        # exists at all; individual resources are looked up by name.
        try:
            if nltk_resource == "all":
                nltk.data.find("corpora")
            else:
                nltk.data.find(f"corpora/{nltk_resource}")
        except LookupError:
            nltk.download(nltk_resource)

        # Download the spaCy model only if it is not installed as a package.
        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model)
        self.stop_words = set(stopwords.words(language))

    def preprocess(self, text):
        """
        Clean, tokenize, remove stopwords from, and lemmatize the given text.

        :param text: the text to be preprocessed
        :return: the preprocessed text as a single space-separated string
        """
        # Break run-together structures apart: insert a newline after
        # "key:" patterns and between camelCase word boundaries.
        text = re.sub(r'(\w+):', r'\1:\n', text)
        text = re.sub(r'([a-z])([A-Z])', r'\1 \n\2', text)
        # Collapse all whitespace (including the newlines inserted above),
        # strip punctuation, and lowercase the text.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        # Tokenize and remove stopwords.
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_words]
        # Lemmatize the remaining tokens with spaCy.
        doc = self.nlp(" ".join(tokens))
        tokens = [token.lemma_ for token in doc]
        return " ".join(tokens)
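
# A minimal usage sketch (an illustrative assumption, not part of the original
# module; the sample sentence and expected output are hypothetical). With the
# default arguments, the constructor may download all NLTK resources and the
# en_core_web_sm spaCy model on first run.
if __name__ == "__main__":
    preprocessor = Preprocessor()
    sample = "The quick brown foxes were jumping over the lazy dogs!"
    print(preprocessor.preprocess(sample))
    # Expected output along the lines of: "quick brown fox jump lazy dog"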