import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove URLs (http/https/www)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Strip HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Drop English stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Lemmatize each remaining token (WordNetLemmatizer defaults to noun POS)
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text
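
# Example usage, as a minimal sketch: the sample sentence and the expected output
# below are illustrative, not from the original source. The NLTK 'stopwords' and
# 'wordnet' corpora must be downloaded once before the stopword set and the
# lemmatizer can be built:
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('wordnet')
if __name__ == '__main__':
    sample = "Check out <b>this</b> article at https://example.com! Cats are running faster than dogs."
    print(preprocess_text(sample))
    # Prints something like: "check article cat running faster dog"
    # ("running" keeps its form because the lemmatizer treats tokens as nouns by default)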