import re
import string

import joblib
import nltk

# Download the stopwords corpus only if it is not already available
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords


def load_model(model_path):
    """
    Load a joblib model.

    Args:
        model_path (str): path to the model file

    Returns:
        model: the loaded model
    """
    model = joblib.load(model_path)
    return model


# Set of English stopwords
stop_words = set(stopwords.words('english'))


def preprocess_text(text: str) -> str:
    """Clean raw text before vectorization/prediction."""
    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Remove noise (URLs, emails, hashtags, mentions) while the
    # '#', '@', and '.' characters are still present, so the patterns can match
    text = re.sub(r'http\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\S+@\S+\.\S+', '', text)       # Emails
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)     # Hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # Mentions

    # Step 3: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Step 4: Remove numbers and non-printable characters
    text = re.sub(r'\d+', '', text)
    text = ''.join(ch for ch in text if ch.isprintable())

    # Step 5: Strip extra whitespace
    text = re.sub(r'\s+', ' ', text.strip())

    # Step 6: Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text
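

# A minimal usage sketch, assuming the saved artifact is a scikit-learn-style
# text classifier with a .predict() method that accepts a list of preprocessed
# strings; the path 'model.joblib' is a hypothetical placeholder.
if __name__ == '__main__':
    model = load_model('model.joblib')  # hypothetical path
    sample = "Check out https://example.com #NLP @user It costs 100 dollars!"
    cleaned = preprocess_text(sample)
    print(cleaned)                   # e.g. "check costs dollars"
    print(model.predict([cleaned]))  # assumes a sklearn-style predict()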