# bertopic/text_preprocessor.py
import re
import sys
import pandas as pd
import spacy
import emoji
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
from pathlib import Path
from resource_path import resource_path
class MultilingualPreprocessor:
"""
A robust text preprocessor using spaCy for multilingual support.
"""
def __init__(self, language: str):
"""
Initializes the preprocessor and loads the appropriate spaCy model.
Args:
language (str): 'english' or 'multilingual'.
"""
model_map = {
'english': 'en_core_web_sm',
'multilingual': 'xx_ent_wiki_sm'
}
self.model_name = model_map.get(language, 'xx_ent_wiki_sm')
try:
# Check if running from PyInstaller bundle
if hasattr(sys, '_MEIPASS'):
# PyInstaller mode: load from bundled path
model_path_obj = Path(resource_path(self.model_name))
self.nlp = spacy.util.load_model_from_path(model_path_obj)
else:
# Normal development mode: load by model name
self.nlp = spacy.load(self.model_name)
        except OSError as e:
            print(f"spaCy Model Error: Could not load model '{self.model_name}': {e}")
            print(f"Please run: python -m spacy download {self.model_name}")
raise
        # Customize the tokenizer so it does not split words on internal hyphens
        # (e.g. "state-of-the-art"): spaCy's default infix rules minus the letter-hyphen-letter pattern.
        infixes = LIST_ELLIPSES + LIST_ICONS + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer
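        # Illustration (assumes the default tokenizer rules of the loaded model):
        # [t.text for t in self.nlp("a state-of-the-art method")] is expected to
        # yield ["a", "state-of-the-art", "method"] instead of splitting on "-".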
    def preprocess_series(self, text_series: pd.Series, options: dict, n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.
        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): Preprocessing options. Recognized keys: "remove_html",
                "remove_urls", "handle_hashtags", "handle_mentions", "handle_emojis",
                "remove_punctuation", "remove_numbers", "remove_stopwords",
                "custom_stopwords", "lemmatize", "lowercase", "remove_special_chars".
            n_process_spacy (int): Number of processes for spaCy's nlp.pipe;
                -1 uses all available cores.
        Returns:
            pd.Series: The cleaned text Series.
        """
# --- Stage 1: Fast, Regex-based cleaning (combined for performance) ---
processed_text = text_series.copy().astype(str)
# Combine all regex patterns into a single pass for better performance
regex_patterns = []
if options.get("remove_html"):
regex_patterns.append(r"<.*?>")
if options.get("remove_urls"):
regex_patterns.append(r"http\S+|www\.\S+")
if options.get("handle_hashtags") == "Remove Hashtags":
regex_patterns.append(r"#\w+")
if options.get("handle_mentions") == "Remove Mentions":
regex_patterns.append(r"@\w+")
# Apply all regex replacements in a single pass
if regex_patterns:
combined_pattern = "|".join(regex_patterns)
processed_text = processed_text.str.replace(combined_pattern, "", regex=True)
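        # Illustration: with all four options above enabled, the combined pattern
        # is r"<.*?>|http\S+|www\.\S+|#\w+|@\w+", applied in one vectorized pass.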
# Emoji handling (separate as it needs special library)
emoji_option = options.get("handle_emojis", "Keep Emojis")
if emoji_option == "Remove Emojis":
processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace=''))
elif emoji_option == "Convert Emojis to Text":
processed_text = processed_text.apply(emoji.demojize)
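        # Illustration: emoji.demojize("I love 🍕") -> "I love :pizza:", while
        # emoji.replace_emoji(..., replace='') drops the emoji characters entirely.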
# --- Stage 2: spaCy-based advanced processing ---
# Using nlp.pipe for efficiency on a Series
cleaned_docs = []
docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500)
        # Get custom stopwords as a lowercase set for fast lookups
        custom_stopwords = {word.lower() for word in options.get("custom_stopwords", [])}
for doc in docs:
tokens = []
for token in doc:
# Punctuation and Number handling
if options.get("remove_punctuation") and token.is_punct:
continue
if options.get("remove_numbers") and (token.is_digit or token.like_num):
continue
# Stopword handling (including custom stopwords)
is_stopword = token.is_stop or token.text.lower() in custom_stopwords
if options.get("remove_stopwords") and is_stopword:
continue
# Use lemma if lemmatization is on, otherwise use the original text
token_text = token.lemma_ if options.get("lemmatize") else token.text
                # Lowercasing (Unicode-aware via str.lower())
if options.get("lowercase"):
token_text = token_text.lower()
# Remove any leftover special characters or whitespace
if options.get("remove_special_chars"):
token_text = re.sub(r'[^\w\s-]', '', token_text)
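                # Illustration: re.sub(r'[^\w\s-]', '', "it's") -> "its"; letters,
                # digits, underscores, whitespace and hyphens are kept.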
if token_text.strip():
tokens.append(token_text.strip())
cleaned_docs.append(" ".join(tokens))
return pd.Series(cleaned_docs, index=text_series.index)
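

# --- Usage sketch (illustrative only, not part of the original module) ---
# Assumes the en_core_web_sm model is installed and shows one plausible options
# dict; every key below is read via options.get(...) above, so any key may be
# omitted. n_process_spacy=1 keeps the example single-process.
if __name__ == "__main__":
    sample = pd.Series([
        "Check out https://example.com <b>now</b>! #topics @user 😂",
        "State-of-the-art topic models for multilingual text.",
    ])
    preprocessor = MultilingualPreprocessor("english")
    options = {
        "remove_html": True,
        "remove_urls": True,
        "handle_hashtags": "Remove Hashtags",
        "handle_mentions": "Remove Mentions",
        "handle_emojis": "Remove Emojis",
        "remove_punctuation": True,
        "remove_numbers": True,
        "remove_stopwords": True,
        "custom_stopwords": ["check"],
        "lemmatize": True,
        "lowercase": True,
        "remove_special_chars": True,
    }
    print(preprocessor.preprocess_series(sample, options, n_process_spacy=1))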