import re
import string
import sys
from pathlib import Path

import pandas as pd
import spacy
import emoji
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from resource_path import resource_path

class MultilingualPreprocessor:
    """
    A robust text preprocessor using spaCy for multilingual support.
    """

    def __init__(self, language: str):
        """
        Initializes the preprocessor and loads the appropriate spaCy model.

        Args:
            language (str): 'english' or 'multilingual'.
        """
        model_map = {
            'english': 'en_core_web_sm',
            'multilingual': 'xx_ent_wiki_sm'
        }
        self.model_name = model_map.get(language, 'xx_ent_wiki_sm')
        try:
            # Check if running from a PyInstaller bundle
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller mode: load the model from the bundled path
                model_path_obj = Path(resource_path(self.model_name))
                self.nlp = spacy.util.load_model_from_path(model_path_obj)
            else:
                # Normal development mode: load the model by name
                self.nlp = spacy.load(self.model_name)
        except OSError as e:
            print(f"spaCy Model Error: Could not load model '{self.model_name}' ({e})")
            print(f"Please run: python -m spacy download {self.model_name}")
            raise

        # Customize the tokenizer so it does not split words on hyphens:
        # only ellipses, icons, and concatenated quotes remain as infix patterns
        # (CONCAT_QUOTES is a string, so it must be wrapped in a list).
        infixes = LIST_ELLIPSES + LIST_ICONS + [CONCAT_QUOTES]
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer
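        # Illustrative note: with this reduced infix set, a hyphenated word such
        # as "state-of-the-art" is expected to stay a single token rather than
        # being split at the hyphens (assumed behaviour, not verified per model).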

    def preprocess_series(self, text_series: pd.Series, options: dict, n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.

        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): A dictionary of preprocessing options.
            n_process_spacy (int): Number of processes for spaCy's nlp.pipe
                (-1 uses all available cores).

        Returns:
            pd.Series: The cleaned text Series.
        """
        # --- Stage 1: Fast, regex-based cleaning (combined for performance) ---
        processed_text = text_series.copy().astype(str)

        # Combine all regex patterns into a single pass for better performance
        regex_patterns = []
        if options.get("remove_html"):
            regex_patterns.append(r"<.*?>")
        if options.get("remove_urls"):
            regex_patterns.append(r"http\S+|www\.\S+")
        if options.get("handle_hashtags") == "Remove Hashtags":
            regex_patterns.append(r"#\w+")
        if options.get("handle_mentions") == "Remove Mentions":
            regex_patterns.append(r"@\w+")

        # Apply all regex replacements in a single pass
        if regex_patterns:
            combined_pattern = "|".join(regex_patterns)
            processed_text = processed_text.str.replace(combined_pattern, "", regex=True)
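            # Illustrative note: with HTML and URL removal enabled, the combined
            # pattern is r"<.*?>|http\S+|www\.\S+", applied to the Series in one pass.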

        # Emoji handling (separate step because it needs the emoji library)
        emoji_option = options.get("handle_emojis", "Keep Emojis")
        if emoji_option == "Remove Emojis":
            processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace=''))
        elif emoji_option == "Convert Emojis to Text":
            processed_text = processed_text.apply(emoji.demojize)
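        # Illustrative note (assumption about the emoji library's default aliases):
        # emoji.demojize("Nice 👍") would yield "Nice :thumbs_up:".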

        # --- Stage 2: spaCy-based advanced processing ---
        # Using nlp.pipe for efficiency on a Series
        cleaned_docs = []
        docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500)

        # Get custom stopwords as a lowercase set for fast lookups
        custom_stopwords = {word.lower() for word in options.get("custom_stopwords", [])}
        for doc in docs:
            tokens = []
            for token in doc:
                # Punctuation and number handling
                if options.get("remove_punctuation") and token.is_punct:
                    continue
                if options.get("remove_numbers") and (token.is_digit or token.like_num):
                    continue

                # Stopword handling (including custom stopwords)
                is_stopword = token.is_stop or token.text.lower() in custom_stopwords
                if options.get("remove_stopwords") and is_stopword:
                    continue

                # Use the lemma if lemmatization is on, otherwise the original text
                token_text = token.lemma_ if options.get("lemmatize") else token.text

                # Lowercasing (language-aware)
                if options.get("lowercase"):
                    token_text = token_text.lower()

                # Remove any leftover special characters or whitespace
                if options.get("remove_special_chars"):
                    token_text = re.sub(r'[^\w\s-]', '', token_text)

                if token_text.strip():
                    tokens.append(token_text.strip())

            cleaned_docs.append(" ".join(tokens))

        return pd.Series(cleaned_docs, index=text_series.index)
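

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumption: the en_core_web_sm model is installed and
# the sample texts/option values below are purely illustrative; the option
# keys themselves come from preprocess_series above).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.Series([
        "Check out https://example.com! #NLP @someone 🚀",
        "<p>HTML text with 123 numbers and state-of-the-art hyphens.</p>",
    ])
    options = {
        "remove_html": True,
        "remove_urls": True,
        "handle_hashtags": "Remove Hashtags",
        "handle_mentions": "Remove Mentions",
        "handle_emojis": "Convert Emojis to Text",
        "remove_punctuation": True,
        "remove_numbers": True,
        "remove_stopwords": True,
        "custom_stopwords": ["example"],
        "lemmatize": True,
        "lowercase": True,
        "remove_special_chars": True,
    }
    preprocessor = MultilingualPreprocessor("english")
    # n_process_spacy=1 keeps the example single-process for portability
    cleaned = preprocessor.preprocess_series(sample, options, n_process_spacy=1)
    print(cleaned)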