Spaces:
Sleeping
Sleeping
| """ | |
| Text processing utilities for sentence-level categorization. | |
| Handles sentence segmentation and text cleaning. | |
| """ | |
| import re | |
| from typing import List | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class TextProcessor:
    """Sentence segmentation and cleaning helpers.

    Every method is a ``staticmethod``, so the helpers can be called either on
    the class (``TextProcessor.clean_sentence(...)``) or on an instance.
    """

    # Minimum number of whitespace-separated words a sentence must contain.
    _MIN_WORDS = 3

    # Characters that terminate a sentence.
    _TERMINATORS = ('.', '!', '?')

    # Closing quotes/brackets that may legitimately follow a terminator.
    _CLOSERS = '"\')]}”’»'

    # Characters recognised as list bullets.
    _BULLETS = ('-', '•', '*')

    # Boundary = whitespace between a terminator and an ASCII capital letter,
    # or the very end of the text right after a terminator.
    _SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$')

    # Leading list markers: bullets, or enumerators such as "1." / "2)" that
    # are followed by whitespace.  Bare numbers ("3 apples", "3.14") are kept.
    _LEADING_MARKER_RE = re.compile(r'^\s*(?:(?:[-•*]|\d+[.)](?=\s))\s*)+')

    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
            1. NLTK punkt tokenizer (primary)
            2. Regex-based fallback when NLTK is missing or fails
            3. Minimum length constraint (at least 3 words per sentence)

        Args:
            text: Input text to segment (``None`` is treated as empty)

        Returns:
            List of stripped sentences; empty list for empty input
        """
        if not text:
            return []
        text = text.strip()
        if not text:
            return []

        try:
            sentences = TextProcessor._nltk_segmentation(text)
        except Exception as e:
            # Deliberate best-effort: any NLTK problem (not installed, model
            # missing, download failure) degrades to the regex splitter.
            logger.warning("NLTK tokenization failed (%s), using regex fallback", e)
            sentences = TextProcessor._regex_segmentation(text)

        # Empty strings have zero words, so the word filter drops them too.
        stripped = (s.strip() for s in sentences)
        return [s for s in stripped if len(s.split()) >= TextProcessor._MIN_WORDS]

    @staticmethod
    def _nltk_segmentation(text: str) -> List[str]:
        """
        Segment text with NLTK, downloading the punkt model on first use.

        Args:
            text: Non-empty input text

        Returns:
            List of sentences as produced by ``nltk.sent_tokenize``

        Raises:
            ImportError: If NLTK is not installed
            LookupError: If the model is still unavailable after downloading
        """
        # Imported lazily so the module works without NLTK installed.
        import nltk
        from nltk.tokenize import sent_tokenize

        try:
            return sent_tokenize(text)
        except LookupError:
            logger.info("Downloading NLTK punkt tokenizer...")
            # Newer NLTK releases look up 'punkt_tab', older ones 'punkt';
            # fetch both so the retry succeeds on either.
            for resource in ('punkt_tab', 'punkt'):
                nltk.download(resource, quiet=True)
            return sent_tokenize(text)

    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.

        Splits after '.', '!' or '?' when followed by whitespace and an ASCII
        capital letter. Less accurate than NLTK but works without dependencies.

        Args:
            text: Input text

        Returns:
            List of non-empty, stripped sentences
        """
        pieces = TextProcessor._SENTENCE_BOUNDARY_RE.split(text)
        return [p.strip() for p in pieces if p.strip()]

    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if sentence is valid for categorization.

        Args:
            sentence: Input sentence (``None`` is treated as invalid)

        Returns:
            True if the sentence has at least 3 words, contains a letter and
            is not a short bullet fragment; False otherwise
        """
        if not sentence:
            return False

        if len(sentence.split()) < TextProcessor._MIN_WORDS:
            return False

        if not any(c.isalpha() for c in sentence):
            return False

        # A bulleted line only counts if enough text follows the bullet.
        stripped = sentence.strip()
        if stripped.startswith(TextProcessor._BULLETS):
            if len(stripped[1:].split()) < TextProcessor._MIN_WORDS:
                return False

        return True

    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Removes leading bullets / list enumerators, collapses whitespace and
        makes sure the sentence ends with '.', '!' or '?'. Leading numbers
        that are part of the sentence ("3 apples ...") are preserved.

        Args:
            sentence: Input sentence (``None`` is treated as empty)

        Returns:
            Cleaned sentence, or an empty string if nothing remains
        """
        if not sentence:
            return ''

        sentence = TextProcessor._LEADING_MARKER_RE.sub('', sentence, count=1)

        # Normalize whitespace (also trims both ends).
        sentence = ' '.join(sentence.split())

        # Look past closing quotes/brackets so '"Go."' does not become '"Go.".'
        body = sentence.rstrip(TextProcessor._CLOSERS)
        if sentence and not body.endswith(TextProcessor._TERMINATORS):
            sentence += '.'

        return sentence

    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.

        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        cleaned = (
            TextProcessor.clean_sentence(s)
            for s in TextProcessor.segment_into_sentences(text)
        )
        return [s for s in cleaned if TextProcessor.is_valid_sentence(s)]

    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Counts sentence-ending punctuation, so abbreviations, decimals and
        ellipses inflate the result.

        Args:
            text: Input text

        Returns:
            0 for empty or whitespace-only text, otherwise at least 1
        """
        if not text or not text.strip():
            return 0

        count = sum(text.count(mark) for mark in TextProcessor._TERMINATORS)
        return max(1, count)