participatory-planner/app/utils/text_processor.py
"""
Text processing utilities for sentence-level categorization.
Handles sentence segmentation and text cleaning.
"""
import re
from typing import List
import logging
logger = logging.getLogger(__name__)

class TextProcessor:
    """Handle sentence segmentation and text processing."""
    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
            1. NLTK punkt tokenizer (primary)
            2. Regex-based fallback
            3. Minimum-length filtering (at least 3 words per sentence)

        Args:
            text: Input text to segment

        Returns:
            List of sentences
        """
        # Clean text
        text = text.strip()
        if not text:
            return []

        # Try NLTK first (better accuracy)
        try:
            import nltk
            try:
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
            except LookupError:
                # Download punkt if the tokenizer data is not available
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download('punkt', quiet=True)
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
        except Exception as e:
            # Fallback: regex-based segmentation
            logger.warning(f"NLTK tokenization failed ({e}), using regex fallback")
            sentences = TextProcessor._regex_segmentation(text)

        # Clean and filter out empty strings
        sentences = [s.strip() for s in sentences if s.strip()]

        # Filter out very short "sentences" (likely not meaningful):
        # require at least 3 words
        sentences = [s for s in sentences if len(s.split()) >= 3]

        return sentences
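
    # Illustrative example (not part of the original module): with punkt
    # available, a short comment splits on sentence boundaries, and every
    # piece here survives the 3-word filter:
    #
    #   TextProcessor.segment_into_sentences(
    #       "We need more parks. Traffic is bad! What about bike lanes?")
    #   -> ['We need more parks.', 'Traffic is bad!', 'What about bike lanes?']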
    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.
        This is less accurate than NLTK but works without extra dependencies.
        """
        # Split after ., !, or ? when followed by whitespace and a capital
        # letter, or when at the end of the string
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
        return [s.strip() for s in sentences if s.strip()]
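
    # A sketch of where the fallback diverges from punkt (illustrative): the
    # regex treats any period before a capitalized word as a boundary, so
    # abbreviations get over-split, while punkt typically keeps them intact:
    #
    #   TextProcessor._regex_segmentation("Dr. Smith approved the plan.")
    #   -> ['Dr.', 'Smith approved the plan.']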
    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if a sentence is valid for categorization.

        Args:
            sentence: Input sentence

        Returns:
            True if valid, False otherwise
        """
        # Must have at least 3 words
        if len(sentence.split()) < 3:
            return False

        # Must have some alphabetic characters
        if not any(c.isalpha() for c in sentence):
            return False

        # Not just a list item or fragment
        stripped = sentence.strip()
        if stripped.startswith(('-', '•', '*')):
            # Allow if it has substantial text after the bullet
            if len(stripped[1:].strip().split()) < 3:
                return False

        return True
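
    # Illustrative checks (not in the original file):
    #
    #   TextProcessor.is_valid_sentence("The city needs housing.")  -> True
    #   TextProcessor.is_valid_sentence("Yes.")          -> False (too short)
    #   TextProcessor.is_valid_sentence("- short note")  -> False (bullet fragment)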
    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Args:
            sentence: Input sentence

        Returns:
            Cleaned sentence
        """
        # Remove leading bullet points or list numbers (e.g. "- ", "• ", "1. ")
        sentence = re.sub(r'^[\s\-•*\d.]+\s*', '', sentence)

        # Normalize whitespace
        sentence = ' '.join(sentence.split())

        # Ensure it ends with punctuation
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'

        return sentence.strip()
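
    # Example transformations (illustrative): list markers are stripped,
    # whitespace is collapsed, and terminal punctuation is added:
    #
    #   TextProcessor.clean_sentence("• 1. Improve transit access")
    #   -> 'Improve transit access.'
    #   TextProcessor.clean_sentence("-  Fix   the roads")
    #   -> 'Fix the roads.'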
    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.
        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        # Segment
        sentences = TextProcessor.segment_into_sentences(text)

        # Clean and filter
        result = []
        for sentence in sentences:
            cleaned = TextProcessor.clean_sentence(sentence)
            if TextProcessor.is_valid_sentence(cleaned):
                result.append(cleaned)

        return result
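
    # End-to-end sketch (illustrative): short fragments are dropped and the
    # survivors come back cleaned:
    #
    #   TextProcessor.segment_and_clean(
    #       "I love the new park. It's great. We need more like it downtown.")
    #   -> ['I love the new park.', 'We need more like it downtown.']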
    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Args:
            text: Input text

        Returns:
            Estimated sentence count
        """
        if not text.strip():
            return 0

        # Count sentence-ending punctuation; assume at least 1 sentence
        # for any non-empty text
        count = text.count('.') + text.count('!') + text.count('?')
        return max(1, count)
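

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original module).
    # Runs the full pipeline on a sample piece of participant feedback; works
    # with either the NLTK path or the regex fallback.
    sample = (
        "- More bike lanes downtown would help. "
        "Traffic is terrible! Fix it. "
        "What about adding a park near the river?"
    )
    print("Estimated sentences:", TextProcessor.get_sentence_count_estimate(sample))
    for sentence in TextProcessor.segment_and_clean(sample):
        print("-", sentence)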