participatory-planner/app/utils/text_processor.py
"""
Text processing utilities for sentence-level categorization.
Handles sentence segmentation and text cleaning.
"""
import re
from typing import List
import logging
logger = logging.getLogger(__name__)

class TextProcessor:
    """Handle sentence segmentation and text processing."""
    @staticmethod
    def segment_into_sentences(text: str) -> List[str]:
        """
        Break text into sentences using multiple strategies.

        Strategies:
            1. NLTK punkt tokenizer (primary)
            2. Regex-based fallback
            3. Minimum-length filtering (at least 3 words per sentence)

        Args:
            text: Input text to segment

        Returns:
            List of sentences
        """
        # Clean text
        text = text.strip()
        if not text:
            return []

        # Try NLTK first (better accuracy)
        try:
            import nltk
            try:
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
            except LookupError:
                # Download punkt if the tokenizer data is not available
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download('punkt', quiet=True)
                from nltk.tokenize import sent_tokenize
                sentences = sent_tokenize(text)
        except Exception as e:
            # Fallback: regex-based segmentation
            logger.warning(f"NLTK tokenization failed ({e}), using regex fallback")
            sentences = TextProcessor._regex_segmentation(text)

        # Clean and filter out empty strings
        sentences = [s.strip() for s in sentences if s.strip()]

        # Filter out very short "sentences" (likely not meaningful):
        # require at least 3 words
        sentences = [s for s in sentences if len(s.split()) >= 3]

        return sentences
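
    # Illustrative example (not part of the original module): with punkt
    # available, a short comment splits on sentence boundaries, and every
    # piece here survives the 3-word filter:
    #
    #   TextProcessor.segment_into_sentences(
    #       "We need more parks. Traffic is bad! What about bike lanes?")
    #   -> ['We need more parks.', 'Traffic is bad!', 'What about bike lanes?']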
    @staticmethod
    def _regex_segmentation(text: str) -> List[str]:
        """
        Fallback sentence segmentation using regex.
        This is less accurate than NLTK but works without extra dependencies.
        """
        # Split after ., !, or ? when followed by whitespace and a capital
        # letter, or when at the end of the string
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
        return [s.strip() for s in sentences if s.strip()]
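
    # A sketch of where the fallback diverges from punkt (illustrative): the
    # regex treats any period before a capitalized word as a boundary, so
    # abbreviations get over-split, while punkt typically keeps them intact:
    #
    #   TextProcessor._regex_segmentation("Dr. Smith approved the plan.")
    #   -> ['Dr.', 'Smith approved the plan.']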
    @staticmethod
    def is_valid_sentence(sentence: str) -> bool:
        """
        Check if a sentence is valid for categorization.

        Args:
            sentence: Input sentence

        Returns:
            True if valid, False otherwise
        """
        # Must have at least 3 words
        if len(sentence.split()) < 3:
            return False

        # Must have some alphabetic characters
        if not any(c.isalpha() for c in sentence):
            return False

        # Not just a list item or fragment
        stripped = sentence.strip()
        if stripped.startswith(('-', '•', '*')):
            # Allow if it has substantial text after the bullet
            if len(stripped[1:].strip().split()) < 3:
                return False

        return True
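
    # Illustrative checks (not in the original file):
    #
    #   TextProcessor.is_valid_sentence("The city needs housing.")  -> True
    #   TextProcessor.is_valid_sentence("Yes.")          -> False (too short)
    #   TextProcessor.is_valid_sentence("- short note")  -> False (bullet fragment)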
    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """
        Clean a sentence for processing.

        Args:
            sentence: Input sentence

        Returns:
            Cleaned sentence
        """
        # Remove leading bullet points or list numbers (e.g. "- ", "• ", "1. ")
        sentence = re.sub(r'^[\s\-•*\d.]+\s*', '', sentence)

        # Normalize whitespace
        sentence = ' '.join(sentence.split())

        # Ensure it ends with punctuation
        if sentence and sentence[-1] not in '.!?':
            sentence += '.'

        return sentence.strip()
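
    # Example transformations (illustrative): list markers are stripped,
    # whitespace is collapsed, and terminal punctuation is added:
    #
    #   TextProcessor.clean_sentence("• 1. Improve transit access")
    #   -> 'Improve transit access.'
    #   TextProcessor.clean_sentence("-  Fix   the roads")
    #   -> 'Fix the roads.'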
    @staticmethod
    def segment_and_clean(text: str) -> List[str]:
        """
        Segment text into sentences and clean them.
        This is the main entry point for text processing.

        Args:
            text: Input text

        Returns:
            List of cleaned, valid sentences
        """
        # Segment
        sentences = TextProcessor.segment_into_sentences(text)

        # Clean and filter
        result = []
        for sentence in sentences:
            cleaned = TextProcessor.clean_sentence(sentence)
            if TextProcessor.is_valid_sentence(cleaned):
                result.append(cleaned)

        return result
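
    # End-to-end sketch (illustrative): short fragments are dropped and the
    # survivors come back cleaned:
    #
    #   TextProcessor.segment_and_clean(
    #       "I love the new park. It's great. We need more like it downtown.")
    #   -> ['I love the new park.', 'We need more like it downtown.']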
    @staticmethod
    def get_sentence_count_estimate(text: str) -> int:
        """
        Quick estimate of sentence count without full processing.

        Args:
            text: Input text

        Returns:
            Estimated sentence count
        """
        if not text.strip():
            return 0

        # Count sentence-ending punctuation; assume at least 1 sentence
        # for any non-empty text
        count = text.count('.') + text.count('!') + text.count('?')
        return max(1, count)
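

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original module).
    # Runs the full pipeline on a sample piece of participant feedback; works
    # with either the NLTK path or the regex fallback.
    sample = (
        "- More bike lanes downtown would help. "
        "Traffic is terrible! Fix it. "
        "What about adding a park near the river?"
    )
    print("Estimated sentences:", TextProcessor.get_sentence_count_estimate(sample))
    for sentence in TextProcessor.segment_and_clean(sample):
        print("-", sentence)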