#sentence_analyzer.py import re import logging from typing import List, Tuple from datetime import datetime import os import unicodedata import nltk # Download the Punkt tokenizer if not already downloaded nltk.download('punkt', quiet=True) from nltk.tokenize import sent_tokenize class SentenceAnalyzer: def __init__(self): self._setup_logger() # Sentence types and their corresponding flags self.SENTENCE_TYPES = ['exclamation', 'question', 'statement', 'ellipsis', 'quote', 'emphasis'] self.FLAGS = { 'exclamation': 'EXCL', 'question': 'QUES', 'statement': 'STMT', 'ellipsis': 'ELIP', 'quote': 'QUOT', 'emphasis': 'EMPH' } self.logger.info("SentenceAnalyzer initialized successfully") def _setup_logger(self): """Set up logging configuration.""" try: # Create logs directory if it doesn't exist os.makedirs('logs', exist_ok=True) # Get current date for log file name current_date = datetime.now().strftime('%Y-%m-%d') log_file = f'logs/sentence_analyzer_{current_date}.log' # Create and configure logger self.logger = logging.getLogger('SentenceAnalyzer') self.logger.setLevel(logging.DEBUG) # Set to DEBUG to capture all logs # Clear existing handlers to avoid duplicates if self.logger.handlers: self.logger.handlers.clear() # Create file handler file_handler = logging.FileHandler(log_file, encoding='utf-8') file_handler.setLevel(logging.DEBUG) # Create console handler console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) # Create formatter formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) file_handler.setFormatter(formatter) console_handler.setFormatter(formatter) # Add handlers to logger self.logger.addHandler(file_handler) self.logger.addHandler(console_handler) self.logger.debug("Logger set up successfully") except Exception as e: print(f"Error setting up logger: {str(e)}") raise def split_into_sentences(self, text: str) -> List[str]: """Split text into sentences using NLTK's sentence tokenizer.""" if not text: return [] self.logger.debug("Starting sentence splitting") # Normalize Unicode characters text = unicodedata.normalize('NFC', text) self.logger.debug("Normalized text using NFC") # Remove page numbers and chapter titles (common in PDFs) text = re.sub(r'Page \d+|Chapter \d+:.*', '', text) self.logger.debug("Removed page numbers and chapter titles") # Replace hyphenated line breaks with just the word text = re.sub(r'-\s+\n', '', text) text = re.sub(r'-\s+', '', text) self.logger.debug("Replaced hyphenated line breaks") # Replace multiple newlines and carriage returns with a space text = re.sub(r'[\r\n]+', ' ', text) self.logger.debug("Replaced multiple newlines with a space") # Replace multiple spaces with a single space text = re.sub(r'\s+', ' ', text).strip() self.logger.debug("Normalized whitespace") # Use NLTK's sent_tokenize to split into sentences sentences = sent_tokenize(text) self.logger.debug(f"Split text into {len(sentences)} sentences using NLTK") # Clean up sentences sentences = [sentence.strip() for sentence in sentences if sentence.strip()] self.logger.info(f"Split text into {len(sentences)} sentences after cleanup") return sentences def analyze_sentence(self, sentence: str) -> Tuple[str, str, str]: """Analyze a sentence and return its type, color (handled by CSS), and flag.""" if not sentence: return ('statement', '', self.FLAGS['statement']) sentence = sentence.strip() self.logger.debug(f"Analyzing sentence: '{sentence}'") # Function to check for complete quotes def has_complete_quote(text): quote_pairs = [ ('"', '"'), ("'", "'"), ('“', '”'), ('‘', '’'), ('«', '»') ] text = text.strip() for open_quote, close_quote in quote_pairs: if text.startswith(open_quote) and text.endswith(close_quote): # Ensure that the quotes are balanced if text.count(open_quote) == text.count(close_quote): self.logger.debug(f"Sentence starts and ends with matching quotes: {open_quote}{close_quote}") return True return False # Check if the entire sentence is enclosed in matching quotes if has_complete_quote(sentence): sent_type = 'quote' self.logger.debug("Sentence classified as 'quote'") # Check for emphasis elif re.search(r'\*[^*]+\*', sentence): sent_type = 'emphasis' self.logger.debug("Sentence classified as 'emphasis'") # Check regular sentence types elif sentence.endswith(('!', '!')): sent_type = 'exclamation' self.logger.debug("Sentence classified as 'exclamation'") elif sentence.endswith(('?', '?')): sent_type = 'question' self.logger.debug("Sentence classified as 'question'") elif sentence.endswith('…') or sentence.endswith('...'): sent_type = 'ellipsis' self.logger.debug("Sentence classified as 'ellipsis'") else: sent_type = 'statement' self.logger.debug("Sentence classified as 'statement'") color = '' # Color is now handled by CSS classes self.logger.debug(f"Sentence type: {sent_type}, Flag: {self.FLAGS[sent_type]}") return (sent_type, color, self.FLAGS[sent_type]) def clean_sentence(self, sentence: str) -> str: """Remove special characters from the sentence that might confuse TTS models.""" # Define the pattern to match unwanted special characters pattern = r'[^\w\s.,!?\'"“”‘’«»\-—()]' cleaned_sentence = re.sub(pattern, '', sentence) self.logger.debug(f"Cleaned sentence: '{cleaned_sentence}'") return cleaned_sentence def process_text_interactive(self, text: str) -> str: """Process the text and return HTML-formatted output with interactive sentences.""" self.logger.info("Starting interactive text processing") if not text: self.logger.warning("Empty text received") return '' try: # Normalize Unicode characters text = unicodedata.normalize('NFC', text) self.logger.debug("Normalized text using NFC in interactive processing") sentences = self.split_into_sentences(text) formatted_output = [] for index, sentence in enumerate(sentences, 1): sent_type, color, flag = self.analyze_sentence(sentence) # Updated HTML to include class for sentence type and data attribute for indexing formatted_sentence = f'''