M4xjunior committed on
Commit d06e453
1 Parent(s): 375ecba

Upload sentence_analyzer.py

Files changed (1)
  1. sentence_analyzer.py +253 -0
sentence_analyzer.py ADDED
# sentence_analyzer.py
import re
import logging
from typing import List, Tuple
from datetime import datetime
import os
import unicodedata
import nltk

# Download the Punkt tokenizer if not already downloaded
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

class SentenceAnalyzer:
    def __init__(self):
        self._setup_logger()

        # Sentence types and their corresponding flags
        self.SENTENCE_TYPES = ['exclamation', 'question', 'statement', 'ellipsis', 'quote', 'emphasis']
        self.FLAGS = {
            'exclamation': 'EXCL',
            'question': 'QUES',
            'statement': 'STMT',
            'ellipsis': 'ELIP',
            'quote': 'QUOT',
            'emphasis': 'EMPH'
        }

        self.logger.info("SentenceAnalyzer initialized successfully")

    def _setup_logger(self):
        """Set up logging configuration."""
        try:
            # Create logs directory if it doesn't exist
            os.makedirs('logs', exist_ok=True)

            # Get current date for log file name
            current_date = datetime.now().strftime('%Y-%m-%d')
            log_file = f'logs/sentence_analyzer_{current_date}.log'

            # Create and configure logger
            self.logger = logging.getLogger('SentenceAnalyzer')
            self.logger.setLevel(logging.DEBUG)  # Set to DEBUG to capture all logs

            # Clear existing handlers to avoid duplicates
            if self.logger.handlers:
                self.logger.handlers.clear()

            # Create file handler
            file_handler = logging.FileHandler(log_file, encoding='utf-8')
            file_handler.setLevel(logging.DEBUG)

            # Create console handler
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)

            # Create formatter
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            file_handler.setFormatter(formatter)
            console_handler.setFormatter(formatter)

            # Add handlers to logger
            self.logger.addHandler(file_handler)
            self.logger.addHandler(console_handler)

            self.logger.debug("Logger set up successfully")

        except Exception as e:
            print(f"Error setting up logger: {str(e)}")
            raise

    def split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences using NLTK's sentence tokenizer."""
        if not text:
            return []

        self.logger.debug("Starting sentence splitting")

        # Normalize Unicode characters
        text = unicodedata.normalize('NFC', text)
        self.logger.debug("Normalized text using NFC")

        # Remove page numbers and chapter titles (common in PDFs)
        text = re.sub(r'Page \d+|Chapter \d+:.*', '', text)
        self.logger.debug("Removed page numbers and chapter titles")

        # Replace hyphenated line breaks with just the word
        text = re.sub(r'-\s+\n', '', text)
        text = re.sub(r'-\s+', '', text)
        self.logger.debug("Replaced hyphenated line breaks")

        # Replace multiple newlines and carriage returns with a space
        text = re.sub(r'[\r\n]+', ' ', text)
        self.logger.debug("Replaced multiple newlines with a space")

        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text).strip()
        self.logger.debug("Normalized whitespace")

        # Use NLTK's sent_tokenize to split into sentences
        sentences = sent_tokenize(text)
        self.logger.debug(f"Split text into {len(sentences)} sentences using NLTK")

        # Clean up sentences
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        self.logger.info(f"Split text into {len(sentences)} sentences after cleanup")
        return sentences

    def analyze_sentence(self, sentence: str) -> Tuple[str, str, str]:
        """Analyze a sentence and return its type, color (handled by CSS), and flag."""
        if not sentence:
            return ('statement', '', self.FLAGS['statement'])

        sentence = sentence.strip()
        self.logger.debug(f"Analyzing sentence: '{sentence}'")

        # Function to check for complete quotes
        def has_complete_quote(text):
            quote_pairs = [
                ('"', '"'),
                ("'", "'"),
                ('“', '”'),
                ('‘', '’'),
                ('«', '»')
            ]
            text = text.strip()
            for open_quote, close_quote in quote_pairs:
                if text.startswith(open_quote) and text.endswith(close_quote):
                    # Ensure that the quotes are balanced
                    if text.count(open_quote) == text.count(close_quote):
                        self.logger.debug(f"Sentence starts and ends with matching quotes: {open_quote}{close_quote}")
                        return True
            return False

        # Check if the entire sentence is enclosed in matching quotes
        if has_complete_quote(sentence):
            sent_type = 'quote'
            self.logger.debug("Sentence classified as 'quote'")
        # Check for emphasis
        elif re.search(r'\*[^*]+\*', sentence):
            sent_type = 'emphasis'
            self.logger.debug("Sentence classified as 'emphasis'")
        # Check regular sentence types
        elif sentence.endswith('!'):
            sent_type = 'exclamation'
            self.logger.debug("Sentence classified as 'exclamation'")
        elif sentence.endswith('?'):
            sent_type = 'question'
            self.logger.debug("Sentence classified as 'question'")
        elif sentence.endswith('…') or sentence.endswith('...'):
            sent_type = 'ellipsis'
            self.logger.debug("Sentence classified as 'ellipsis'")
        else:
            sent_type = 'statement'
            self.logger.debug("Sentence classified as 'statement'")

        color = ''  # Color is now handled by CSS classes
        self.logger.debug(f"Sentence type: {sent_type}, Flag: {self.FLAGS[sent_type]}")
        return (sent_type, color, self.FLAGS[sent_type])

    def clean_sentence(self, sentence: str) -> str:
        """Remove special characters from the sentence that might confuse TTS models."""
        # Define the pattern to match unwanted special characters
        pattern = r'[^\w\s.,!?\'"“”‘’«»\-—()]'
        cleaned_sentence = re.sub(pattern, '', sentence)
        self.logger.debug(f"Cleaned sentence: '{cleaned_sentence}'")
        return cleaned_sentence

    def process_text_interactive(self, text: str) -> str:
        """Process the text and return HTML-formatted output with interactive sentences."""
        self.logger.info("Starting interactive text processing")

        if not text:
            self.logger.warning("Empty text received")
            return ''

        try:
            # Normalize Unicode characters
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in interactive processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                sent_type, color, flag = self.analyze_sentence(sentence)
                # Updated HTML to include class for sentence type and data attribute for indexing
                formatted_sentence = f'''
                <div class="sentence-row {sent_type}">
                    <div class="sentence-number">{index}.</div>
                    <div class="sentence-content">
                        {sentence}
                    </div>
                    <div class="sentence-type">{sent_type.capitalize()}</div>
                </div>
                '''
                formatted_output.append(formatted_sentence)
                self.logger.info(f"Processed sentence {index}/{len(sentences)} - Type: {sent_type}")
                self.logger.debug(f"Formatted HTML for sentence {index}: {formatted_sentence}")

            result = ''.join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'

    def prepare_text_for_tts(self, sentences: List[str]) -> str:
        """Prepare the text for TTS by cleaning special characters from each sentence."""
        cleaned_sentences = [self.clean_sentence(sentence) for sentence in sentences]
        tts_text = ' '.join(cleaned_sentences)
        self.logger.debug(f"Prepared text for TTS: '{tts_text}'")
        return tts_text

    def process_text(self, text: str) -> str:
        """Legacy method for non-interactive processing. Kept for compatibility."""
        self.logger.info("Starting text processing (legacy method)")

        if not text:
            self.logger.warning("Empty text received")
            return ""

        try:
            # Normalize Unicode characters
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in legacy processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                sent_type, _, flag = self.analyze_sentence(sentence)
                # Color is now handled by CSS classes
                formatted_sentence = (
                    f'<span class="{sent_type}" '
                    f'data-flag="{flag}" '
                    f'title="Sentence type: {sent_type}">'
                    f'{sentence}</span>'
                )
                formatted_output.append(formatted_sentence)
                self.logger.info(f"Processed sentence {index}/{len(sentences)} - Type: {sent_type}")
                self.logger.debug(f"Formatted HTML for sentence {index}: {formatted_sentence}")

            result = " ".join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'
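
# Illustrative usage sketch: running the module directly exercises the analyzer
# on a short, made-up passage. The sample text below is a hypothetical
# placeholder, not data that ships with the project.
if __name__ == "__main__":
    analyzer = SentenceAnalyzer()
    sample_text = (
        'She said, "We made it!" Did anyone really expect that? '
        "Probably not... The rest is *history*."
    )
    # Split, classify, and print each sentence with its flag
    sentences = analyzer.split_into_sentences(sample_text)
    for sentence in sentences:
        sent_type, _, flag = analyzer.analyze_sentence(sentence)
        print(f"[{flag}] {sentence}")
    # Produce TTS-ready text with unwanted special characters stripped
    print(analyzer.prepare_text_for_tts(sentences))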