M4xjunior committed on
Commit d06e453
1 Parent(s): 375ecba

Upload sentence_analyzer.py

Files changed (1)
  1. sentence_analyzer.py +253 -0
sentence_analyzer.py ADDED
# sentence_analyzer.py
import re
import logging
from typing import List, Tuple
from datetime import datetime
import os
import unicodedata
import nltk

# Download the Punkt tokenizer if not already downloaded
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

class SentenceAnalyzer:
    def __init__(self):
        self._setup_logger()

        # Sentence types and their corresponding flags
        self.SENTENCE_TYPES = ['exclamation', 'question', 'statement', 'ellipsis', 'quote', 'emphasis']
        self.FLAGS = {
            'exclamation': 'EXCL',
            'question': 'QUES',
            'statement': 'STMT',
            'ellipsis': 'ELIP',
            'quote': 'QUOT',
            'emphasis': 'EMPH'
        }

        self.logger.info("SentenceAnalyzer initialized successfully")

    def _setup_logger(self):
        """Set up logging configuration."""
        try:
            # Create logs directory if it doesn't exist
            os.makedirs('logs', exist_ok=True)

            # Get current date for log file name
            current_date = datetime.now().strftime('%Y-%m-%d')
            log_file = f'logs/sentence_analyzer_{current_date}.log'

            # Create and configure logger
            self.logger = logging.getLogger('SentenceAnalyzer')
            self.logger.setLevel(logging.DEBUG)  # Set to DEBUG to capture all logs

            # Clear existing handlers to avoid duplicates
            if self.logger.handlers:
                self.logger.handlers.clear()

            # Create file handler
            file_handler = logging.FileHandler(log_file, encoding='utf-8')
            file_handler.setLevel(logging.DEBUG)

            # Create console handler
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)

            # Create formatter
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            file_handler.setFormatter(formatter)
            console_handler.setFormatter(formatter)

            # Add handlers to logger
            self.logger.addHandler(file_handler)
            self.logger.addHandler(console_handler)

            self.logger.debug("Logger set up successfully")

        except Exception as e:
            print(f"Error setting up logger: {str(e)}")
            raise

    def split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences using NLTK's sentence tokenizer."""
        if not text:
            return []

        self.logger.debug("Starting sentence splitting")

        # Normalize Unicode characters
        text = unicodedata.normalize('NFC', text)
        self.logger.debug("Normalized text using NFC")

        # Remove page numbers and chapter titles (common in PDFs)
        text = re.sub(r'Page \d+|Chapter \d+:.*', '', text)
        self.logger.debug("Removed page numbers and chapter titles")

        # Replace hyphenated line breaks with just the word
        text = re.sub(r'-\s+\n', '', text)
        text = re.sub(r'-\s+', '', text)
        self.logger.debug("Replaced hyphenated line breaks")

        # Replace multiple newlines and carriage returns with a space
        text = re.sub(r'[\r\n]+', ' ', text)
        self.logger.debug("Replaced multiple newlines with a space")

        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text).strip()
        self.logger.debug("Normalized whitespace")

        # Use NLTK's sent_tokenize to split into sentences
        sentences = sent_tokenize(text)
        self.logger.debug(f"Split text into {len(sentences)} sentences using NLTK")

        # Clean up sentences
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        self.logger.info(f"Split text into {len(sentences)} sentences after cleanup")
        return sentences

    def analyze_sentence(self, sentence: str) -> Tuple[str, str, str]:
        """Analyze a sentence and return its type, color (handled by CSS), and flag."""
        if not sentence:
            return ('statement', '', self.FLAGS['statement'])

        sentence = sentence.strip()
        self.logger.debug(f"Analyzing sentence: '{sentence}'")

        # Function to check for complete quotes
        def has_complete_quote(text):
            quote_pairs = [
                ('"', '"'),
                ("'", "'"),
                ('“', '”'),
                ('‘', '’'),
                ('«', '»')
            ]
            text = text.strip()
            for open_quote, close_quote in quote_pairs:
                if text.startswith(open_quote) and text.endswith(close_quote):
                    # Ensure that the quotes are balanced
                    if text.count(open_quote) == text.count(close_quote):
                        self.logger.debug(f"Sentence starts and ends with matching quotes: {open_quote}{close_quote}")
                        return True
            return False

        # Check if the entire sentence is enclosed in matching quotes
        if has_complete_quote(sentence):
            sent_type = 'quote'
            self.logger.debug("Sentence classified as 'quote'")
        # Check for emphasis
        elif re.search(r'\*[^*]+\*', sentence):
            sent_type = 'emphasis'
            self.logger.debug("Sentence classified as 'emphasis'")
        # Check regular sentence types
        elif sentence.endswith('!'):
            sent_type = 'exclamation'
            self.logger.debug("Sentence classified as 'exclamation'")
        elif sentence.endswith('?'):
            sent_type = 'question'
            self.logger.debug("Sentence classified as 'question'")
        elif sentence.endswith('…') or sentence.endswith('...'):
            sent_type = 'ellipsis'
            self.logger.debug("Sentence classified as 'ellipsis'")
        else:
            sent_type = 'statement'
            self.logger.debug("Sentence classified as 'statement'")

        color = ''  # Color is now handled by CSS classes
        self.logger.debug(f"Sentence type: {sent_type}, Flag: {self.FLAGS[sent_type]}")
        return (sent_type, color, self.FLAGS[sent_type])

    def clean_sentence(self, sentence: str) -> str:
        """Remove special characters from the sentence that might confuse TTS models."""
        # Define the pattern to match unwanted special characters
        pattern = r'[^\w\s.,!?\'"“”‘’«»\-—()]'
        cleaned_sentence = re.sub(pattern, '', sentence)
        self.logger.debug(f"Cleaned sentence: '{cleaned_sentence}'")
        return cleaned_sentence

    def process_text_interactive(self, text: str) -> str:
        """Process the text and return HTML-formatted output with interactive sentences."""
        self.logger.info("Starting interactive text processing")

        if not text:
            self.logger.warning("Empty text received")
            return ''

        try:
            # Normalize Unicode characters
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in interactive processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                sent_type, color, flag = self.analyze_sentence(sentence)
                # Updated HTML to include class for sentence type and data attribute for indexing
                formatted_sentence = f'''
                <div class="sentence-row {sent_type}">
                    <div class="sentence-number">{index}.</div>
                    <div class="sentence-content">
                        {sentence}
                    </div>
                    <div class="sentence-type">{sent_type.capitalize()}</div>
                </div>
                '''
                formatted_output.append(formatted_sentence)
                self.logger.info(f"Processed sentence {index}/{len(sentences)} - Type: {sent_type}")
                self.logger.debug(f"Formatted HTML for sentence {index}: {formatted_sentence}")

            result = ''.join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'

    def prepare_text_for_tts(self, sentences: List[str]) -> str:
        """Prepare the text for TTS by cleaning special characters from each sentence."""
        cleaned_sentences = [self.clean_sentence(sentence) for sentence in sentences]
        tts_text = ' '.join(cleaned_sentences)
        self.logger.debug(f"Prepared text for TTS: '{tts_text}'")
        return tts_text

    def process_text(self, text: str) -> str:
        """Legacy method for non-interactive processing. Kept for compatibility."""
        self.logger.info("Starting text processing (legacy method)")

        if not text:
            self.logger.warning("Empty text received")
            return ""

        try:
            # Normalize Unicode characters
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in legacy processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                sent_type, _, flag = self.analyze_sentence(sentence)
                # Color is now handled by CSS classes
                formatted_sentence = (
                    f'<span class="{sent_type}" '
                    f'data-flag="{flag}" '
                    f'title="Sentence type: {sent_type}">'
                    f'{sentence}</span>'
                )
                formatted_output.append(formatted_sentence)
                self.logger.info(f"Processed sentence {index}/{len(sentences)} - Type: {sent_type}")
                self.logger.debug(f"Formatted HTML for sentence {index}: {formatted_sentence}")

            result = " ".join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'
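
# Illustrative usage sketch: running the module directly exercises the analyzer
# on a short, made-up passage. The sample text below is a hypothetical
# placeholder, not data that ships with the project.
if __name__ == "__main__":
    analyzer = SentenceAnalyzer()
    sample_text = (
        'She said, "We made it!" Did anyone really expect that? '
        "Probably not... The rest is *history*."
    )
    # Split, classify, and print each sentence with its flag
    sentences = analyzer.split_into_sentences(sample_text)
    for sentence in sentences:
        sent_type, _, flag = analyzer.analyze_sentence(sentence)
        print(f"[{flag}] {sentence}")
    # Produce TTS-ready text with unwanted special characters stripped
    print(analyzer.prepare_text_for_tts(sentences))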