"""
Text Preprocessing Module
========================
Handles text normalization, translation, chunking, and optimization for TTS processing.
Implements caching and batch processing for improved performance.
"""
import re
import string
import logging
import asyncio
from typing import List, Tuple, Dict, Optional
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import time
import inflect
import requests
from requests.exceptions import Timeout, RequestException
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TextProcessor:
    """High-performance text processor with caching and optimization."""

    def __init__(self, max_chunk_length: int = 200, overlap_words: int = 5,
                 translation_timeout: int = 10):
        """
        Initialize the text processor.

        Args:
            max_chunk_length: Maximum characters per chunk
            overlap_words: Number of words to overlap between chunks
            translation_timeout: Timeout for translation requests in seconds
        """
        self.max_chunk_length = max_chunk_length
        self.overlap_words = overlap_words
        self.translation_timeout = translation_timeout
        self.inflect_engine = inflect.engine()
        self.translation_cache: Dict[str, str] = {}
        self.number_cache: Dict[str, str] = {}
        # Thread pool for parallel processing
        self.executor = ThreadPoolExecutor(max_workers=4)

    @lru_cache(maxsize=1000)
    def _cached_translate(self, text: str) -> str:
        """
        Cached translation function to avoid repeated API calls.

        Args:
            text: Text to translate

        Returns:
            Translated text in Armenian
        """
        if not text.strip():
            return text
        try:
            response = requests.get(
                "https://translate.googleapis.com/translate_a/single",
                params={
                    'client': 'gtx',
                    'sl': 'auto',
                    'tl': 'hy',
                    'dt': 't',
                    'q': text,
                },
                timeout=self.translation_timeout,
            )
            response.raise_for_status()
            translation = response.json()[0][0][0]
            # Mirror the result in the instance cache so get_cache_stats() reports it
            self.translation_cache[text] = translation
            logger.debug(f"Translated '{text}' to '{translation}'")
            return translation
        except (RequestException, Timeout, IndexError) as e:
            logger.warning(f"Translation failed for '{text}': {e}")
            return text  # Return original text if translation fails

    def _convert_number_to_armenian_words(self, number: int) -> str:
        """
        Convert number to Armenian words with caching.

        Args:
            number: Integer to convert

        Returns:
            Number as Armenian words
        """
        cache_key = str(number)
        if cache_key in self.number_cache:
            return self.number_cache[cache_key]
        try:
            # Convert to English words first
            english_words = self.inflect_engine.number_to_words(number)
            # Translate to Armenian
            armenian_words = self._cached_translate(english_words)
            # Cache the result
            self.number_cache[cache_key] = armenian_words
            return armenian_words
        except Exception as e:
            logger.warning(f"Number conversion failed for {number}: {e}")
            return str(number)  # Fallback to original number

    def _normalize_text(self, text: str) -> str:
        """
        Normalize text by handling numbers, punctuation, and special characters.

        Args:
            text: Input text to normalize

        Returns:
            Normalized text
        """
        if not text:
            return ""
        # Convert to string and strip
        text = str(text).strip()
        # Process each word
        words = []
        for word in text.split():
            # Extract numbers from word
            if re.search(r'\d', word):
                # Extract just the digits
                digits = ''.join(filter(str.isdigit, word))
                if digits:
                    try:
                        number = int(digits)
                        armenian_word = self._convert_number_to_armenian_words(number)
                        words.append(armenian_word)
                    except ValueError:
                        words.append(word)  # Keep original if conversion fails
                else:
                    words.append(word)
            else:
                words.append(word)
        return ' '.join(words)
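
    # Illustrative example (assumed output): for the input "Order 25 items",
    # the digits are read as the number 25, expanded to "twenty-five" by
    # inflect, and translated to the Armenian words when the translation
    # service is reachable; offline, the result stays "Order twenty-five items".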

    def _split_into_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences using multiple delimiters.

        Args:
            text: Text to split

        Returns:
            List of sentences
        """
        # Armenian sentence delimiters
        sentence_endings = r'[.!?։՞՜]+'
        sentences = re.split(sentence_endings, text)
        # Clean and filter empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences

    def chunk_text(self, text: str) -> List[str]:
        """
        Intelligently chunk text for optimal TTS processing.

        This method implements sophisticated chunking that:
        1. Respects sentence boundaries
        2. Maintains semantic coherence
        3. Includes overlap for smooth transitions
        4. Optimizes chunk sizes for the TTS model

        Args:
            text: Input text to chunk

        Returns:
            List of text chunks optimized for TTS
        """
        if not text or len(text) <= self.max_chunk_length:
            return [text] if text else []
        sentences = self._split_into_sentences(text)
        if not sentences:
            return [text]
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            # If a single sentence is too long, split it by clauses
            if len(sentence) > self.max_chunk_length:
                # Split by commas, semicolons, and the Armenian conjunctions for "and", "or", "but"
                clauses = re.split(r'[,;]|\sև\s|\sկամ\s|\sբայց\s', sentence)
                for clause in clauses:
                    clause = clause.strip()
                    if not clause:
                        continue
                    if len(current_chunk + " " + clause) <= self.max_chunk_length:
                        current_chunk = (current_chunk + " " + clause).strip()
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        current_chunk = clause
            else:
                # Try to add the whole sentence
                test_chunk = (current_chunk + " " + sentence).strip()
                if len(test_chunk) <= self.max_chunk_length:
                    current_chunk = test_chunk
                else:
                    # Current chunk is full, start a new one
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence
        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk)
        # Implement overlap for smooth transitions
        if len(chunks) > 1:
            chunks = self._add_overlap(chunks)
        logger.info(f"Split text into {len(chunks)} chunks")
        return chunks

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """
        Add overlapping words between chunks for smoother transitions.

        Args:
            chunks: List of text chunks

        Returns:
            Chunks with added overlap
        """
        if len(chunks) <= 1:
            return chunks
        overlapped_chunks = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_words = chunks[i - 1].split()
            current_chunk = chunks[i]
            # Get last few words from previous chunk
            overlap_words = prev_words[-self.overlap_words:] if len(prev_words) >= self.overlap_words else prev_words
            overlap_text = " ".join(overlap_words)
            # Prepend overlap to current chunk
            overlapped_chunk = f"{overlap_text} {current_chunk}".strip()
            overlapped_chunks.append(overlapped_chunk)
        return overlapped_chunks
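
    # Illustrative note (assumed values): with overlap_words=2, the chunks
    # ["today is a sunny day", "we will go outside"] become
    # ["today is a sunny day", "sunny day we will go outside"], so every chunk
    # after the first starts with the tail of its predecessor.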

    def process_text(self, text: str) -> str:
        """
        Main text processing pipeline.

        Args:
            text: Raw input text

        Returns:
            Processed and normalized text ready for TTS
        """
        start_time = time.time()
        if not text or not text.strip():
            return ""
        try:
            # Normalize the text
            processed_text = self._normalize_text(text)
            processing_time = time.time() - start_time
            logger.info(f"Text processed in {processing_time:.3f}s")
            return processed_text
        except Exception as e:
            logger.error(f"Text processing failed: {e}")
            return str(text)  # Return original text as fallback

    def process_chunks(self, text: str) -> List[str]:
        """
        Process text and return optimized chunks for TTS.

        Args:
            text: Input text

        Returns:
            List of processed text chunks
        """
        # First normalize the text
        processed_text = self.process_text(text)
        # Then chunk it
        chunks = self.chunk_text(processed_text)
        return chunks

    def clear_cache(self):
        """Clear all caches to free memory."""
        self._cached_translate.cache_clear()
        self.translation_cache.clear()
        self.number_cache.clear()
        logger.info("Caches cleared")

    def get_cache_stats(self) -> Dict[str, int]:
        """Get statistics about cache usage."""
        return {
            "translation_cache_size": len(self.translation_cache),
            "number_cache_size": len(self.number_cache),
            "lru_cache_hits": self._cached_translate.cache_info().hits,
            "lru_cache_misses": self._cached_translate.cache_info().misses,
        }
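

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration; not part of the original module).
    # It exercises the public TextProcessor API end to end. The sample string and
    # parameter values below are assumptions chosen for the demo, and number
    # translation needs network access; if the call fails, numbers are spelled
    # out in English words instead of Armenian.
    processor = TextProcessor(max_chunk_length=120, overlap_words=3)

    sample = "The quick brown fox jumps over 3 lazy dogs. " * 5
    normalized = processor.process_text(sample)
    print("Normalized:", normalized)

    # Chunk the same text; chunks after the first carry a short word overlap
    for idx, chunk in enumerate(processor.process_chunks(sample), start=1):
        print(f"Chunk {idx} ({len(chunk)} chars): {chunk}")

    print("Cache stats:", processor.get_cache_stats())
    processor.clear_cache()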