Spaces:
Running
Running
File size: 5,063 Bytes
58d9769 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import logging
from typing import List, Tuple, Generator, Optional
import numpy as np
from utils.tts_base import TTSEngineBase, DummyTTSEngine
from utils.tts_engines import create_engine
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class CascadingTTSEngine(TTSEngineBase):
    """TTS engine that cascades through several backends until one succeeds.

    Engines are created on demand, one at a time, in the order given by
    ``engine_types``. The first engine that produces output is used; when
    every engine fails, ``DummyTTSEngine`` is used as the last resort.
    """

    def __init__(self, engine_types: List[str], lang_code: str = 'z'):
        """Initialize the cascading TTS engine.

        Args:
            engine_types (List[str]): Engine type names to try, in priority
                order. Each name is passed to ``create_engine``.
            lang_code (str): Language code forwarded to every engine.
        """
        super().__init__(lang_code)
        self.engine_types = engine_types
        self.lang_code = lang_code
        logger.info(f"Initialized cascading TTS engine with engines: {engine_types}")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech by trying each configured engine in order.

        An engine is considered to have failed when it raises or when it
        returns ``None``; in both cases the next engine is tried.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Returns:
            str: Path to the generated audio file, as returned by the first
                engine that succeeds, or by ``DummyTTSEngine`` when all
                configured engines fail.
        """
        logger.info(f"Generating speech with cascading engine for text length: {len(text)}")

        for engine_type in self.engine_types:
            try:
                logger.info(f"Trying TTS engine: {engine_type}")
                engine = create_engine(engine_type, self.lang_code)
                result = engine.generate_speech(text, voice, speed)
            except Exception as e:
                # Best-effort cascade: record the failure (with traceback)
                # and move on to the next engine instead of propagating.
                logger.exception(f"Error with TTS engine {engine_type}: {str(e)}")
                logger.error(f"Error type: {type(e).__name__}")
                logger.warning("Trying next TTS engine")
                continue

            if result is not None:
                logger.info(f"Successfully generated speech with {engine_type}")
                return result

            logger.warning(f"TTS engine {engine_type} failed to generate speech, trying next engine")

        # Every configured engine failed; fall back to the dummy engine.
        logger.warning("All TTS engines failed, falling back to dummy engine")
        return DummyTTSEngine(self.lang_code).generate_speech(text, voice, speed)

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Stream speech by trying each configured engine in order.

        An engine is selected once it yields a first chunk that is not
        ``None``. Engines that raise during start-up, or whose first chunk is
        missing/``None``, are skipped. Once an engine has been selected and
        audio has been emitted, the cascade is over: if that engine fails
        mid-stream the error is logged and the stream ends, because restarting
        the text on another engine would replay audio the consumer already
        received.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment.
        """
        logger.info(f"Generating speech stream with cascading engine for text length: {len(text)}")

        # Unique marker for exhaustion, so that a ``None`` chunk produced
        # mid-stream is still forwarded to the consumer as before.
        end_of_stream = object()

        for engine_type in self.engine_types:
            # Only engine start-up is guarded by the cascade. The yields are
            # deliberately kept outside this ``try`` so that exceptions
            # thrown into this generator by the consumer are not mistaken
            # for engine failures.
            try:
                logger.info(f"Trying TTS engine for streaming: {engine_type}")
                engine = create_engine(engine_type, self.lang_code)
                generator = engine.generate_speech_stream(text, voice, speed)
                # Pull the first chunk to verify the engine actually works.
                first_chunk = next(generator, None)
            except Exception as e:
                logger.exception(f"Error with TTS engine {engine_type} streaming: {str(e)}")
                logger.error(f"Error type: {type(e).__name__}")
                logger.warning("Trying next TTS engine for streaming")
                continue

            if first_chunk is None:
                logger.warning(f"TTS engine {engine_type} failed to generate speech stream, trying next engine")
                continue

            logger.info(f"Successfully started speech stream with {engine_type}")
            yield first_chunk

            # Forward the remaining chunks. Only the pull from the engine is
            # guarded; the yield itself stays outside the ``try``.
            while True:
                try:
                    chunk = next(generator, end_of_stream)
                except Exception as e:
                    # Audio has already been emitted, so falling back to
                    # another engine would duplicate it. End the stream.
                    logger.exception(f"TTS engine {engine_type} failed mid-stream, ending stream: {str(e)}")
                    return
                if chunk is end_of_stream:
                    # Successfully streamed all chunks.
                    return
                yield chunk

        # Every configured engine failed to start; fall back to the dummy engine.
        logger.warning("All TTS engines failed for streaming, falling back to dummy engine")
        yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)