Spaces:
Running
Running
File size: 5,494 Bytes
3ed3b5a 58d9769 3ed3b5a 58d9769 3ed3b5a 60bd17d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import logging
import os
import time
import uuid
from abc import ABC, abstractmethod
from typing import Tuple, Generator, Optional

import numpy as np
import soundfile as sf
# Module-level logger, named after this module; used by the engine classes below
logger = logging.getLogger(__name__)
class TTSEngineBase(ABC):
    """Base class for all TTS engines

    This abstract class defines the interface that all TTS engines must implement
    (``generate_speech``). It also provides a default streaming implementation and
    common utility methods for output-file handling.

    Attributes:
        OUTPUT_DIR (str): Directory (relative to the working directory) where
            generated audio files are written. Subclasses may override it.
        lang_code (str): Language code given to the constructor.
    """

    OUTPUT_DIR = "temp/outputs"

    def __init__(self, lang_code: str = 'z'):
        """Initialize the TTS engine

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
                Note: Not all engines support all language codes
        """
        self.lang_code = lang_code
        logger.info(
            "Initializing %s with language code: %s",
            self.__class__.__name__,
            lang_code,
        )

    @abstractmethod
    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech from text

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
                Note: Not all engines support all voices
            speed (float): Speech speed multiplier (0.5 to 2.0)
                Note: Not all engines support speed adjustment

        Returns:
            Optional[str]: Path to the generated audio file, or None if generation fails
        """

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech from text and yield each segment

        The default implementation synthesizes the whole utterance with
        ``generate_speech``, reads the resulting file back and yields it as a
        single chunk. If ``generate_speech`` reports failure by returning None,
        the failure is logged and nothing is yielded.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        output_path = self.generate_speech(text, voice, speed)
        if output_path is None:
            # generate_speech signals failure with None; there is no file to read,
            # so end the stream instead of handing None to the audio reader.
            logger.error(
                "%s.generate_speech returned no output file; nothing to stream",
                self.__class__.__name__,
            )
            return

        audio_data, sample_rate = sf.read(output_path)
        yield sample_rate, audio_data

    def _create_output_dir(self) -> str:
        """Create output directory for audio files

        The directory is created if missing; an existing directory is left as is.

        Returns:
            str: Path to the output directory
        """
        output_dir = self.OUTPUT_DIR
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def _generate_output_path(self, prefix: str = "output") -> str:
        """Generate a unique output path for audio files

        The file name has the form ``<prefix>_<unix seconds>_<random hex>.wav``.

        Args:
            prefix (str): Prefix for the output filename

        Returns:
            str: Path to the output file
        """
        output_dir = self._create_output_dir()
        timestamp = int(time.time())
        # A one-second timestamp repeats for calls made within the same second,
        # which would make later outputs overwrite earlier ones. The random
        # suffix keeps every generated path distinct.
        unique_suffix = uuid.uuid4().hex[:8]
        return f"{output_dir}/{prefix}_{timestamp}_{unique_suffix}.wav"
class DummyTTSEngine(TTSEngineBase):
    """Fallback TTS engine that produces sine-wave tones instead of speech

    Meant to be used when no other TTS engine is available.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the base engine and warn that the fallback is in use

        Args:
            lang_code (str): Language code forwarded to the base class
        """
        super().__init__(lang_code)
        logger.warning("Using dummy TTS implementation as no other engines are available")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Write a three-second 440 Hz sine tone to a new audio file

        Args:
            text (str): Input text (only its length is logged)
            voice (str): Voice ID (not used)
            speed (float): Speed multiplier (not used)

        Returns:
            str: Path to the generated dummy audio file
        """
        logger.info(f"Generating dummy speech for text length: {len(text)}")

        # Destination file for the tone
        wav_path = self._generate_output_path("dummy")

        # Build the waveform: 440 Hz at 30% amplitude, 24 kHz sample rate
        rate_hz = 24000
        seconds = 3.0
        sample_times = np.linspace(0, seconds, num=int(rate_hz * seconds), endpoint=False)
        angular_freq = 2 * np.pi * 440
        waveform = 0.3 * np.sin(angular_freq * sample_times)

        # Persist the tone to disk
        logger.info(f"Saving dummy audio to {wav_path}")
        sf.write(wav_path, waveform, rate_hz)
        logger.info(f"Dummy audio generation complete: {wav_path}")
        return wav_path

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Yield three one-second sine tones of rising pitch

        Args:
            text (str): Input text (only its length is logged)
            voice (str): Voice ID (not used)
            speed (float): Speed multiplier (not used)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each dummy segment
        """
        logger.info(f"Generating dummy speech stream for text length: {len(text)}")

        rate_hz = 24000
        chunk_seconds = 1.0  # length of every chunk
        samples_per_chunk = int(rate_hz * chunk_seconds)

        # Chunks are pitched at 440, 660 and 880 Hz respectively
        for chunk_index in range(3):
            sample_times = np.linspace(0, chunk_seconds, num=samples_per_chunk, endpoint=False)
            pitch_hz = 440 + 220 * chunk_index
            yield rate_hz, 0.3 * np.sin(2 * np.pi * pitch_hz * sample_times)