import os
import logging
import time

import soundfile as sf

logger = logging.getLogger(__name__)

# Wrap the problematic import in a try-except block
try:
    from kokoro import KPipeline
    KOKORO_AVAILABLE = True
except AttributeError as e:
    # Specifically catch the EspeakWrapper.set_data_path error
    if "EspeakWrapper" in str(e) and "set_data_path" in str(e):
        logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue")
        KOKORO_AVAILABLE = False
    else:
        # Re-raise if it's a different error
        raise
class TTSEngine:
    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")
        if not KOKORO_AVAILABLE:
            logger.warning("Using dummy TTS implementation as Kokoro is not available")
            self.pipeline = None
        else:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("TTS engine initialized with Kokoro")
    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Returns:
            str: Path to the generated audio file
        """
        logger.info(f"Generating speech for text length: {len(text)}")
        try:
            # Create output directory if it doesn't exist
            os.makedirs("temp/outputs", exist_ok=True)

            # Generate unique output path
            output_path = f"temp/outputs/output_{int(time.time())}.wav"

            if not KOKORO_AVAILABLE:
                # Generate a simple sine wave as dummy audio
                import numpy as np
                sample_rate = 24000
                duration = 3.0  # seconds
                t = np.linspace(0, duration, int(sample_rate * duration), False)
                tone = np.sin(2 * np.pi * 440 * t) * 0.3

                logger.info(f"Saving dummy audio to {output_path}")
                sf.write(output_path, tone, sample_rate)
                logger.info(f"Dummy audio generation complete: {output_path}")
                return output_path

            # Take only the first generated segment, since the original code
            # handled single segments
            generator = self.pipeline(text, voice=voice, speed=speed)
            for _, _, audio in generator:
                logger.info(f"Saving audio to {output_path}")
                sf.write(output_path, audio, 24000)
                break

            logger.info(f"Audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
            raise
    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        try:
            if not KOKORO_AVAILABLE:
                # Generate dummy audio chunks
                import numpy as np
                sample_rate = 24000
                duration = 1.0  # seconds per chunk

                # Create 3 chunks of dummy audio
                for i in range(3):
                    t = np.linspace(0, duration, int(sample_rate * duration), False)
                    freq = 440 + (i * 220)  # Different frequency for each chunk
                    tone = np.sin(2 * np.pi * freq * t) * 0.3
                    yield sample_rate, tone
                return

            generator = self.pipeline(text, voice=voice, speed=speed)
            for _, _, audio in generator:
                yield 24000, audio
        except Exception as e:
            logger.error(f"TTS streaming failed: {str(e)}", exc_info=True)
            raise
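
# Example (illustrative sketch, not part of the original module): a caller can
# stitch the streamed segments back together with numpy and write one file,
# since every segment is yielded at 24 kHz:
#
#     import numpy as np
#     engine = TTSEngine(lang_code='a')
#     segments = [audio for _, audio in engine.generate_speech_stream("Hello there")]
#     sf.write("combined.wav", np.concatenate(segments), 24000)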
# Initialize TTS engine, using a cache decorator when running under Streamlit
def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    try:
        import streamlit as st

        # Cache the engine as a resource so Streamlit reruns reuse one instance;
        # lang_code is passed as an argument so it is part of the cache key
        @st.cache_resource
        def _get_engine(code: str):
            return TTSEngine(code)

        return _get_engine(lang_code)
    except ImportError:
        return TTSEngine(lang_code)
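
# Usage sketch inside a Streamlit app (assumed app layout, not from the
# original file); the cached engine survives script reruns:
#
#     engine = get_tts_engine(lang_code='a')
#     path = engine.generate_speech("Welcome to the demo", voice='af_heart')
#     st.audio(path)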
def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Public interface for TTS generation

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    engine = get_tts_engine()
    return engine.generate_speech(text, voice, speed)
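
# Minimal smoke test (illustrative; the sample sentence and log level are
# assumptions, not part of the original module). Falls back to the dummy
# sine-wave audio when Kokoro is unavailable.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    wav_path = generate_speech("This is a short synthesis test.", voice='af_heart', speed=1.0)
    print(f"Audio written to {wav_path}")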