menamiai / src /utils /elevenlabs_voiceover.py
dfdfdsfgs's picture
Deploy: Fix all errors & make HF Spaces ready - Fixed Gradio interface, frame constants, ElevenLabs API, Arrow3D parameters, added comprehensive error handling and demo mode. All deployment tests passing!
92ea014
"""
Copyright (c) 2025 Xposed73
All rights reserved.
This file is part of the Manim Voiceover project.
"""
import hashlib
import json
import os
import time
import wave
from pathlib import Path
from typing import Optional

import requests
from manim_voiceover.helper import remove_bookmarks
from manim_voiceover.services.base import SpeechService

from src.config.config import Config
class ElevenLabsService(SpeechService):
    """Speech service that synthesizes voiceovers through the ElevenLabs TTS REST API.

    Audio is requested from the ``/v1/text-to-speech/{voice_id}`` endpoint using
    the model and voice settings configured on the instance. When the API cannot
    be reached (or keeps failing), a silent WAV file is written instead so that a
    render can continue without audio ("demo mode").
    """

    _API_URL_TEMPLATE = "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    # Retry policy for the HTTP call: up to 3 attempts, sleeping 1s, 2s between them.
    _MAX_RETRIES = 3
    _RETRY_DELAY_SECONDS = 1
    _REQUEST_TIMEOUT_SECONDS = 30
    # Rough speech-rate estimate used to size the silent fallback clip.
    _FALLBACK_SECONDS_PER_CHAR = 0.1
    _FALLBACK_SAMPLE_RATE = 22050

    def __init__(self,
                 api_key: Optional[str] = None,
                 voice_id: Optional[str] = None,
                 model_id: str = "eleven_multilingual_v2",
                 voice_settings: Optional[dict] = None,
                 **kwargs):
        """
        Initialize ElevenLabs service.

        Args:
            api_key: ElevenLabs API key (defaults to ``Config.ELEVENLABS_API_KEY``)
            voice_id: Voice ID to use (defaults to ``Config.ELEVENLABS_DEFAULT_VOICE_ID``)
            model_id: Model ID to use for generation
            voice_settings: Voice settings dict with stability, similarity_boost,
                style, use_speaker_boost. A falsy value (``None`` or ``{}``)
                selects the built-in defaults.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.

        Raises:
            ValueError: If no API key or no voice ID could be resolved.
        """
        self.api_key = api_key or Config.ELEVENLABS_API_KEY
        self.voice_id = voice_id or Config.ELEVENLABS_DEFAULT_VOICE_ID
        self.model_id = model_id

        # Default voice settings
        default_settings = {
            "stability": 0.5,
            "similarity_boost": 0.75,
            "style": 0.0,
            "use_speaker_boost": True,
        }
        self.voice_settings = voice_settings or default_settings

        # Validate before initializing the base class so a misconfigured
        # service fails fast without any base-class side effects.
        if not self.api_key:
            raise ValueError("ElevenLabs API key not found. Please set ELEVENLABS_API_KEY environment variable.")
        if not self.voice_id:
            raise ValueError("ElevenLabs voice ID not found. Please set ELEVENLABS_DEFAULT_VOICE_ID environment variable.")

        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """
        Generate a deterministic hash for an input data dictionary.

        Parameters:
            input_data (dict): JSON-serializable description of a synthesis
                request (e.g., text, voice, etc.).

        Returns:
            str: Hex-encoded SHA-256 digest of the key-sorted JSON form, so two
            dicts with the same content hash identically regardless of key order.
        """
        canonical_json = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(canonical_json.encode('utf-8')).hexdigest()

    @staticmethod
    def _describe_error(exc: Exception) -> str:
        """Return the log prefix for a ``requests`` failure."""
        # ConnectionError is tested first so that ConnectTimeout (a subclass of
        # both ConnectionError and Timeout) is reported as a connection error.
        if isinstance(exc, requests.exceptions.ConnectionError):
            return "Connection error"
        if isinstance(exc, requests.exceptions.Timeout):
            return "Timeout error"
        return "Request error"

    @staticmethod
    def _is_retryable(exc: Exception) -> bool:
        """Return True when retrying the request could plausibly succeed."""
        # NOTE: compare against None explicitly; a requests.Response is falsy
        # whenever its status is an error, which is exactly the case here.
        response = getattr(exc, "response", None)
        if response is None:
            # No HTTP response at all (network failure, timeout): transient.
            return True
        status = response.status_code
        # Other 4xx codes (bad key, unknown voice, invalid payload) will fail
        # identically on every attempt, so retrying only adds delay.
        return status in (408, 429) or status >= 500

    def text_to_speech(self, text: str, output_file: str) -> str:
        """
        Generate audio using ElevenLabs API with retries and a silent fallback.

        The request uses the instance's ``model_id`` and ``voice_settings`` so
        the produced audio matches the parameters recorded in the cache key.

        Args:
            text (str): Text to synthesize
            output_file (str): Path to save the audio file

        Returns:
            str: Path of the file that was actually written. This equals
            ``output_file`` on success; when the silent fallback is used it may
            be the ``.wav`` sibling of ``output_file`` instead.
        """
        url = self._API_URL_TEMPLATE.format(voice_id=self.voice_id)
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key,
        }
        payload = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": self.voice_settings,
        }

        for attempt in range(1, self._MAX_RETRIES + 1):
            try:
                response = requests.post(
                    url,
                    json=payload,
                    headers=headers,
                    timeout=self._REQUEST_TIMEOUT_SECONDS,
                )
                response.raise_for_status()
            except requests.exceptions.RequestException as exc:
                print(f"{self._describe_error(exc)} (attempt {attempt}/{self._MAX_RETRIES}): {exc}")
                if attempt == self._MAX_RETRIES or not self._is_retryable(exc):
                    break
                # Linear backoff: 1s after the first failure, 2s after the second.
                time.sleep(self._RETRY_DELAY_SECONDS * attempt)
                continue

            # Save the audio file
            with open(output_file, 'wb') as audio_file:
                audio_file.write(response.content)
            return output_file

        # Every attempt failed: fall back to silence sized by a rough
        # characters-to-seconds estimate so the render can still proceed.
        return self._create_silent_audio(
            output_file,
            duration=len(text) * self._FALLBACK_SECONDS_PER_CHAR,
        )

    def _create_silent_audio(self, output_file: str, duration: float) -> str:
        """
        Create a silent audio file as fallback when API fails.

        The silence is written as 16-bit mono PCM WAV. If ``output_file`` ends
        in ``.mp3`` the data goes to the sibling ``.wav`` path so the extension
        matches the content; otherwise it is written to ``output_file`` itself.

        Args:
            output_file (str): Path originally requested for the audio.
            duration (float): Length of the silent clip in seconds.

        Returns:
            str: Path of the file that was written (the WAV file, or
            ``output_file`` as an empty placeholder if writing the WAV failed).
        """
        # Swap only the trailing extension; a plain str.replace would also
        # rewrite any ".mp3" that happens to appear in a directory name.
        if output_file.lower().endswith('.mp3'):
            wav_path = output_file[:-len('.mp3')] + '.wav'
        else:
            wav_path = output_file

        frame_count = max(0, int(self._FALLBACK_SAMPLE_RATE * duration))
        try:
            with wave.open(wav_path, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit samples
                wav_file.setframerate(self._FALLBACK_SAMPLE_RATE)
                wav_file.writeframes(bytes(2 * frame_count))  # all-zero PCM == silence
        except (OSError, wave.Error) as exc:
            print(f"Failed to create silent audio: {exc}")
            # Create an empty file as last resort
            with open(output_file, 'w') as placeholder:
                placeholder.write("")
            return output_file

        print(f"Created silent audio fallback: {wav_path}")
        return wav_path

    def generate_from_text(self, text: str, cache_dir: Optional[str] = None, path: Optional[str] = None) -> dict:
        """
        Generate audio from text with caching support.

        Args:
            text: Text to convert to speech (may contain ``<bookmark .../>`` tags)
            cache_dir: Directory for caching audio files (defaults to ``self.cache_dir``)
            path: Optional specific path for the audio file, relative to ``cache_dir``

        Returns:
            Dictionary with audio generation details: the original ``input_text``,
            the ``input_data`` used as cache key, and ``original_audio`` (the
            written file's path relative to ``cache_dir``).
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Bookmark tags are timing markers for manim-voiceover, not speech:
        # strip them so they are neither synthesized nor part of the cache key.
        spoken_text = remove_bookmarks(text)

        input_data = {
            "input_text": spoken_text,
            "service": "elevenlabs",
            "voice_id": self.voice_id,
            "model_id": self.model_id,
            "voice_settings": self.voice_settings,
        }

        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        audio_path = path if path is not None else self.get_data_hash(input_data) + ".mp3"

        # Generate audio file using ElevenLabs API
        requested_path = str(Path(cache_dir) / audio_path)
        written_path = self.text_to_speech(spoken_text, requested_path)
        if written_path != requested_path:
            # The silent fallback wrote a different file (e.g. ".wav"); record
            # the file that really exists rather than the one that was requested.
            audio_path = os.path.relpath(written_path, cache_dir)

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }