"""
Copyright (c) 2025 Xposed73
All rights reserved.
This file is part of the Manim Voiceover project.
"""
import hashlib
import json
import os
import time
from pathlib import Path
from typing import Optional

import requests
from manim_voiceover.helper import remove_bookmarks
from manim_voiceover.services.base import SpeechService

from src.config.config import Config
class ElevenLabsService(SpeechService):
    """Speech service that synthesizes voiceover audio via the ElevenLabs REST API.

    The service posts text to the ``/v1/text-to-speech/{voice_id}`` endpoint,
    stores the returned audio in the cache directory and, when the API cannot
    be reached after several attempts, substitutes a silent placeholder so
    that the calling pipeline can keep going.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = None,
        model_id: str = "eleven_multilingual_v2",
        voice_settings: Optional[dict] = None,
        **kwargs,
    ):
        """
        Initialize ElevenLabs service.

        Args:
            api_key: ElevenLabs API key (falls back to ``Config.ELEVENLABS_API_KEY``).
            voice_id: Voice ID to use (falls back to
                ``Config.ELEVENLABS_DEFAULT_VOICE_ID``).
            model_id: Model ID sent with every synthesis request.
            voice_settings: Voice settings dict with stability, similarity_boost,
                style, use_speaker_boost. Built-in defaults are used when
                omitted or empty.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.

        Raises:
            ValueError: If no API key or no voice ID could be resolved.
        """
        self.api_key = api_key or Config.ELEVENLABS_API_KEY
        self.voice_id = voice_id or Config.ELEVENLABS_DEFAULT_VOICE_ID
        self.model_id = model_id

        # Default voice settings
        default_settings = {
            "stability": 0.5,
            "similarity_boost": 0.75,
            "style": 0.0,
            "use_speaker_boost": True,
        }
        self.voice_settings = voice_settings or default_settings

        # Validate before the base class initializer runs.
        if not self.api_key:
            raise ValueError("ElevenLabs API key not found. Please set ELEVENLABS_API_KEY environment variable.")
        if not self.voice_id:
            raise ValueError("ElevenLabs voice ID not found. Please set ELEVENLABS_DEFAULT_VOICE_ID environment variable.")

        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """
        Generate a deterministic hash for an input data dictionary.

        Parameters:
            input_data (dict): JSON-serializable description of a synthesis
                request (e.g., text, voice, etc.).

        Returns:
            str: Hex-encoded SHA-256 digest of the key-sorted JSON dump.
        """
        # Sorting the keys makes the digest independent of dict insertion order.
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode("utf-8")).hexdigest()

    def text_to_speech(self, text: str, output_file: str) -> str:
        """
        Generate audio using the ElevenLabs API, retrying on request failures.

        The request uses the ``model_id`` and ``voice_settings`` configured on
        this instance, i.e. the same values that ``generate_from_text`` puts
        into the cache key. Up to three attempts are made, sleeping 1s then 2s
        between them. If every attempt fails, a silent placeholder is written
        instead of raising.

        Args:
            text (str): Text to synthesize
            output_file (str): Path to save the audio file

        Returns:
            str: Path of the file that was actually written. This equals
            ``output_file`` on success; for the silent fallback it is whatever
            ``_create_silent_audio`` returns (possibly a ``.wav`` sibling).

        Raises:
            OSError: If the output directory or file cannot be written.
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key,
        }
        payload = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": self.voice_settings,
        }

        # Fail fast on an unwritable destination, before calling the API.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

        max_retries = 3
        retry_delay = 1
        for attempt in range(1, max_retries + 1):
            try:
                response = requests.post(url, json=payload, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.exceptions.RequestException as exc:
                # ConnectionError is tested first so that ConnectTimeout (which
                # derives from both classes) keeps the "Connection error" label.
                if isinstance(exc, requests.exceptions.ConnectionError):
                    label = "Connection error"
                elif isinstance(exc, requests.exceptions.Timeout):
                    label = "Timeout error"
                else:
                    label = "Request error"
                print(f"{label} (attempt {attempt}/{max_retries}): {exc}")
                if attempt < max_retries:
                    time.sleep(retry_delay * attempt)  # linear back-off
                continue

            # Save the audio file
            with open(output_file, "wb") as f:
                f.write(response.content)
            return output_file

        # All attempts failed: fall back to silence, roughly 0.1s per character.
        return self._create_silent_audio(output_file, duration=len(text) * 0.1)

    def _create_silent_audio(self, output_file: str, duration: float) -> str:
        """
        Create a silent audio file as fallback when the API fails.

        The silence is WAV data, so a requested ``.mp3`` path is redirected to
        a sibling file with a ``.wav`` suffix; any other path is used as given.
        If the WAV cannot be produced (for example numpy/scipy are missing),
        an empty file is created at ``output_file`` as a last resort.

        Args:
            output_file (str): Path originally requested for the audio.
            duration (float): Length of the silence in seconds.

        Returns:
            str: Path of the file that was actually created.
        """
        target = Path(output_file)
        # Only the file suffix is swapped; ".mp3" elsewhere in the path is kept.
        if target.suffix.lower() == ".mp3":
            target = target.with_suffix(".wav")

        try:
            import numpy as np
            from scipy.io import wavfile

            sample_rate = 22050
            silence = np.zeros(int(sample_rate * duration), dtype=np.int16)
            wavfile.write(str(target), sample_rate, silence)
            print(f"Created silent audio fallback: {target}")
            return str(target)
        except Exception as e:
            # Deliberately broad: this is a best-effort path that must not
            # mask the original API failure with a secondary error.
            print(f"Failed to create silent audio: {e}")
            # Create an empty file as last resort
            with open(output_file, "wb"):
                pass
            return output_file

    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
        """
        Generate audio from text with caching support.

        Args:
            text: Text to convert to speech (may contain bookmark tags)
            cache_dir: Directory for caching audio files (defaults to
                ``self.cache_dir``)
            path: Optional specific path for the audio file, relative to
                ``cache_dir``

        Returns:
            Dictionary with ``input_text`` (the text as passed in),
            ``input_data`` (the cache key) and ``original_audio`` (the audio
            path relative to ``cache_dir``), or the cached entry when one
            matches ``input_data``.
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Bookmark tags are markup for the caller, not words to be spoken, so
        # they are stripped from both the cache key and the synthesis request.
        spoken_text = remove_bookmarks(text)

        input_data = {
            "input_text": spoken_text,
            "service": "elevenlabs",
            "voice_id": self.voice_id,
            "model_id": self.model_id,
            "voice_settings": self.voice_settings,
        }

        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_data_hash(input_data) + ".mp3"
        else:
            audio_path = path

        # Generate audio file using ElevenLabs API
        full_audio_path = str(Path(cache_dir) / audio_path)
        written_path = self.text_to_speech(spoken_text, full_audio_path) or full_audio_path

        # The silent fallback may have produced a file with another suffix;
        # record the file that really exists rather than the one requested.
        written_suffix = Path(written_path).suffix
        if written_suffix != Path(audio_path).suffix:
            audio_path = str(Path(audio_path).with_suffix(written_suffix))
            # NOTE(review): a fallback result is returned like a normal one, so
            # the caller may persist it in the cache — confirm that is desired.

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }