Spaces:

ManojINaik
/

menamiai

Sleeping

menamiai / src /utils /elevenlabs_voiceover.py

Deploy: Fix all errors & make HF Spaces ready - Fixed Gradio interface, frame constants, ElevenLabs API, Arrow3D parameters, added comprehensive error handling and demo mode. All deployment tests passing!

92ea014 4 months ago

raw

history blame contribute delete

7.7 kB

	"""
	Copyright (c) 2025 Xposed73
	All rights reserved.
	This file is part of the Manim Voiceover project.
	"""

	import hashlib
	import json
	import requests
	import os
	from pathlib import Path
	from manim_voiceover.services.base import SpeechService
	from manim_voiceover.helper import remove_bookmarks
	from src.config.config import Config
	import time


	class ElevenLabsService(SpeechService):
	"""Speech service class for ElevenLabs TTS integration."""

	def __init__(self,
	api_key: str = None,
	voice_id: str = None,
	model_id: str = "eleven_multilingual_v2",
	voice_settings: dict = None,
	**kwargs):
	"""
	Initialize ElevenLabs service.

	Args:
	api_key: ElevenLabs API key (defaults to ELEVENLABS_API_KEY env var)
	voice_id: Voice ID to use (defaults to ELEVENLABS_DEFAULT_VOICE_ID env var)
	model_id: Model ID to use for generation
	voice_settings: Voice settings dict with stability, similarity_boost, style, use_speaker_boost
	"""
	self.api_key = api_key or Config.ELEVENLABS_API_KEY
	self.voice_id = voice_id or Config.ELEVENLABS_DEFAULT_VOICE_ID
	self.model_id = model_id

	# Default voice settings
	default_settings = {
	"stability": 0.5,
	"similarity_boost": 0.75,
	"style": 0.0,
	"use_speaker_boost": True
	}
	self.voice_settings = voice_settings or default_settings

	if not self.api_key:
	raise ValueError("ElevenLabs API key not found. Please set ELEVENLABS_API_KEY environment variable.")
	if not self.voice_id:
	raise ValueError("ElevenLabs voice ID not found. Please set ELEVENLABS_DEFAULT_VOICE_ID environment variable.")

	super().__init__(**kwargs)

	def get_data_hash(self, input_data: dict) -> str:
	"""
	Generates a hash based on the input data dictionary.
	The hash is used to create a unique identifier for the input data.

	Parameters:
	input_data (dict): A dictionary of input data (e.g., text, voice, etc.).

	Returns:
	str: The generated hash as a string.
	"""
	# Convert the input data dictionary to a JSON string (sorted for consistency)
	data_str = json.dumps(input_data, sort_keys=True)
	# Generate a SHA-256 hash of the JSON string
	return hashlib.sha256(data_str.encode('utf-8')).hexdigest()

	def text_to_speech(self, text: str, output_file: str) -> str:
	"""
	Generate audio using ElevenLabs API with robust error handling.

	Args:
	text (str): Text to synthesize
	output_file (str): Path to save the audio file

	Returns:
	str: Path to the generated audio file

	Raises:
	Exception: If API request fails after retries
	"""
	url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"

	headers = {
	"Accept": "audio/mpeg",
	"Content-Type": "application/json",
	"xi-api-key": self.api_key
	}

	data = {
	"text": text,
	"model_id": "eleven_monolingual_v1",
	"voice_settings": {
	"stability": 0.5,
	"similarity_boost": 0.8
	}
	}

	max_retries = 3
	retry_delay = 1

	for attempt in range(max_retries):
	try:
	response = requests.post(url, json=data, headers=headers, timeout=30)
	response.raise_for_status()

	# Save the audio file
	with open(output_file, 'wb') as f:
	f.write(response.content)

	return output_file

	except requests.exceptions.ConnectionError as e:
	print(f"Connection error (attempt {attempt + 1}/{max_retries}): {e}")
	if attempt < max_retries - 1:
	time.sleep(retry_delay * (attempt + 1))
	continue
	# If all retries failed, create a silent audio file as fallback
	self._create_silent_audio(output_file, duration=len(text) * 0.1) # Rough estimate
	return output_file

	except requests.exceptions.Timeout as e:
	print(f"Timeout error (attempt {attempt + 1}/{max_retries}): {e}")
	if attempt < max_retries - 1:
	time.sleep(retry_delay * (attempt + 1))
	continue
	self._create_silent_audio(output_file, duration=len(text) * 0.1)
	return output_file

	except requests.exceptions.RequestException as e:
	print(f"Request error (attempt {attempt + 1}/{max_retries}): {e}")
	if attempt < max_retries - 1:
	time.sleep(retry_delay * (attempt + 1))
	continue
	self._create_silent_audio(output_file, duration=len(text) * 0.1)
	return output_file

	# This should not be reached, but added for safety
	self._create_silent_audio(output_file, duration=len(text) * 0.1)
	return output_file

	def _create_silent_audio(self, output_file: str, duration: float):
	"""Create a silent audio file as fallback when API fails."""
	try:
	import numpy as np
	from scipy.io import wavfile

	sample_rate = 22050
	samples = int(sample_rate * duration)
	silence = np.zeros(samples, dtype=np.float32)

	# Convert to appropriate format for wav
	silence_int = (silence * 32767).astype(np.int16)
	wavfile.write(output_file.replace('.mp3', '.wav'), sample_rate, silence_int)

	print(f"Created silent audio fallback: {output_file}")

	except Exception as e:
	print(f"Failed to create silent audio: {e}")
	# Create an empty file as last resort
	with open(output_file, 'w') as f:
	f.write("")

	def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
	"""
	Generate audio from text with caching support.

	Args:
	text: Text to convert to speech
	cache_dir: Directory for caching audio files
	path: Optional specific path for the audio file

	Returns:
	Dictionary with audio generation details
	"""
	if cache_dir is None:
	cache_dir = self.cache_dir

	input_data = {
	"input_text": text,
	"service": "elevenlabs",
	"voice_id": self.voice_id,
	"model_id": self.model_id,
	"voice_settings": self.voice_settings
	}

	cached_result = self.get_cached_result(input_data, cache_dir)
	if cached_result is not None:
	return cached_result

	if path is None:
	audio_path = self.get_data_hash(input_data) + ".mp3"
	else:
	audio_path = path

	# Generate audio file using ElevenLabs API
	full_audio_path = str(Path(cache_dir) / audio_path)
	self.text_to_speech(text, full_audio_path)

	json_dict = {
	"input_text": text,
	"input_data": input_data,
	"original_audio": audio_path,
	}

	return json_dict