Spaces:

CarsaAI
/

carsa_api

Running

App Files Files Community

carsa_api / tts_engine.py

athmontech

Initial commit: Carsa AI Backend for Hugging Face Spaces

d01de5d 2 months ago

raw

history blame contribute delete

12.7 kB

	"""
	Text-to-Speech Engine for Voice Assistant

	A complete, self-contained Python class that provides high-quality text-to-speech
	synthesis using the Coqui TTS library with multi-speaker support.

	Author: Voice Assistant Team
	Version: 1.0.0
	"""

	import os
	import torch
	from TTS.api import TTS
	import logging

	# Configure logging when this module is imported: INFO level with a
	# "timestamp - level - message" format, then create the module-level logger
	# that every function and method below writes to.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)


	class TTSEngine:
	    """
	    Text-to-Speech engine built on the Coqui ``TTS`` API object.

	    The engine loads one model at construction time and exposes two synthesis
	    entry points: ``synthesize`` (writes a WAV file) and ``synthesize_to_bytes``
	    (returns WAV data in memory), plus small introspection helpers.

	    Attributes set by ``__init__``:
	        tts: The loaded ``TTS`` object, already moved to ``device``.
	        model_name (str): The model name that was requested.
	        device (str): ``"cuda"`` when torch reports a GPU, otherwise ``"cpu"``.
	    """

	    def __init__(self, model_name="tts_models/en/vctk/vits"):
	        """
	        Initialize the TTS engine with a pre-trained model.

	        Args:
	            model_name (str): The name of the TTS model to load.
	                Default: "tts_models/en/vctk/vits".

	        Raises:
	            Exception: If model loading fails. The original error is attached
	                as ``__cause__``.
	        """
	        try:
	            logger.info("Initializing TTS Engine...")

	            # Pick the device once and reuse the answer for the log branches.
	            gpu_available = torch.cuda.is_available()
	            device = "cuda" if gpu_available else "cpu"
	            logger.info(f"Using device: {device}")
	            if gpu_available:
	                logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
	                total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
	                logger.info(f"GPU Memory: {total_gb:.1f} GB")
	            else:
	                logger.warning("No GPU detected - using CPU")

	            # Load the TTS model and move it to the selected device.
	            logger.info(f"Loading model: {model_name}")
	            self.tts = TTS(model_name=model_name).to(device)

	            # Store model information for get_model_info().
	            self.model_name = model_name
	            self.device = device

	            logger.info("✅ TTS Engine initialized successfully!")
	            logger.info(f" Model: {model_name}")
	            logger.info(f" Device: {device}")

	            # Log a short sample of speakers when the model exposes any.
	            speakers = getattr(self.tts, "speakers", None)
	            if speakers:
	                logger.info(f" Available speakers: {len(speakers)}")
	                logger.info(f" Sample speakers: {list(speakers)[:5]}...")

	        except Exception as e:
	            logger.error(f"❌ Failed to initialize TTS Engine: {str(e)}")
	            # Chain the cause so the original traceback is not lost.
	            raise Exception(f"TTS Engine initialization failed: {str(e)}") from e

	    @staticmethod
	    def _require_text(text):
	        """Raise ValueError when ``text`` is empty or only whitespace."""
	        if not text or not text.strip():
	            raise ValueError("Text cannot be empty")

	    @staticmethod
	    def _with_wav_extension(output_path):
	        """Return ``output_path``, appending '.wav' unless it already ends with it (case-insensitive)."""
	        if output_path.lower().endswith('.wav'):
	            return output_path
	        return output_path + '.wav'

	    @staticmethod
	    def _synthesis_kwargs(text, speaker, language):
	        """Build the keyword arguments for the model call; ``language`` is included only when truthy."""
	        kwargs = {"text": text, "speaker": speaker}
	        if language:
	            kwargs["language"] = language
	        return kwargs

	    def synthesize(self, text, output_path, speaker="p225", language=None):
	        """
	        Synthesize speech from text and save it to a WAV file.

	        Args:
	            text (str): The text to convert to speech. Must not be empty or
	                whitespace-only.
	            output_path (str): File path for the generated audio. '.wav' is
	                appended when the path does not already end with it, and a
	                missing parent directory is created.
	            speaker (str): Speaker ID passed to the model (default: "p225").
	                NOTE(review): the speaker is always forwarded, including for
	                models without multiple speakers - confirm the loaded model
	                accepts it.
	            language (str): Language code. When falsy, no ``language``
	                argument is passed to the model.

	        Returns:
	            bool: True when the output file exists after synthesis. Failure is
	            always signalled by an exception, never by returning False.

	        Raises:
	            Exception: If validation or synthesis fails, or the output file is
	                missing afterwards. The original error is attached as
	                ``__cause__``.
	        """
	        try:
	            # Validate input.
	            self._require_text(text)
	            if not output_path:
	                raise ValueError("Output path cannot be empty")

	            # Ensure the output directory exists. exist_ok=True keeps this safe
	            # if another process creates the directory between check and call.
	            output_dir = os.path.dirname(output_path)
	            if output_dir and not os.path.exists(output_dir):
	                os.makedirs(output_dir, exist_ok=True)
	                logger.info(f"Created output directory: {output_dir}")

	            output_path = self._with_wav_extension(output_path)

	            logger.info("Synthesizing speech...")
	            logger.info(f" Text: {text[:50]}{'...' if len(text) > 50 else ''}")
	            logger.info(f" Speaker: {speaker}")
	            logger.info(f" Output: {output_path}")

	            # Perform text-to-speech synthesis.
	            self.tts.tts_to_file(
	                file_path=output_path,
	                **self._synthesis_kwargs(text, speaker, language)
	            )

	            # Verify the file was created.
	            if not os.path.exists(output_path):
	                raise Exception("Output file was not created")

	            file_size = os.path.getsize(output_path)
	            logger.info("✅ Speech synthesis completed successfully!")
	            logger.info(f" Output file: {output_path}")
	            logger.info(f" File size: {file_size} bytes")
	            return True

	        except Exception as e:
	            logger.error(f"❌ Speech synthesis failed: {str(e)}")
	            raise Exception(f"Speech synthesis failed: {str(e)}") from e

	    def get_available_speakers(self):
	        """
	        Get the list of speakers exposed by the loaded model.

	        Returns:
	            list: Speaker IDs, or an empty list when the model exposes none or
	            when reading them fails (the failure is logged, not raised).
	        """
	        try:
	            speakers = getattr(self.tts, "speakers", None)
	            return list(speakers) if speakers else []
	        except Exception as e:
	            logger.error(f"Failed to get available speakers: {str(e)}")
	            return []

	    def get_model_info(self):
	        """
	        Get information about the loaded model.

	        Returns:
	            dict: Keys "model_name", "device", "available_speakers" and
	            "is_multi_speaker"; an empty dict when gathering the information
	            fails (the failure is logged, not raised).
	        """
	        try:
	            return {
	                "model_name": self.model_name,
	                "device": self.device,
	                "available_speakers": self.get_available_speakers(),
	                "is_multi_speaker": bool(getattr(self.tts, "speakers", None)),
	            }
	        except Exception as e:
	            logger.error(f"Failed to get model info: {str(e)}")
	            return {}

	    @property
	    def model(self):
	        """
	        Report whether the TTS model is loaded.

	        Returns:
	            bool: True when ``self.tts`` is set and is not None, False otherwise.
	        """
	        return getattr(self, "tts", None) is not None

	    def synthesize_to_bytes(self, text, speaker="p225", language=None):
	        """
	        Synthesize speech from text and return the audio as WAV bytes.

	        Args:
	            text (str): The text to convert to speech. Must not be empty or
	                whitespace-only.
	            speaker (str): Speaker ID passed to the model (default: "p225").
	            language (str): Language code. When falsy, no ``language``
	                argument is passed to the model.

	        Returns:
	            bytes: Audio data encoded as WAV.

	        Raises:
	            Exception: If validation, synthesis or encoding fails. The original
	                error is attached as ``__cause__``.
	        """
	        try:
	            # Import first so a missing soundfile fails before the (expensive)
	            # synthesis runs rather than after it.
	            import io
	            import soundfile as sf

	            # Validate input.
	            self._require_text(text)

	            logger.info("Synthesizing speech to bytes...")
	            logger.info(f" Text: {text[:50]}{'...' if len(text) > 50 else ''}")
	            logger.info(f" Speaker: {speaker}")

	            # Perform text-to-speech synthesis and get the raw audio data.
	            audio_data = self.tts.tts(**self._synthesis_kwargs(text, speaker, language))

	            # Encode the audio data as WAV into an in-memory buffer.
	            buffer = io.BytesIO()
	            sf.write(buffer, audio_data, self.tts.synthesizer.output_sample_rate, format='WAV')

	            # getvalue() returns the whole buffer regardless of the stream
	            # position, so no seek is needed; call it once to avoid a second copy.
	            wav_bytes = buffer.getvalue()

	            logger.info("✅ Speech synthesis to bytes completed successfully!")
	            logger.info(f" Audio size: {len(wav_bytes)} bytes")

	            return wav_bytes

	        except Exception as e:
	            logger.error(f"❌ Speech synthesis to bytes failed: {str(e)}")
	            raise Exception(f"Speech synthesis to bytes failed: {str(e)}") from e


	# =============================================================================
	# CUSTOM VOICE CLONING INITIALIZATION (COMMENTED OUT)
	# =============================================================================
	"""
	# Alternative __init__ method for custom voice cloning
	# Uncomment and modify this section when you have a custom cloned voice model

	def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"):
	'''
	Initialize the TTS engine with a custom cloned voice model.

	Args:
	model_path (str): Path to the custom TTS model directory
	speaker_wav (str): Path to the speaker reference audio file

	Raises:
	Exception: If model loading fails
	'''
	try:
	logger.info("Initializing TTS Engine with custom voice model...")

	# Check for GPU availability
	device = "cuda" if torch.cuda.is_available() else "cpu"
	logger.info(f"Using device: {device}")

	# Load the custom TTS model
	logger.info(f"Loading custom model from: {model_path}")
	self.tts = TTS(model_path=model_path).to(device)

	# Store model information
	self.model_path = model_path
	self.speaker_wav = speaker_wav
	self.device = device

	logger.info(f"✅ Custom TTS Engine initialized successfully!")
	logger.info(f" Model path: {model_path}")
	logger.info(f" Speaker file: {speaker_wav}")
	logger.info(f" Device: {device}")

	except Exception as e:
	logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}")
	raise Exception(f"Custom TTS Engine initialization failed: {str(e)}")

	# Custom synthesis method for voice cloning
	def synthesize_with_cloned_voice(self, text, output_path):
	'''
	Synthesize speech using the cloned voice.

	Args:
	text (str): The text to convert to speech
	output_path (str): File path to save the generated audio

	Returns:
	bool: True if synthesis was successful
	'''
	try:
	logger.info(f"Synthesizing speech with cloned voice...")

	# Perform text-to-speech synthesis with cloned voice
	self.tts.tts_to_file(
	text=text,
	speaker_wav=self.speaker_wav,
	file_path=output_path
	)

	logger.info(f"✅ Cloned voice synthesis completed!")
	return True

	except Exception as e:
	logger.error(f"❌ Cloned voice synthesis failed: {str(e)}")
	raise Exception(f"Cloned voice synthesis failed: {str(e)}")
	"""


	def main():
	    """
	    Run a small end-to-end demo of TTSEngine.

	    Builds an engine with the default model, logs its model information, and
	    synthesizes one fixed sentence to "test_output.wav" with speaker "p225".
	    Any exception is logged and swallowed; nothing is returned.
	    """
	    # Fixed demo inputs; assigning them cannot fail, so they live outside the try.
	    sample_sentence = "Hello! This is a test of the text-to-speech engine. The voice synthesis is working perfectly."
	    wav_name = "test_output.wav"

	    try:
	        logger.info("Creating TTS Engine instance...")
	        engine = TTSEngine()

	        # Show what was loaded.
	        logger.info(f"Model Information: {engine.get_model_info()}")

	        # Synthesize with a specific speaker from the VCTK dataset.
	        synthesized = engine.synthesize(
	            text=sample_sentence,
	            output_path=wav_name,
	            speaker="p225"
	        )

	        # Guard clause: report the failure and stop.
	        if not synthesized:
	            logger.error("❌ Test failed!")
	            return

	        logger.info("🎉 Test completed successfully!")
	        logger.info(f"Check the generated audio file: {wav_name}")

	    except Exception as err:
	        logger.error(f"❌ Example usage failed: {str(err)}")


	# Script entry point: run the demo only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()