# Source: carsa_api/tts_engine.py
# Commit d01de5d by athmontech: "Initial commit: Carsa AI Backend for Hugging Face Spaces"
"""
Text-to-Speech Engine for Voice Assistant
A complete, self-contained Python class that provides high-quality text-to-speech
synthesis using the Coqui TTS library with multi-speaker support.
Author: Voice Assistant Team
Version: 1.0.0
"""
import os
import torch
from TTS.api import TTS
import logging
# Set up root logging at import time (INFO and above, timestamped records),
# then create the module-level logger used throughout this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class TTSEngine:
    """
    A high-quality Text-to-Speech engine using the Coqui TTS library.

    The engine loads one pre-trained model at construction time, moves it to
    the GPU when CUDA is available, and can synthesize speech either to a
    ``.wav`` file (``synthesize``) or to in-memory WAV bytes
    (``synthesize_to_bytes``).

    Attributes set by ``__init__``:
        tts: The loaded ``TTS`` model object.
        model_name (str): Name of the model that was loaded.
        device (str): ``"cuda"`` or ``"cpu"``.
    """

    def __init__(self, model_name="tts_models/en/vctk/vits"):
        """
        Initialize the TTS engine with a pre-trained model.

        Args:
            model_name (str): The name of the TTS model to load.
                Default: "tts_models/en/vctk/vits" (multi-speaker English)

        Raises:
            Exception: If model loading fails. The original error is attached
                as ``__cause__``.
        """
        try:
            logger.info("Initializing TTS Engine...")

            # Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
            cuda_available = torch.cuda.is_available()
            device = "cuda" if cuda_available else "cpu"
            logger.info(f"Using device: {device}")
            if cuda_available:
                logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
                total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
                logger.info(f"GPU Memory: {total_gb:.1f} GB")
            else:
                logger.warning("No GPU detected - using CPU")

            # Load the TTS model and move it to the selected device.
            logger.info(f"Loading model: {model_name}")
            self.tts = TTS(model_name=model_name).to(device)

            # Store model information for get_model_info().
            self.model_name = model_name
            self.device = device

            logger.info("✅ TTS Engine initialized successfully!")
            logger.info(f" Model: {model_name}")
            logger.info(f" Device: {device}")

            # Report available speakers if it's a multi-speaker model.
            speakers = self.get_available_speakers()
            if speakers:
                logger.info(f" Available speakers: {len(speakers)}")
                logger.info(f" Sample speakers: {speakers[:5]}...")
        except Exception as e:
            logger.error(f"❌ Failed to initialize TTS Engine: {str(e)}")
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"TTS Engine initialization failed: {str(e)}") from e

    @staticmethod
    def _require_text(text):
        """Raise ValueError when *text* is empty or whitespace-only."""
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

    @staticmethod
    def _preview(text, limit=50):
        """Return at most *limit* characters of *text*, with '...' if truncated."""
        return f"{text[:limit]}{'...' if len(text) > limit else ''}"

    def _build_tts_kwargs(self, text, speaker, language):
        """
        Assemble the keyword arguments shared by ``tts`` and ``tts_to_file``.

        ``speaker`` is forwarded only when the loaded model reports speakers:
        Coqui's API rejects a ``speaker`` argument for single-speaker models,
        so the default "p225" would otherwise make every call fail for them.
        ``language`` is forwarded only when a truthy value was given.
        """
        kwargs = {"text": text}
        if self.get_available_speakers():
            kwargs["speaker"] = speaker
        elif speaker is not None:
            logger.info(f" Model exposes no speakers - ignoring speaker: {speaker}")
        if language:
            kwargs["language"] = language
        return kwargs

    def synthesize(self, text, output_path, speaker="p225", language=None) -> bool:
        """
        Synthesize speech from text and save it to a file.

        Args:
            text (str): The text to convert to speech
            output_path (str): File path to save the generated audio. A
                ".wav" suffix is appended when it is missing.
            speaker (str): Speaker ID for multi-speaker models (default: "p225").
                Ignored when the loaded model exposes no speakers.
            language (str): Language code (optional; omitted from the call if None)

        Returns:
            bool: True when the output file was written. Failures are reported
            by raising, never by returning False.

        Raises:
            Exception: If validation or synthesis fails. The original error is
                attached as ``__cause__``.
        """
        try:
            # Validate input
            self._require_text(text)
            if not output_path:
                raise ValueError("Output path cannot be empty")

            # Ensure output directory exists. exist_ok avoids a crash when
            # another process creates the directory between check and call.
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir, exist_ok=True)
                logger.info(f"Created output directory: {output_dir}")

            # Ensure output path has .wav extension
            if not output_path.lower().endswith('.wav'):
                output_path += '.wav'

            logger.info("Synthesizing speech...")
            logger.info(f" Text: {self._preview(text)}")
            logger.info(f" Speaker: {speaker}")
            logger.info(f" Output: {output_path}")

            # Perform text-to-speech synthesis
            self.tts.tts_to_file(
                file_path=output_path,
                **self._build_tts_kwargs(text, speaker, language),
            )

            # Verify the file was created
            if not os.path.exists(output_path):
                raise RuntimeError("Output file was not created")

            file_size = os.path.getsize(output_path)
            logger.info("✅ Speech synthesis completed successfully!")
            logger.info(f" Output file: {output_path}")
            logger.info(f" File size: {file_size} bytes")
            return True
        except Exception as e:
            logger.error(f"❌ Speech synthesis failed: {str(e)}")
            raise Exception(f"Speech synthesis failed: {str(e)}") from e

    def get_available_speakers(self) -> list:
        """
        Get the list of available speakers for the loaded model.

        Returns:
            list: Available speaker IDs, or an empty list if the model reports
            no speakers or the lookup fails (the failure is logged).
        """
        try:
            speakers = getattr(self.tts, 'speakers', None)
            return list(speakers) if speakers else []
        except Exception as e:
            logger.error(f"Failed to get available speakers: {str(e)}")
            return []

    def get_model_info(self) -> dict:
        """
        Get information about the loaded model.

        Returns:
            dict: Keys "model_name", "device", "available_speakers" and
            "is_multi_speaker"; an empty dict if gathering the info fails.
        """
        try:
            speakers = self.get_available_speakers()
            return {
                "model_name": self.model_name,
                "device": self.device,
                "available_speakers": speakers,
                "is_multi_speaker": hasattr(self.tts, 'speakers') and bool(self.tts.speakers),
            }
        except Exception as e:
            logger.error(f"Failed to get model info: {str(e)}")
            return {}

    @property
    def model(self) -> bool:
        """
        Report whether the TTS model is loaded.

        Returns:
            bool: True if ``self.tts`` exists and is not None, False otherwise
        """
        return hasattr(self, 'tts') and self.tts is not None

    def synthesize_to_bytes(self, text, speaker="p225", language=None) -> bytes:
        """
        Synthesize speech from text and return the audio bytes directly.

        Args:
            text (str): The text to convert to speech
            speaker (str): Speaker ID for multi-speaker models (default: "p225").
                Ignored when the loaded model exposes no speakers.
            language (str): Language code (optional; omitted from the call if None)

        Returns:
            bytes: Audio data encoded as WAV

        Raises:
            Exception: If validation or synthesis fails. The original error is
                attached as ``__cause__``.
        """
        try:
            # Validate input
            self._require_text(text)

            # Imported up front so a missing soundfile install fails before
            # the (expensive) synthesis runs instead of after it.
            import io
            import soundfile as sf

            logger.info("Synthesizing speech to bytes...")
            logger.info(f" Text: {self._preview(text)}")
            logger.info(f" Speaker: {speaker}")

            # Perform text-to-speech synthesis and get the raw audio samples
            audio_data = self.tts.tts(**self._build_tts_kwargs(text, speaker, language))

            # Encode the samples as WAV into an in-memory buffer
            buffer = io.BytesIO()
            sf.write(buffer, audio_data, self.tts.synthesizer.output_sample_rate, format='WAV')
            wav_bytes = buffer.getvalue()

            logger.info("✅ Speech synthesis to bytes completed successfully!")
            logger.info(f" Audio size: {len(wav_bytes)} bytes")
            return wav_bytes
        except Exception as e:
            logger.error(f"❌ Speech synthesis to bytes failed: {str(e)}")
            raise Exception(f"Speech synthesis to bytes failed: {str(e)}") from e
# =============================================================================
# CUSTOM VOICE CLONING INITIALIZATION (COMMENTED OUT)
# =============================================================================
"""
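# Alternative __init__ method for custom voice cloning.
# Uncomment and adapt this section once you have a custom cloned voice model.
# Note: a class keeps only its last __init__ definition, so this version is meant
# to replace the default __init__ rather than sit alongside it.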
def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"):
'''
Initialize the TTS engine with a custom cloned voice model.
Args:
model_path (str): Path to the custom TTS model directory
speaker_wav (str): Path to the speaker reference audio file
Raises:
Exception: If model loading fails
'''
try:
logger.info("Initializing TTS Engine with custom voice model...")
# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")
# Load the custom TTS model
logger.info(f"Loading custom model from: {model_path}")
self.tts = TTS(model_path=model_path).to(device)
# Store model information
self.model_path = model_path
self.speaker_wav = speaker_wav
self.device = device
logger.info(f"βœ… Custom TTS Engine initialized successfully!")
logger.info(f" Model path: {model_path}")
logger.info(f" Speaker file: {speaker_wav}")
logger.info(f" Device: {device}")
except Exception as e:
logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}")
raise Exception(f"Custom TTS Engine initialization failed: {str(e)}")
# Custom synthesis method for voice cloning
def synthesize_with_cloned_voice(self, text, output_path):
'''
Synthesize speech using the cloned voice.
Args:
text (str): The text to convert to speech
output_path (str): File path to save the generated audio
Returns:
bool: True if synthesis was successful
'''
try:
logger.info(f"Synthesizing speech with cloned voice...")
# Perform text-to-speech synthesis with cloned voice
self.tts.tts_to_file(
text=text,
speaker_wav=self.speaker_wav,
file_path=output_path
)
logger.info(f"βœ… Cloned voice synthesis completed!")
return True
except Exception as e:
logger.error(f"❌ Cloned voice synthesis failed: {str(e)}")
raise Exception(f"Cloned voice synthesis failed: {str(e)}")
"""
def main():
    """
    Example usage of the TTSEngine class.

    Builds an engine with the default model, logs its metadata, and
    synthesizes a short test sentence to ``test_output.wav``. Any failure is
    logged together with its traceback instead of being raised, so running
    the module as a script never propagates an exception.
    """
    try:
        # Create TTS engine instance
        logger.info("Creating TTS Engine instance...")
        tts_engine = TTSEngine()

        # Display model information
        model_info = tts_engine.get_model_info()
        logger.info(f"Model Information: {model_info}")

        # Test text for synthesis
        test_text = (
            "Hello! This is a test of the text-to-speech engine. "
            "The voice synthesis is working perfectly."
        )

        # Synthesize speech
        output_file = "test_output.wav"
        success = tts_engine.synthesize(
            text=test_text,
            output_path=output_file,
            speaker="p225",  # Using a specific speaker from the VCTK dataset
        )

        if success:
            logger.info("🎉 Test completed successfully!")
            logger.info(f"Check the generated audio file: {output_file}")
        else:
            logger.error("❌ Test failed!")
    except Exception as e:
        # logger.exception keeps the traceback, which str(e) alone would drop.
        logger.exception(f"❌ Example usage failed: {str(e)}")


if __name__ == "__main__":
    main()