""" Text-to-Speech Engine for Voice Assistant A complete, self-contained Python class that provides high-quality text-to-speech synthesis using the Coqui TTS library with multi-speaker support. Author: Voice Assistant Team Version: 1.0.0 """ import os import torch from TTS.api import TTS import logging # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class TTSEngine: """ A high-quality Text-to-Speech engine using Coqui TTS library. This class provides text-to-speech synthesis capabilities with support for multi-speaker models and custom voice cloning. """ def __init__(self, model_name="tts_models/en/vctk/vits"): """ Initialize the TTS engine with a pre-trained model. Args: model_name (str): The name of the TTS model to load. Default: "tts_models/en/vctk/vits" (multi-speaker English) Raises: Exception: If model loading fails """ try: logger.info("Initializing TTS Engine...") # Check for GPU availability device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") if torch.cuda.is_available(): logger.info(f"GPU: {torch.cuda.get_device_name(0)}") logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB") else: logger.warning("No GPU detected - using CPU") # Load the TTS model logger.info(f"Loading model: {model_name}") self.tts = TTS(model_name=model_name).to(device) # Store model information self.model_name = model_name self.device = device logger.info(f"✅ TTS Engine initialized successfully!") logger.info(f" Model: {model_name}") logger.info(f" Device: {device}") # Print available speakers if it's a multi-speaker model if hasattr(self.tts, 'speakers') and self.tts.speakers: logger.info(f" Available speakers: {len(self.tts.speakers)}") logger.info(f" Sample speakers: {list(self.tts.speakers)[:5]}...") except Exception as e: logger.error(f"❌ Failed to initialize TTS Engine: {str(e)}") raise Exception(f"TTS Engine initialization failed: {str(e)}") def synthesize(self, text, output_path, speaker="p225", language=None): """ Synthesize speech from text and save to file. Args: text (str): The text to convert to speech output_path (str): File path to save the generated audio (.wav) speaker (str): Speaker ID for multi-speaker models (default: "p225") language (str): Language code (optional, auto-detected if None) Returns: bool: True if synthesis was successful, False otherwise Raises: Exception: If synthesis fails """ try: # Validate input if not text or not text.strip(): raise ValueError("Text cannot be empty") if not output_path: raise ValueError("Output path cannot be empty") # Ensure output directory exists output_dir = os.path.dirname(output_path) if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir) logger.info(f"Created output directory: {output_dir}") # Ensure output path has .wav extension if not output_path.lower().endswith('.wav'): output_path += '.wav' logger.info(f"Synthesizing speech...") logger.info(f" Text: {text[:50]}{'...' if len(text) > 50 else ''}") logger.info(f" Speaker: {speaker}") logger.info(f" Output: {output_path}") # Perform text-to-speech synthesis if language: # With explicit language self.tts.tts_to_file( text=text, speaker=speaker, language=language, file_path=output_path ) else: # Auto-detect language self.tts.tts_to_file( text=text, speaker=speaker, file_path=output_path ) # Verify the file was created if os.path.exists(output_path): file_size = os.path.getsize(output_path) logger.info(f"✅ Speech synthesis completed successfully!") logger.info(f" Output file: {output_path}") logger.info(f" File size: {file_size} bytes") return True else: raise Exception("Output file was not created") except Exception as e: logger.error(f"❌ Speech synthesis failed: {str(e)}") raise Exception(f"Speech synthesis failed: {str(e)}") def get_available_speakers(self): """ Get list of available speakers for the loaded model. Returns: list: List of available speaker IDs, or empty list if not a multi-speaker model """ try: if hasattr(self.tts, 'speakers') and self.tts.speakers: return list(self.tts.speakers) else: return [] except Exception as e: logger.error(f"Failed to get available speakers: {str(e)}") return [] def get_model_info(self): """ Get information about the loaded model. Returns: dict: Dictionary containing model information """ try: info = { "model_name": self.model_name, "device": self.device, "available_speakers": self.get_available_speakers(), "is_multi_speaker": hasattr(self.tts, 'speakers') and bool(self.tts.speakers) } return info except Exception as e: logger.error(f"Failed to get model info: {str(e)}") return {} @property def model(self): """ Property to check if the TTS model is loaded. Returns: bool: True if model is loaded, False otherwise """ return hasattr(self, 'tts') and self.tts is not None def synthesize_to_bytes(self, text, speaker="p225", language=None): """ Synthesize speech from text and return audio bytes directly. Args: text (str): The text to convert to speech speaker (str): Speaker ID for multi-speaker models (default: "p225") language (str): Language code (optional, auto-detected if None) Returns: bytes: Audio data as WAV bytes Raises: Exception: If synthesis fails """ try: # Validate input if not text or not text.strip(): raise ValueError("Text cannot be empty") logger.info(f"Synthesizing speech to bytes...") logger.info(f" Text: {text[:50]}{'...' if len(text) > 50 else ''}") logger.info(f" Speaker: {speaker}") # Perform text-to-speech synthesis and get audio data if language: # With explicit language audio_data = self.tts.tts( text=text, speaker=speaker, language=language ) else: # Auto-detect language audio_data = self.tts.tts( text=text, speaker=speaker ) # Convert audio data to WAV bytes import io import soundfile as sf audio_bytes = io.BytesIO() sf.write(audio_bytes, audio_data, self.tts.synthesizer.output_sample_rate, format='WAV') audio_bytes.seek(0) logger.info(f"✅ Speech synthesis to bytes completed successfully!") logger.info(f" Audio size: {len(audio_bytes.getvalue())} bytes") return audio_bytes.getvalue() except Exception as e: logger.error(f"❌ Speech synthesis to bytes failed: {str(e)}") raise Exception(f"Speech synthesis to bytes failed: {str(e)}") # ============================================================================= # CUSTOM VOICE CLONING INITIALIZATION (COMMENTED OUT) # ============================================================================= """ # Alternative __init__ method for custom voice cloning # Uncomment and modify this section when you have a custom cloned voice model def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"): ''' Initialize the TTS engine with a custom cloned voice model. Args: model_path (str): Path to the custom TTS model directory speaker_wav (str): Path to the speaker reference audio file Raises: Exception: If model loading fails ''' try: logger.info("Initializing TTS Engine with custom voice model...") # Check for GPU availability device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") # Load the custom TTS model logger.info(f"Loading custom model from: {model_path}") self.tts = TTS(model_path=model_path).to(device) # Store model information self.model_path = model_path self.speaker_wav = speaker_wav self.device = device logger.info(f"✅ Custom TTS Engine initialized successfully!") logger.info(f" Model path: {model_path}") logger.info(f" Speaker file: {speaker_wav}") logger.info(f" Device: {device}") except Exception as e: logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}") raise Exception(f"Custom TTS Engine initialization failed: {str(e)}") # Custom synthesis method for voice cloning def synthesize_with_cloned_voice(self, text, output_path): ''' Synthesize speech using the cloned voice. Args: text (str): The text to convert to speech output_path (str): File path to save the generated audio Returns: bool: True if synthesis was successful ''' try: logger.info(f"Synthesizing speech with cloned voice...") # Perform text-to-speech synthesis with cloned voice self.tts.tts_to_file( text=text, speaker_wav=self.speaker_wav, file_path=output_path ) logger.info(f"✅ Cloned voice synthesis completed!") return True except Exception as e: logger.error(f"❌ Cloned voice synthesis failed: {str(e)}") raise Exception(f"Cloned voice synthesis failed: {str(e)}") """ def main(): """Example usage of the TTSEngine class.""" try: # Create TTS engine instance logger.info("Creating TTS Engine instance...") tts_engine = TTSEngine() # Display model information model_info = tts_engine.get_model_info() logger.info(f"Model Information: {model_info}") # Test text for synthesis test_text = "Hello! This is a test of the text-to-speech engine. The voice synthesis is working perfectly." # Synthesize speech output_file = "test_output.wav" success = tts_engine.synthesize( text=test_text, output_path=output_file, speaker="p225" # Using a specific speaker from the VCTK dataset ) if success: logger.info("🎉 Test completed successfully!") logger.info(f"Check the generated audio file: {output_file}") else: logger.error("❌ Test failed!") except Exception as e: logger.error(f"❌ Example usage failed: {str(e)}") if __name__ == "__main__": main()