"""
Text-to-Speech Engine for Voice Assistant

A complete, self-contained Python class that provides high-quality text-to-speech
synthesis using the Coqui TTS library with multi-speaker support.

Author: Voice Assistant Team
Version: 1.0.0
"""

import io
import logging
import os

import torch
from TTS.api import TTS

						|  | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | 
					
						
						|  | logger = logging.getLogger(__name__) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class TTSEngine: | 
					
						
						|  | """ | 
					
						
						|  | A high-quality Text-to-Speech engine using Coqui TTS library. | 
					
						
						|  |  | 
					
						
						|  | This class provides text-to-speech synthesis capabilities with support for | 
					
						
						|  | multi-speaker models and custom voice cloning. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def __init__(self, model_name="tts_models/en/vctk/vits"): | 
					
						
						|  | """ | 
					
						
						|  | Initialize the TTS engine with a pre-trained model. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | model_name (str): The name of the TTS model to load. | 
					
						
						|  | Default: "tts_models/en/vctk/vits" (multi-speaker English) | 
					
						
						|  |  | 
					
						
						|  | Raises: | 
					
						
						|  | Exception: If model loading fails | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  | logger.info("Initializing TTS Engine...") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | device = "cuda" if torch.cuda.is_available() else "cpu" | 
					
						
						|  | logger.info(f"Using device: {device}") | 
					
						
						|  | if torch.cuda.is_available(): | 
					
						
						|  | logger.info(f"GPU: {torch.cuda.get_device_name(0)}") | 
					
						
						|  | logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB") | 
					
						
						|  | else: | 
					
						
						|  | logger.warning("No GPU detected - using CPU") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"Loading model: {model_name}") | 
					
						
						|  | self.tts = TTS(model_name=model_name).to(device) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.model_name = model_name | 
					
						
						|  | self.device = device | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"β
 TTS Engine initialized successfully!") | 
					
						
						|  | logger.info(f"   Model: {model_name}") | 
					
						
						|  | logger.info(f"   Device: {device}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if hasattr(self.tts, 'speakers') and self.tts.speakers: | 
					
						
						|  | logger.info(f"   Available speakers: {len(self.tts.speakers)}") | 
					
						
						|  | logger.info(f"   Sample speakers: {list(self.tts.speakers)[:5]}...") | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"β Failed to initialize TTS Engine: {str(e)}") | 
					
						
						|  | raise Exception(f"TTS Engine initialization failed: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  | def synthesize(self, text, output_path, speaker="p225", language=None): | 
					
						
						|  | """ | 
					
						
						|  | Synthesize speech from text and save to file. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | text (str): The text to convert to speech | 
					
						
						|  | output_path (str): File path to save the generated audio (.wav) | 
					
						
						|  | speaker (str): Speaker ID for multi-speaker models (default: "p225") | 
					
						
						|  | language (str): Language code (optional, auto-detected if None) | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | bool: True if synthesis was successful, False otherwise | 
					
						
						|  |  | 
					
						
						|  | Raises: | 
					
						
						|  | Exception: If synthesis fails | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | if not text or not text.strip(): | 
					
						
						|  | raise ValueError("Text cannot be empty") | 
					
						
						|  |  | 
					
						
						|  | if not output_path: | 
					
						
						|  | raise ValueError("Output path cannot be empty") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | output_dir = os.path.dirname(output_path) | 
					
						
						|  | if output_dir and not os.path.exists(output_dir): | 
					
						
						|  | os.makedirs(output_dir) | 
					
						
						|  | logger.info(f"Created output directory: {output_dir}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if not output_path.lower().endswith('.wav'): | 
					
						
						|  | output_path += '.wav' | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"Synthesizing speech...") | 
					
						
						|  | logger.info(f"   Text: {text[:50]}{'...' if len(text) > 50 else ''}") | 
					
						
						|  | logger.info(f"   Speaker: {speaker}") | 
					
						
						|  | logger.info(f"   Output: {output_path}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if language: | 
					
						
						|  |  | 
					
						
						|  | self.tts.tts_to_file( | 
					
						
						|  | text=text, | 
					
						
						|  | speaker=speaker, | 
					
						
						|  | language=language, | 
					
						
						|  | file_path=output_path | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  |  | 
					
						
						|  | self.tts.tts_to_file( | 
					
						
						|  | text=text, | 
					
						
						|  | speaker=speaker, | 
					
						
						|  | file_path=output_path | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if os.path.exists(output_path): | 
					
						
						|  | file_size = os.path.getsize(output_path) | 
					
						
						|  | logger.info(f"β
 Speech synthesis completed successfully!") | 
					
						
						|  | logger.info(f"   Output file: {output_path}") | 
					
						
						|  | logger.info(f"   File size: {file_size} bytes") | 
					
						
						|  | return True | 
					
						
						|  | else: | 
					
						
						|  | raise Exception("Output file was not created") | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"β Speech synthesis failed: {str(e)}") | 
					
						
						|  | raise Exception(f"Speech synthesis failed: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  | def get_available_speakers(self): | 
					
						
						|  | """ | 
					
						
						|  | Get list of available speakers for the loaded model. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | list: List of available speaker IDs, or empty list if not a multi-speaker model | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  | if hasattr(self.tts, 'speakers') and self.tts.speakers: | 
					
						
						|  | return list(self.tts.speakers) | 
					
						
						|  | else: | 
					
						
						|  | return [] | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"Failed to get available speakers: {str(e)}") | 
					
						
						|  | return [] | 
					
						
						|  |  | 
					
						
						|  | def get_model_info(self): | 
					
						
						|  | """ | 
					
						
						|  | Get information about the loaded model. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | dict: Dictionary containing model information | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  | info = { | 
					
						
						|  | "model_name": self.model_name, | 
					
						
						|  | "device": self.device, | 
					
						
						|  | "available_speakers": self.get_available_speakers(), | 
					
						
						|  | "is_multi_speaker": hasattr(self.tts, 'speakers') and bool(self.tts.speakers) | 
					
						
						|  | } | 
					
						
						|  | return info | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"Failed to get model info: {str(e)}") | 
					
						
						|  | return {} | 
					
						
						|  |  | 
					
						
						|  | @property | 
					
						
						|  | def model(self): | 
					
						
						|  | """ | 
					
						
						|  | Property to check if the TTS model is loaded. | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | bool: True if model is loaded, False otherwise | 
					
						
						|  | """ | 
					
						
						|  | return hasattr(self, 'tts') and self.tts is not None | 
					
						
						|  |  | 
					
						
						|  | def synthesize_to_bytes(self, text, speaker="p225", language=None): | 
					
						
						|  | """ | 
					
						
						|  | Synthesize speech from text and return audio bytes directly. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | text (str): The text to convert to speech | 
					
						
						|  | speaker (str): Speaker ID for multi-speaker models (default: "p225") | 
					
						
						|  | language (str): Language code (optional, auto-detected if None) | 
					
						
						|  |  | 
					
						
						|  | Returns: | 
					
						
						|  | bytes: Audio data as WAV bytes | 
					
						
						|  |  | 
					
						
						|  | Raises: | 
					
						
						|  | Exception: If synthesis fails | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | if not text or not text.strip(): | 
					
						
						|  | raise ValueError("Text cannot be empty") | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"Synthesizing speech to bytes...") | 
					
						
						|  | logger.info(f"   Text: {text[:50]}{'...' if len(text) > 50 else ''}") | 
					
						
						|  | logger.info(f"   Speaker: {speaker}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if language: | 
					
						
						|  |  | 
					
						
						|  | audio_data = self.tts.tts( | 
					
						
						|  | text=text, | 
					
						
						|  | speaker=speaker, | 
					
						
						|  | language=language | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  |  | 
					
						
						|  | audio_data = self.tts.tts( | 
					
						
						|  | text=text, | 
					
						
						|  | speaker=speaker | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import io | 
					
						
						|  | import soundfile as sf | 
					
						
						|  |  | 
					
						
						|  | audio_bytes = io.BytesIO() | 
					
						
						|  | sf.write(audio_bytes, audio_data, self.tts.synthesizer.output_sample_rate, format='WAV') | 
					
						
						|  | audio_bytes.seek(0) | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"β
 Speech synthesis to bytes completed successfully!") | 
					
						
						|  | logger.info(f"   Audio size: {len(audio_bytes.getvalue())} bytes") | 
					
						
						|  |  | 
					
						
						|  | return audio_bytes.getvalue() | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"β Speech synthesis to bytes failed: {str(e)}") | 
					
						
						|  | raise Exception(f"Speech synthesis to bytes failed: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
"""
# Alternative __init__ method for custom voice cloning
# Uncomment and modify this section when you have a custom cloned voice model

    def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"):
        '''
        Initialize the TTS engine with a custom cloned voice model.

        Args:
            model_path (str): Path to the custom TTS model directory
            speaker_wav (str): Path to the speaker reference audio file

        Raises:
            Exception: If model loading fails
        '''
        try:
            logger.info("Initializing TTS Engine with custom voice model...")

            # Check for GPU availability
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {device}")

            # Load the custom TTS model
            logger.info(f"Loading custom model from: {model_path}")
            self.tts = TTS(model_path=model_path).to(device)

            # Store model information
            self.model_path = model_path
            self.speaker_wav = speaker_wav
            self.device = device

            logger.info(f"✅ Custom TTS Engine initialized successfully!")
            logger.info(f"   Model path: {model_path}")
            logger.info(f"   Speaker file: {speaker_wav}")
            logger.info(f"   Device: {device}")

        except Exception as e:
            logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}")
            raise Exception(f"Custom TTS Engine initialization failed: {str(e)}")

    # Custom synthesis method for voice cloning
    def synthesize_with_cloned_voice(self, text, output_path):
        '''
        Synthesize speech using the cloned voice.

        Args:
            text (str): The text to convert to speech
            output_path (str): File path to save the generated audio

        Returns:
            bool: True if synthesis was successful
        '''
        try:
            logger.info(f"Synthesizing speech with cloned voice...")

            # Perform text-to-speech synthesis with cloned voice
            self.tts.tts_to_file(
                text=text,
                speaker_wav=self.speaker_wav,
                file_path=output_path
            )

            logger.info(f"✅ Cloned voice synthesis completed!")
            return True

        except Exception as e:
            logger.error(f"❌ Cloned voice synthesis failed: {str(e)}")
            raise Exception(f"Cloned voice synthesis failed: {str(e)}")
"""


						|  | def main(): | 
					
						
						|  | """Example usage of the TTSEngine class.""" | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | logger.info("Creating TTS Engine instance...") | 
					
						
						|  | tts_engine = TTSEngine() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_info = tts_engine.get_model_info() | 
					
						
						|  | logger.info(f"Model Information: {model_info}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | test_text = "Hello! This is a test of the text-to-speech engine. The voice synthesis is working perfectly." | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | output_file = "test_output.wav" | 
					
						
						|  | success = tts_engine.synthesize( | 
					
						
						|  | text=test_text, | 
					
						
						|  | output_path=output_file, | 
					
						
						|  | speaker="p225" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if success: | 
					
						
						|  | logger.info("π Test completed successfully!") | 
					
						
						|  | logger.info(f"Check the generated audio file: {output_file}") | 
					
						
						|  | else: | 
					
						
						|  | logger.error("β Test failed!") | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | logger.error(f"β Example usage failed: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | main() | 
					
						
						|  |  |