|
|
""" |
|
|
Text-to-Speech Engine for Voice Assistant |
|
|
|
|
|
A complete, self-contained Python class that provides high-quality text-to-speech |
|
|
synthesis using the Coqui TTS library with multi-speaker support. |
|
|
|
|
|
Author: Voice Assistant Team |
|
|
Version: 1.0.0 |
|
|
""" |
|
|
|
|
|
import os |
|
|
import torch |
|
|
from TTS.api import TTS |
|
|
import logging |
|
|
|
|
|
|
|
|
# Configure the root logger once, at import time, so every message emitted by
# this module carries a timestamp and a severity level.
# NOTE(review): configuring the root logger from an importable module also
# affects the host application's logging; kept because existing callers may
# rely on it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Module-level logger shared by TTSEngine and main().
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class TTSEngine:
    """High-quality Text-to-Speech engine built on the Coqui TTS library.

    The engine loads one pre-trained model at construction time and exposes
    two synthesis entry points: :meth:`synthesize` writes a WAV file, and
    :meth:`synthesize_to_bytes` returns the WAV data in memory.

    Attributes set by ``__init__``:
        tts: The loaded Coqui ``TTS`` object, already moved to ``device``.
        model_name (str): Name of the model that was loaded.
        device (str): ``"cuda"`` when a GPU is available, otherwise ``"cpu"``.
    """

    def __init__(self, model_name="tts_models/en/vctk/vits"):
        """Load a pre-trained model onto the best available device.

        Args:
            model_name (str): The name of the TTS model to load.
                Default: "tts_models/en/vctk/vits" (multi-speaker English).

        Raises:
            Exception: If model loading fails. The original error is chained
                as ``__cause__``.
        """
        try:
            logger.info("Initializing TTS Engine...")

            # Query CUDA once and derive everything else from the answer.
            use_gpu = torch.cuda.is_available()
            device = "cuda" if use_gpu else "cpu"
            logger.info("Using device: %s", device)
            if use_gpu:
                logger.info("GPU: %s", torch.cuda.get_device_name(0))
                total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
                logger.info("GPU Memory: %.1f GB", total_gb)
            else:
                logger.warning("No GPU detected - using CPU")

            logger.info("Loading model: %s", model_name)
            self.tts = TTS(model_name=model_name).to(device)

            self.model_name = model_name
            self.device = device

            logger.info("✅ TTS Engine initialized successfully!")
            logger.info(" Model: %s", model_name)
            logger.info(" Device: %s", device)

            speakers = self.get_available_speakers()
            if speakers:
                logger.info(" Available speakers: %d", len(speakers))
                logger.info(" Sample speakers: %s...", speakers[:5])

        except Exception as e:
            logger.error("❌ Failed to initialize TTS Engine: %s", e)
            raise Exception(f"TTS Engine initialization failed: {e}") from e

    @staticmethod
    def _require_text(text):
        """Raise ``ValueError`` when *text* is empty or only whitespace."""
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

    @staticmethod
    def _preview(text, limit=50):
        """Return at most *limit* characters of *text*, with "..." if cut."""
        return f"{text[:limit]}{'...' if len(text) > limit else ''}"

    def _synthesis_kwargs(self, text, speaker, language):
        """Build the keyword arguments shared by both Coqui synthesis calls.

        ``speaker`` is forwarded only when the loaded model exposes a speaker
        list: Coqui's argument check rejects a speaker ID on a single-speaker
        model, so the default "p225" would otherwise make such models
        unusable. ``language`` is forwarded only when the caller supplied one.
        """
        call_kwargs = {"text": text}
        if self.get_available_speakers():
            call_kwargs["speaker"] = speaker
        if language:
            call_kwargs["language"] = language
        return call_kwargs

    def synthesize(self, text, output_path, speaker="p225", language=None):
        """Synthesize speech from text and save it to a WAV file.

        Args:
            text (str): The text to convert to speech.
            output_path (str): File path to save the generated audio. A
                ".wav" suffix is appended when missing, and a missing parent
                directory is created.
            speaker (str): Speaker ID for multi-speaker models
                (default: "p225"). Ignored for single-speaker models.
            language (str): Language code (optional; not forwarded to the
                model when None or empty).

        Returns:
            bool: True if synthesis was successful.

        Raises:
            Exception: If validation or synthesis fails, or no output file
                was produced. The original error is chained as ``__cause__``.
        """
        try:
            self._require_text(text)
            if not output_path:
                raise ValueError("Output path cannot be empty")

            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.isdir(output_dir):
                # exist_ok avoids a crash if another process creates the
                # directory between the check above and this call.
                os.makedirs(output_dir, exist_ok=True)
                logger.info("Created output directory: %s", output_dir)

            if not output_path.lower().endswith('.wav'):
                output_path += '.wav'

            call_kwargs = self._synthesis_kwargs(text, speaker, language)

            logger.info("Synthesizing speech...")
            logger.info(" Text: %s", self._preview(text))
            if "speaker" in call_kwargs:
                logger.info(" Speaker: %s", speaker)
            else:
                logger.info(" Speaker: ignored (single-speaker model)")
            logger.info(" Output: %s", output_path)

            self.tts.tts_to_file(file_path=output_path, **call_kwargs)

            # Guard against a backend that returns without writing anything.
            if not os.path.exists(output_path):
                raise RuntimeError("Output file was not created")

            file_size = os.path.getsize(output_path)
            logger.info("✅ Speech synthesis completed successfully!")
            logger.info(" Output file: %s", output_path)
            logger.info(" File size: %d bytes", file_size)
            return True

        except Exception as e:
            logger.error("❌ Speech synthesis failed: %s", e)
            raise Exception(f"Speech synthesis failed: {e}") from e

    def get_available_speakers(self):
        """Get the list of speakers offered by the loaded model.

        Returns:
            list: Available speaker IDs, or an empty list when no model is
                loaded, the model is not multi-speaker, or the lookup fails.
        """
        try:
            speakers = getattr(getattr(self, "tts", None), "speakers", None)
            return list(speakers) if speakers else []
        except Exception as e:
            # Best effort by design: callers treat "unknown" as "none".
            logger.error("Failed to get available speakers: %s", e)
            return []

    def get_model_info(self):
        """Get information about the loaded model.

        Returns:
            dict: Keys "model_name", "device", "available_speakers" and
                "is_multi_speaker"; an empty dict if the information cannot
                be collected.
        """
        try:
            speakers = self.get_available_speakers()
            return {
                "model_name": self.model_name,
                "device": self.device,
                "available_speakers": speakers,
                "is_multi_speaker": bool(speakers),
            }
        except Exception as e:
            logger.error("Failed to get model info: %s", e)
            return {}

    @property
    def model(self):
        """Report whether the TTS model is loaded.

        Returns:
            bool: True if ``self.tts`` exists and is not None, else False.
        """
        return getattr(self, "tts", None) is not None

    def synthesize_to_bytes(self, text, speaker="p225", language=None):
        """Synthesize speech from text and return the audio as WAV bytes.

        Args:
            text (str): The text to convert to speech.
            speaker (str): Speaker ID for multi-speaker models
                (default: "p225"). Ignored for single-speaker models.
            language (str): Language code (optional; not forwarded to the
                model when None or empty).

        Returns:
            bytes: Audio data encoded as WAV.

        Raises:
            Exception: If validation, synthesis or encoding fails. The
                original error is chained as ``__cause__``.
        """
        try:
            # Imported before synthesis so a missing dependency fails fast
            # instead of after the expensive model call.
            import io
            import soundfile as sf

            self._require_text(text)
            call_kwargs = self._synthesis_kwargs(text, speaker, language)

            logger.info("Synthesizing speech to bytes...")
            logger.info(" Text: %s", self._preview(text))
            if "speaker" in call_kwargs:
                logger.info(" Speaker: %s", speaker)
            else:
                logger.info(" Speaker: ignored (single-speaker model)")

            audio_data = self.tts.tts(**call_kwargs)

            # Encode the raw samples as WAV in memory at the model's rate.
            buffer = io.BytesIO()
            sf.write(
                buffer,
                audio_data,
                self.tts.synthesizer.output_sample_rate,
                format='WAV',
            )
            wav_bytes = buffer.getvalue()

            logger.info("✅ Speech synthesis to bytes completed successfully!")
            logger.info(" Audio size: %d bytes", len(wav_bytes))
            return wav_bytes

        except Exception as e:
            logger.error("❌ Speech synthesis to bytes failed: %s", e)
            raise Exception(f"Speech synthesis to bytes failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
# Alternative __init__ method for custom voice cloning |
|
|
# Uncomment and modify this section when you have a custom cloned voice model |
|
|
|
|
|
def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"): |
|
|
''' |
|
|
Initialize the TTS engine with a custom cloned voice model. |
|
|
|
|
|
Args: |
|
|
model_path (str): Path to the custom TTS model directory |
|
|
speaker_wav (str): Path to the speaker reference audio file |
|
|
|
|
|
Raises: |
|
|
Exception: If model loading fails |
|
|
''' |
|
|
try: |
|
|
logger.info("Initializing TTS Engine with custom voice model...") |
|
|
|
|
|
# Check for GPU availability |
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
logger.info(f"Using device: {device}") |
|
|
|
|
|
# Load the custom TTS model |
|
|
logger.info(f"Loading custom model from: {model_path}") |
|
|
self.tts = TTS(model_path=model_path).to(device) |
|
|
|
|
|
# Store model information |
|
|
self.model_path = model_path |
|
|
self.speaker_wav = speaker_wav |
|
|
self.device = device |
|
|
|
|
|
logger.info(f"✅ Custom TTS Engine initialized successfully!") |
|
|
logger.info(f" Model path: {model_path}") |
|
|
logger.info(f" Speaker file: {speaker_wav}") |
|
|
logger.info(f" Device: {device}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}") |
|
|
raise Exception(f"Custom TTS Engine initialization failed: {str(e)}") |
|
|
|
|
|
# Custom synthesis method for voice cloning |
|
|
def synthesize_with_cloned_voice(self, text, output_path): |
|
|
''' |
|
|
Synthesize speech using the cloned voice. |
|
|
|
|
|
Args: |
|
|
text (str): The text to convert to speech |
|
|
output_path (str): File path to save the generated audio |
|
|
|
|
|
Returns: |
|
|
bool: True if synthesis was successful |
|
|
''' |
|
|
try: |
|
|
logger.info(f"Synthesizing speech with cloned voice...") |
|
|
|
|
|
# Perform text-to-speech synthesis with cloned voice |
|
|
self.tts.tts_to_file( |
|
|
text=text, |
|
|
speaker_wav=self.speaker_wav, |
|
|
file_path=output_path |
|
|
) |
|
|
|
|
|
logger.info(f"✅ Cloned voice synthesis completed!") |
|
|
return True |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"❌ Cloned voice synthesis failed: {str(e)}") |
|
|
raise Exception(f"Cloned voice synthesis failed: {str(e)}") |
|
|
""" |
|
|
|
|
|
|
|
|
def main():
    """Example usage of the TTSEngine class.

    Builds an engine with the default model, logs its model information and
    synthesizes one test sentence to "test_output.wav" with speaker "p225".
    Any failure is logged rather than propagated, so this function always
    returns None.
    """
    try:
        logger.info("Creating TTS Engine instance...")
        tts_engine = TTSEngine()

        model_info = tts_engine.get_model_info()
        logger.info("Model Information: %s", model_info)

        test_text = (
            "Hello! This is a test of the text-to-speech engine. "
            "The voice synthesis is working perfectly."
        )

        output_file = "test_output.wav"
        success = tts_engine.synthesize(
            text=test_text,
            output_path=output_file,
            speaker="p225",
        )

        if success:
            logger.info("🎉 Test completed successfully!")
            logger.info("Check the generated audio file: %s", output_file)
        else:
            logger.error("❌ Test failed!")

    except Exception as e:
        # Top-level boundary of the demo: report the failure and return.
        logger.error("❌ Example usage failed: %s", e)


# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|