# papercast/synthesis/tts_engine.py
# Author: batuhanozkose
# Commit 2bd49fc — feat: Implement Podcast Persona Framework (PPF),
# an adaptive conversation system.
import os
from datetime import datetime
from io import BytesIO
from elevenlabs import ElevenLabs, VoiceSettings
from pydub import AudioSegment
from utils.config import (
ELEVENLABS_API_KEY,
ELEVENLABS_GUEST_VOICE,
ELEVENLABS_HOST_VOICE,
OUTPUT_DIR,
)
# Import Supertonic TTS wrapper
from synthesis.supertonic_tts import (
SupertonicWrapper,
SUPERTONIC_VOICES,
DEFAULT_HOST_VOICE as SUPERTONIC_DEFAULT_HOST,
DEFAULT_GUEST_VOICE as SUPERTONIC_DEFAULT_GUEST,
get_supertonic_engine,
)
# ElevenLabs Voice Options
# Maps a human-readable display name ("Name (Gender - Character)") to the
# ElevenLabs voice ID string sent to the API. TTSEngine reverse-looks-up
# this table to print a friendly voice name at startup.
ELEVENLABS_VOICES = {
    # Male Voices
    "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
    "Josh (Male - Deep)": "TxGEqnHWrfWFTfGW9XjX",
    "Arnold (Male - Crisp)": "VR6AewLTigWG4xSOukaG",
    "Callum (Male - Hoarse)": "N2lVS1w4EtoT3dr4eOWO",
    "Charlie (Male - Casual)": "IKne3meq5aSn9XLyUdCD",
    "Clyde (Male - War veteran)": "2EiwWnXFnvU5JabPnv8n",
    "Daniel (Male - Deep British)": "onwK4e9ZLuTAKqWW03F9",
    "Ethan (Male - Young American)": "g5CIjZEefAph4nQFvHAz",
    "Fin (Male - Irish)": "D38z5RcWu1voky8WS1ja",
    "George (Male - British)": "JBFqnCBsd6RMkjVDRZzb",
    # Female Voices
    "Bella (Female - Soft)": "EXAVITQu4vr4xnSDxMaL",
    "Rachel (Female - Calm)": "21m00Tcm4TlvDq8ikWAM",
    "Domi (Female - Strong)": "AZnzlk1XvdvUeBnXmlld",
    "Elli (Female - Emotional)": "MF3mGyEYCl7XYWbV9V6O",
    "Emily (Female - Calm British)": "LcfcDJNUP1GQjkzn1xUU",
    "Freya (Female - Young American)": "jsCqWAovK2LkecY7zXl4",
    "Gigi (Female - Young Expressive)": "jBpfuIE2acCO8z3wKNLl",
    "Grace (Female - Southern American)": "oWAxZDx7w5VEj9dCyTzz",
    "Lily (Female - Warm British)": "pFZP5JQG7iQjIQuC4Bku",
    "Matilda (Female - Warm)": "XrExE9yKIg1WjnnlVkGX",
}
def generate_unique_filename():
    """Return a collision-resistant podcast filename derived from the current timestamp."""
    # strftime passes the literal "podcast_"/".wav" parts through untouched,
    # so one format string produces the whole name.
    return datetime.now().strftime("podcast_%Y%m%d_%H%M%S.wav")
class TTSEngine:
    """Text-to-speech engine with two interchangeable providers.

    Providers:
      * "elevenlabs" - cloud API (requires an API key)
      * "supertonic" - local CPU synthesis (no API key required)

    Both providers share one per-line synthesis pipeline (_run_synthesis);
    only the single-line audio generation step differs.
    """

    def __init__(self, tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
        """
        Initialize TTS Engine with either ElevenLabs or Supertonic.

        Args:
            tts_provider: "elevenlabs" or "supertonic"
            custom_api_key: API key for ElevenLabs (required if using elevenlabs)
            host_voice: Voice ID for Host (optional, uses default if not provided)
            guest_voice: Voice ID for Guest (optional, uses default if not provided)

        Raises:
            ValueError: if the provider is unknown, or no ElevenLabs API key
                is available (neither custom_api_key nor configured default).
        """
        self.mode = tts_provider.lower()
        if self.mode == "elevenlabs":
            self._init_elevenlabs(custom_api_key, host_voice, guest_voice)
        elif self.mode == "supertonic":
            self._init_supertonic(host_voice, guest_voice)
        else:
            raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")

    def _init_elevenlabs(self, custom_api_key, host_voice, guest_voice):
        """Set up the ElevenLabs client and resolve host/guest voice IDs."""
        print("Initializing ElevenLabs TTS API...")
        # Use custom key if provided, otherwise use default
        api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
        if not api_key:
            raise ValueError("ElevenLabs API key is required")
        self.client = ElevenLabs(api_key=api_key)
        # Use custom voices or defaults
        self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
        self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
        if custom_api_key:
            print("✓ ElevenLabs TTS ready (custom API key)")
        else:
            print("✓ ElevenLabs TTS ready")
        # Reverse-lookup display names for logging; falls back to a generic
        # label when the ID is not a known ELEVENLABS_VOICES entry.
        host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
        guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
        print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
        print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")

    def _init_supertonic(self, host_voice, guest_voice):
        """Set up the local Supertonic engine and resolve host/guest voices."""
        print("Initializing Supertonic TTS (CPU mode)...")
        self.supertonic_engine = get_supertonic_engine()
        self.supertonic_engine.initialize()
        # For Supertonic, voice is the display name; the wrapper converts
        # it to an internal ID at synthesis time.
        self.host_voice_id = host_voice if host_voice else SUPERTONIC_DEFAULT_HOST
        self.guest_voice_id = guest_voice if guest_voice else SUPERTONIC_DEFAULT_GUEST
        print("✓ Supertonic TTS ready (CPU mode, no API key required)")
        print(f" Host: {self.host_voice_id}")
        print(f" Guest: {self.guest_voice_id}")

    def _build_speaker_mapping(self, script: list) -> dict:
        """
        Build a mapping from speaker names to voice IDs.

        First unique speaker gets host_voice, second gets guest_voice.
        This allows PPF personas to work with any character names.

        Args:
            script: List of dialogue items with 'speaker' keys

        Returns:
            dict: Mapping from speaker name to voice ID
        """
        unique_speakers = []
        for item in script:
            if isinstance(item, dict) and "speaker" in item:
                speaker = item["speaker"]
                if speaker not in unique_speakers:
                    unique_speakers.append(speaker)
        # Map first speaker to host_voice, second to guest_voice
        mapping = {}
        if len(unique_speakers) >= 1:
            mapping[unique_speakers[0]] = self.host_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[0]} → Host Voice")
        if len(unique_speakers) >= 2:
            mapping[unique_speakers[1]] = self.guest_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[1]} → Guest Voice")
        return mapping

    def synthesize_dialogue(self, script: list) -> str:
        """
        Synthesize the script to audio using the selected TTS provider.

        Args:
            script: List of dialogue items (dicts with 'speaker', 'text',
                and optional 'emotion' keys)

        Returns:
            str: Path to the generated audio file ("" if nothing was synthesized)
        """
        if self.mode == "elevenlabs":
            return self._synthesize_elevenlabs(script)
        elif self.mode == "supertonic":
            return self._synthesize_supertonic(script)
        else:
            raise ValueError(f"Unknown TTS mode: {self.mode}")

    def _synthesize_elevenlabs(self, script: list) -> str:
        """Synthesize using ElevenLabs API"""
        print("Synthesizing audio via ElevenLabs API...")
        return self._run_synthesis(script, self._elevenlabs_line)

    def _elevenlabs_line(self, text: str, voice_id: str):
        """Generate one dialogue line via the ElevenLabs API, returning an AudioSegment.

        Note: ElevenLabs has no direct emotion parameter; emotion is conveyed
        through the text content itself (exclamation marks, word choice, etc.),
        which the script generator already creates based on the emotion field.
        """
        # Turbo v2.5: high quality, low latency (~250-300ms), 50% cheaper than v2
        audio_generator = self.client.text_to_speech.convert(
            voice_id=voice_id,
            text=text,
            model_id="eleven_turbo_v2_5",  # Upgraded from multilingual_v2 for better quality
            voice_settings=VoiceSettings(
                stability=0.4,  # Lower = more expressiveness and variation (default: 0.5)
                similarity_boost=0.8,  # Higher = better voice consistency (default: 0.75)
                style=0.6,  # Higher = more dynamic, expressive delivery (default: 0.5)
                use_speaker_boost=True,  # Enhances similarity to original voice
            ),
        )
        # The API returns a stream of MP3 chunks; collect and decode them.
        audio_bytes = b"".join(audio_generator)
        return AudioSegment.from_mp3(BytesIO(audio_bytes))

    def _synthesize_supertonic(self, script: list) -> str:
        """Synthesize using Supertonic TTS (CPU-based)"""
        print("Synthesizing audio via Supertonic TTS (CPU mode)...")
        return self._run_synthesis(script, self._supertonic_line)

    def _supertonic_line(self, text: str, voice_id: str):
        """Generate one dialogue line via Supertonic, returning an AudioSegment."""
        # Parameters optimized for quality vs speed
        return self.supertonic_engine.synthesize_to_audio_segment(
            text=text,
            voice_id=voice_id,
            speed=1.0,
            steps=5,  # Balanced quality/speed (1-50, lower=faster)
        )

    def _run_synthesis(self, script: list, synthesize_line) -> str:
        """Provider-agnostic synthesis pipeline shared by all TTS backends.

        Validates each dialogue item, synthesizes it with the supplied
        per-line callable, inserts inter-speaker silence, then combines
        and exports the result as a WAV file.

        Args:
            script: List of dialogue items
            synthesize_line: Callable (text, voice_id) -> AudioSegment

        Returns:
            str: Path to the generated audio file, or "" if no line succeeded.
        """
        audio_segments = []
        # Build dynamic speaker-to-voice mapping:
        # first unique speaker gets host_voice, second gets guest_voice.
        speaker_to_voice = self._build_speaker_mapping(script)
        for i, item in enumerate(script):
            # Defensive checks for required keys
            if not isinstance(item, dict):
                print(f"⚠️ Skipping item {i + 1}: not a dictionary")
                continue
            if "text" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'text' key")
                continue
            if "speaker" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'speaker' key")
                continue
            text = item["text"]
            speaker = item["speaker"]
            # Emotion is logged for debugging only; the text itself carries it.
            emotion = item.get("emotion", "neutral")
            # Select voice based on speaker using dynamic mapping
            voice_id = speaker_to_voice.get(speaker, self.host_voice_id)
            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
                audio_segments.append(synthesize_line(text, voice_id))
                # Add 500ms silence between speakers
                audio_segments.append(AudioSegment.silent(duration=500))
                print(f"✓ Synthesized line {i + 1}/{len(script)}")
            except Exception as e:
                # Continue with next line even if one fails
                print(f"Error synthesizing line '{text[:50]}...': {e}")
        if not audio_segments:
            print("No audio generated")
            return ""
        # Combine all segments
        print("Combining audio segments...")
        combined = sum(audio_segments)
        # Export as WAV with unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")
        print(f"✓ Podcast saved to: {output_path}")
        return output_path
# Module-level singleton, reused only for the default ElevenLabs configuration.
_tts_instance = None


def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
    """
    Get TTS engine instance with ElevenLabs or Supertonic.

    Args:
        tts_provider: "elevenlabs" or "supertonic"
        custom_api_key: ElevenLabs API key (required if using elevenlabs, not needed for supertonic)
        host_voice: Voice ID for Host (optional)
        guest_voice: Voice ID for Guest (optional)

    Returns:
        TTSEngine instance
    """
    global _tts_instance

    # Any non-default setting means the caller wants a dedicated engine,
    # so bypass the cache and build a fresh one every time.
    non_default = tts_provider != "elevenlabs" or any((custom_api_key, host_voice, guest_voice))
    if non_default:
        return TTSEngine(
            tts_provider=tts_provider,
            custom_api_key=custom_api_key,
            host_voice=host_voice,
            guest_voice=guest_voice,
        )

    # Default ElevenLabs configuration: lazily create and reuse one instance.
    if _tts_instance is None:
        _tts_instance = TTSEngine(tts_provider="elevenlabs")
    return _tts_instance