Spaces:
Running
Running
batuhanozkose
feat: Implement Podcast Persona Framework (PPF) - Revolutionary adaptive conversation system
2bd49fc
| import os | |
| from datetime import datetime | |
| from io import BytesIO | |
| from elevenlabs import ElevenLabs, VoiceSettings | |
| from pydub import AudioSegment | |
| from utils.config import ( | |
| ELEVENLABS_API_KEY, | |
| ELEVENLABS_GUEST_VOICE, | |
| ELEVENLABS_HOST_VOICE, | |
| OUTPUT_DIR, | |
| ) | |
| # Import Supertonic TTS wrapper | |
| from synthesis.supertonic_tts import ( | |
| SupertonicWrapper, | |
| SUPERTONIC_VOICES, | |
| DEFAULT_HOST_VOICE as SUPERTONIC_DEFAULT_HOST, | |
| DEFAULT_GUEST_VOICE as SUPERTONIC_DEFAULT_GUEST, | |
| get_supertonic_engine, | |
| ) | |
# ElevenLabs Voice Options
# Maps a human-readable display label ("Name (Gender - Description)") to the
# ElevenLabs voice ID string passed to the text-to-speech API.
# TTSEngine.__init__ also reverse-looks-up these IDs to print the selected
# voice names, so labels here double as log output.
ELEVENLABS_VOICES = {
    # Male Voices
    "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
    "Josh (Male - Deep)": "TxGEqnHWrfWFTfGW9XjX",
    "Arnold (Male - Crisp)": "VR6AewLTigWG4xSOukaG",
    "Callum (Male - Hoarse)": "N2lVS1w4EtoT3dr4eOWO",
    "Charlie (Male - Casual)": "IKne3meq5aSn9XLyUdCD",
    "Clyde (Male - War veteran)": "2EiwWnXFnvU5JabPnv8n",
    "Daniel (Male - Deep British)": "onwK4e9ZLuTAKqWW03F9",
    "Ethan (Male - Young American)": "g5CIjZEefAph4nQFvHAz",
    "Fin (Male - Irish)": "D38z5RcWu1voky8WS1ja",
    "George (Male - British)": "JBFqnCBsd6RMkjVDRZzb",
    # Female Voices
    "Bella (Female - Soft)": "EXAVITQu4vr4xnSDxMaL",
    "Rachel (Female - Calm)": "21m00Tcm4TlvDq8ikWAM",
    "Domi (Female - Strong)": "AZnzlk1XvdvUeBnXmlld",
    "Elli (Female - Emotional)": "MF3mGyEYCl7XYWbV9V6O",
    "Emily (Female - Calm British)": "LcfcDJNUP1GQjkzn1xUU",
    "Freya (Female - Young American)": "jsCqWAovK2LkecY7zXl4",
    "Gigi (Female - Young Expressive)": "jBpfuIE2acCO8z3wKNLl",
    "Grace (Female - Southern American)": "oWAxZDx7w5VEj9dCyTzz",
    "Lily (Female - Warm British)": "pFZP5JQG7iQjIQuC4Bku",
    "Matilda (Female - Warm)": "XrExE9yKIg1WjnnlVkGX",
}
def generate_unique_filename():
    """Return a timestamped podcast filename, e.g. 'podcast_20240101_123000.wav'."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "podcast_{}.wav".format(stamp)
class TTSEngine:
    """Text-to-speech engine that can synthesize a podcast script using either
    the ElevenLabs cloud API or the local CPU-based Supertonic TTS.

    Fix in this revision: `_synthesize_elevenlabs` and `_synthesize_supertonic`
    previously duplicated the entire validation / loop / combine / export logic.
    That shared logic now lives in `_synthesize_script`, with each provider
    supplying only its per-line synthesizer. Public behavior is unchanged.
    """

    def __init__(self, tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
        """
        Initialize TTS Engine with either ElevenLabs or Supertonic.

        Args:
            tts_provider: "elevenlabs" or "supertonic"
            custom_api_key: API key for ElevenLabs (required if using elevenlabs)
            host_voice: Voice ID for Host (optional, uses default if not provided)
            guest_voice: Voice ID for Guest (optional, uses default if not provided)

        Raises:
            ValueError: if the provider is unknown, or ElevenLabs is selected
                with no API key available (neither custom nor configured).
        """
        self.mode = tts_provider.lower()
        if self.mode == "elevenlabs":
            self._init_elevenlabs(custom_api_key, host_voice, guest_voice)
        elif self.mode == "supertonic":
            self._init_supertonic(host_voice, guest_voice)
        else:
            raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")

    def _init_elevenlabs(self, custom_api_key, host_voice, guest_voice):
        """Set up the ElevenLabs client and resolve host/guest voice IDs."""
        print("Initializing ElevenLabs TTS API...")
        # Use custom key if provided, otherwise fall back to the configured default
        api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
        if not api_key:
            raise ValueError("ElevenLabs API key is required")
        self.client = ElevenLabs(api_key=api_key)
        # Use custom voices or configured defaults
        self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
        self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
        if custom_api_key:
            print("✓ ElevenLabs TTS ready (custom API key)")
        else:
            print("✓ ElevenLabs TTS ready")
        # Reverse-lookup the display names for logging; a voice ID not present
        # in ELEVENLABS_VOICES is reported as 'Custom/Default'.
        host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
        guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
        print(f"  Host: {host_name[0] if host_name else 'Custom/Default'}")
        print(f"  Guest: {guest_name[0] if guest_name else 'Custom/Default'}")

    def _init_supertonic(self, host_voice, guest_voice):
        """Set up the shared Supertonic engine and resolve host/guest voices."""
        print("Initializing Supertonic TTS (CPU mode)...")
        self.supertonic_engine = get_supertonic_engine()
        self.supertonic_engine.initialize()
        # Use custom voices or defaults.
        # For Supertonic, voice is the display name; conversion to an internal
        # ID (if any) is handled by the Supertonic wrapper at synthesis time.
        self.host_voice_id = host_voice if host_voice else SUPERTONIC_DEFAULT_HOST
        self.guest_voice_id = guest_voice if guest_voice else SUPERTONIC_DEFAULT_GUEST
        print("✓ Supertonic TTS ready (CPU mode, no API key required)")
        print(f"  Host: {self.host_voice_id}")
        print(f"  Guest: {self.guest_voice_id}")

    def _build_speaker_mapping(self, script: list) -> dict:
        """
        Build a mapping from speaker names to voice IDs.

        First unique speaker gets host_voice, second gets guest_voice.
        This allows PPF personas to work with any character names.
        Speakers beyond the second are left unmapped (callers fall back to
        the host voice).

        Args:
            script: List of dialogue items with 'speaker' keys

        Returns:
            dict: Mapping from speaker name to voice ID
        """
        # Collect unique speakers in first-appearance order.
        unique_speakers = []
        for item in script:
            if isinstance(item, dict) and "speaker" in item:
                speaker = item["speaker"]
                if speaker not in unique_speakers:
                    unique_speakers.append(speaker)
        # Map first speaker to host_voice, second to guest_voice
        mapping = {}
        if len(unique_speakers) >= 1:
            mapping[unique_speakers[0]] = self.host_voice_id
            print(f"  🎙️ Speaker mapping: {unique_speakers[0]} → Host Voice")
        if len(unique_speakers) >= 2:
            mapping[unique_speakers[1]] = self.guest_voice_id
            print(f"  🎙️ Speaker mapping: {unique_speakers[1]} → Guest Voice")
        return mapping

    def synthesize_dialogue(self, script: list) -> str:
        """
        Synthesize the script to audio using the selected TTS provider.

        Args:
            script: List of dialogue items

        Returns:
            str: Path to the generated audio file ("" if nothing was synthesized)
        """
        if self.mode == "elevenlabs":
            return self._synthesize_elevenlabs(script)
        elif self.mode == "supertonic":
            return self._synthesize_supertonic(script)
        else:
            raise ValueError(f"Unknown TTS mode: {self.mode}")

    def _synthesize_script(self, script: list, synth_line) -> str:
        """
        Shared synthesis loop used by both providers.

        Validates each script item, synthesizes it with `synth_line`, inserts
        a 500 ms pause between lines, then concatenates and exports to WAV.
        A failing line is logged and skipped so one bad item cannot abort the
        whole episode.

        Args:
            script: List of dialogue dicts with 'speaker'/'text' (and optional 'emotion')
            synth_line: callable(text, voice_id) -> AudioSegment for one line

        Returns:
            str: Path to the generated WAV file, or "" if no audio was produced
        """
        audio_segments = []
        # Build dynamic speaker-to-voice mapping:
        # first unique speaker gets host_voice, second gets guest_voice.
        speaker_to_voice = self._build_speaker_mapping(script)
        for i, item in enumerate(script):
            # Defensive checks for required keys
            if not isinstance(item, dict):
                print(f"⚠️ Skipping item {i + 1}: not a dictionary")
                continue
            if "text" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'text' key")
                continue
            if "speaker" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'speaker' key")
                continue
            text = item["text"]
            speaker = item["speaker"]
            emotion = item.get("emotion", "neutral")
            # Select voice based on speaker; unknown speakers fall back to host.
            voice_id = speaker_to_voice.get(speaker, self.host_voice_id)
            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
                audio_segments.append(synth_line(text, voice_id))
                # Add 500ms silence between speakers
                audio_segments.append(AudioSegment.silent(duration=500))
                print(f"✓ Synthesized line {i + 1}/{len(script)}")
            except Exception as e:
                print(f"Error synthesizing line '{text[:50]}...': {e}")
                # Continue with next line even if one fails
        if not audio_segments:
            print("No audio generated")
            return ""
        # Combine all segments (AudioSegment supports sum() via __radd__)
        print("Combining audio segments...")
        combined = sum(audio_segments)
        # Export as WAV with unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")
        print(f"✓ Podcast saved to: {output_path}")
        return output_path

    def _synthesize_elevenlabs(self, script: list) -> str:
        """Synthesize using ElevenLabs API"""
        print("Synthesizing audio via ElevenLabs API...")
        return self._synthesize_script(script, self._elevenlabs_line)

    def _elevenlabs_line(self, text: str, voice_id: str):
        """Synthesize one line via the ElevenLabs API; returns an AudioSegment.

        Note: ElevenLabs doesn't have a direct emotion parameter. Emotion is
        conveyed through the text content itself (exclamation marks, word
        choice, etc.) which the script generator already creates based on the
        emotion field, so the text is not modified here.
        """
        # Turbo v2.5: high quality, low latency (~250-300ms), 50% cheaper than v2
        audio_generator = self.client.text_to_speech.convert(
            voice_id=voice_id,
            text=text,
            model_id="eleven_turbo_v2_5",
            voice_settings=VoiceSettings(
                stability=0.4,          # Lower = more expressiveness and variation (default: 0.5)
                similarity_boost=0.8,   # Higher = better voice consistency (default: 0.75)
                style=0.6,              # Higher = more dynamic, expressive delivery (default: 0.5)
                use_speaker_boost=True, # Enhances similarity to original voice
            ),
        )
        # The API streams MP3 bytes; collect and decode them into a segment.
        return AudioSegment.from_mp3(BytesIO(b"".join(audio_generator)))

    def _synthesize_supertonic(self, script: list) -> str:
        """Synthesize using Supertonic TTS (CPU-based)"""
        print("Synthesizing audio via Supertonic TTS (CPU mode)...")
        return self._synthesize_script(
            script,
            # Parameters optimized for quality vs speed:
            # steps=5 is a balanced quality/speed setting (1-50, lower=faster).
            lambda text, voice_id: self.supertonic_engine.synthesize_to_audio_segment(
                text=text,
                voice_id=voice_id,
                speed=1.0,
                steps=5,
            ),
        )
# Global instance (singleton for the default ElevenLabs configuration)
_tts_instance = None


def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
    """
    Get TTS engine instance with ElevenLabs or Supertonic.

    Args:
        tts_provider: "elevenlabs" or "supertonic"
        custom_api_key: ElevenLabs API key (required if using elevenlabs, not needed for supertonic)
        host_voice: Voice ID for Host (optional)
        guest_voice: Voice ID for Guest (optional)

    Returns:
        TTSEngine instance
    """
    global _tts_instance
    # Only the plain ElevenLabs setup (no key/voice overrides) is cached.
    use_defaults = (
        tts_provider == "elevenlabs"
        and not custom_api_key
        and not host_voice
        and not guest_voice
    )
    if not use_defaults:
        # Custom settings (or Supertonic) always get a fresh engine.
        return TTSEngine(
            tts_provider=tts_provider,
            custom_api_key=custom_api_key,
            host_voice=host_voice,
            guest_voice=guest_voice,
        )
    # Lazily create and reuse the global default-ElevenLabs instance.
    if _tts_instance is None:
        _tts_instance = TTSEngine(tts_provider="elevenlabs")
    return _tts_instance