# Spaces:
# Runtime error
# Runtime error
import json
import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import requests
import soundfile as sf

# ===== NEUTTS IMPORTS =====
from neuttsair.neutts import NeuTTSAir
# ===== CONFIGURATION =====
# JSON file in which VoiceProfileManager persists the name -> profile mapping.
CONFIG_FILE = "voice_profiles.json"
# Directory that receives downloaded sample voices and uploaded reference audio.
SAMPLE_DIR = "samples"
# Created at import time so later writes into SAMPLE_DIR cannot fail on a
# missing folder.
os.makedirs(SAMPLE_DIR, exist_ok=True)
# ===== VOICE PROFILE MANAGEMENT =====
class VoiceProfileManager:
    """Registry of cloned voices, persisted as a JSON file.

    ``profiles`` maps a voice name to a dict with the keys ``audio_path``,
    ``text`` and ``created_at``.
    """

    def __init__(self, config_file=None):
        """Load the profiles stored in *config_file*.

        Args:
            config_file: Path of the JSON registry. ``None`` (the default)
                selects the module-level ``CONFIG_FILE``.
        """
        # Resolved at call time rather than at class-definition time.
        self.config_file = CONFIG_FILE if config_file is None else config_file
        self.profiles = self.load_profiles()

    def load_profiles(self):
        """Return the profiles found on disk, or ``{}`` when none are usable.

        A missing file, an unreadable file, invalid JSON, or JSON whose top
        level is not an object all yield an empty registry.
        """
        if not os.path.exists(self.config_file):
            return {}
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors. A
            # damaged registry must not stop the application from starting.
            print(f"Could not read voice profiles from {self.config_file}: {exc}")
            return {}
        return data if isinstance(data, dict) else {}

    def save_profiles(self):
        """Write ``self.profiles`` to ``self.config_file`` as indented JSON."""
        # Write to a sibling file first and swap it in, so an interrupted
        # write cannot leave a truncated registry behind.
        tmp_path = f"{self.config_file}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(self.profiles, f, indent=2)
        os.replace(tmp_path, self.config_file)

    def add_profile(self, name, audio_path, text):
        """Create or overwrite the profile *name* and persist the registry.

        Args:
            name: Key under which the profile is stored.
            audio_path: Path of the reference recording.
            text: Transcript associated with the reference recording.

        Returns:
            A status message naming the saved profile.
        """
        self.profiles[name] = {
            "audio_path": audio_path,
            "text": text,
            "created_at": str(np.datetime64('now')),
        }
        self.save_profiles()
        return f"β Voice profile '{name}' saved!"

    def get_profile(self, name):
        """Return the profile dict stored under *name*, or ``None``."""
        return self.profiles.get(name)

    def list_profiles(self):
        """Return the stored profile names as a list."""
        return list(self.profiles.keys())
# ===== SAMPLE MANAGEMENT =====
def download_default_samples():
    """Download the default sample voices into ``SAMPLE_DIR``.

    For every sample the reference audio is stored as ``<name>.wav`` and its
    transcript as ``<name>.txt``. Files that already exist are not fetched
    again. Failures are reported on stdout and never raised, so start-up
    continues without the affected sample.
    """
    # The organisation is spelled "neuphonic", matching the model repositories
    # this file loads in TTSEngine.initialize_tts.
    samples = {
        "dave": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/dave.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/dave.txt",
        },
        "andrea": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/andrea.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/andrea.txt",
        },
    }
    for name, urls in samples.items():
        # (url, destination, is_binary); each file is checked on its own so a
        # missing transcript is fetched even when the audio is already there.
        targets = [
            (urls["audio"], f"{SAMPLE_DIR}/{name}.wav", True),
            (urls["text"], f"{SAMPLE_DIR}/{name}.txt", False),
        ]
        missing = [target for target in targets if not os.path.exists(target[1])]
        if not missing:
            continue
        try:
            for url, destination, is_binary in missing:
                response = requests.get(url, timeout=30)
                # Without this an HTTP error page would be saved as the sample.
                response.raise_for_status()
                if is_binary:
                    with open(destination, 'wb') as f:
                        f.write(response.content)
                else:
                    with open(destination, 'w', encoding="utf-8") as f:
                        f.write(response.text)
            print(f"β Downloaded {name} sample")
        except (requests.RequestException, OSError) as e:
            print(f"β Failed to download {name}: {e}")
# ===== TTS ENGINE =====
class TTSEngine:
    """Lazily constructed ``NeuTTSAir`` model plus the voice profiles it uses."""

    def __init__(self):
        """Load saved profiles and make the default sample voices available."""
        self.tts = None  # built on first use by initialize_tts()
        self.voice_manager = VoiceProfileManager()
        # audio_path -> (file mtime, encoded reference codes)
        self._ref_cache = {}
        download_default_samples()
        self._register_default_samples()

    def _register_default_samples(self):
        """Add a profile for each ``<name>.wav`` + ``<name>.txt`` pair in ``SAMPLE_DIR``.

        Existing profiles are never overwritten. A pair is skipped when the
        audio does not start with a RIFF header or the transcript is empty.
        """
        for wav_path in sorted(Path(SAMPLE_DIR).glob("*.wav")):
            name = wav_path.stem
            txt_path = wav_path.with_suffix(".txt")
            if name in self.voice_manager.profiles or not txt_path.is_file():
                continue
            try:
                with open(wav_path, "rb") as f:
                    looks_like_wav = f.read(4) == b"RIFF"
                transcript = txt_path.read_text(encoding="utf-8").strip()
            except (OSError, UnicodeDecodeError) as exc:
                print(f"Skipping sample {name}: {exc}")
                continue
            if looks_like_wav and transcript:
                print(self.voice_manager.add_profile(name, f"{SAMPLE_DIR}/{wav_path.name}", transcript))

    def initialize_tts(self):
        """Return the ``NeuTTSAir`` instance, creating it on the first call."""
        if self.tts is None:
            print("π Initializing NeuTTS Q4 GGUF...")
            self.tts = NeuTTSAir(
                backbone_repo="neuphonic/neutts-air-q4-gguf",
                backbone_device="cpu",
                codec_repo="neuphonic/neucodec",
                codec_device="cpu",
            )
        return self.tts

    def _reference_codes(self, tts, audio_path):
        """Return the encoded reference for *audio_path*, encoding it at most once per file version."""
        # NOTE(review): assumes tts.infer does not modify the codes it is
        # given, so one encoding can be shared between calls - confirm
        # against the NeuTTS Air documentation.
        mtime = os.path.getmtime(audio_path)
        cached = self._ref_cache.get(audio_path)
        if cached is None or cached[0] != mtime:
            cached = (mtime, tts.encode_reference(audio_path))
            self._ref_cache[audio_path] = cached
        return cached[1]

    def generate_speech(self, text, voice_name):
        """Synthesise *text* with the stored voice *voice_name*.

        Returns:
            ``(wav, None)`` on success, or ``(None, message)`` when the
            profile is unknown or any step raises.
        """
        try:
            tts = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)
            if not profile:
                return None, f"β Voice profile '{voice_name}' not found"
            ref_codes = self._reference_codes(tts, profile["audio_path"])
            ref_text = profile["text"]
            wav = tts.infer(text, ref_codes, ref_text)
            return wav, None
        except Exception as e:
            # Boundary towards the UI: callers receive the failure as a
            # message instead of an exception.
            return None, f"β Generation error: {str(e)}"
# ===== SCRIPT PARSING =====
def parse_conversation_script(script_text):
    """Parse a ``Speaker: dialogue`` script into a list of spoken lines.

    Each non-blank line is split at its first colon. Lines without a colon,
    and lines whose label before the colon is empty, are attributed to
    ``"Speaker A"``. A label with nothing after the colon is dropped because
    there is nothing to speak.

    Args:
        script_text: The raw script; ``None`` and empty strings are accepted.

    Returns:
        A list of ``{"speaker": str, "text": str}`` dicts in script order.
    """
    default_speaker = "Speaker A"
    parsed = []
    for raw_line in (script_text or "").strip().split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue
        label, colon, dialogue = entry.partition(':')
        if not colon:
            parsed.append({"speaker": default_speaker, "text": entry})
            continue
        dialogue = dialogue.strip()
        if dialogue:
            parsed.append({
                "speaker": label.strip() or default_speaker,
                "text": dialogue,
            })
    return parsed
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line podcast script about *prompt* from a fixed template.

    Args:
        prompt: Topic inserted into the opening line. Surrounding whitespace
            is removed when it is a string.
        style: ``"conversational"``, ``"interview"`` or ``"debate"``, matched
            without regard to case or surrounding whitespace. Any other
            value falls back to ``"conversational"``.

    Returns:
        The script as one string with lines separated by ``"\\n"``, each in
        ``Speaker: dialogue`` form.
    """
    # Simple template-based generation; only the first line of each template
    # contains the {prompt} placeholder.
    templates = {
        "conversational": [
            "Host: Welcome to our podcast! Today we're discussing {prompt}",
            "Co-host: That's right! It's a fascinating topic that affects many people.",
            "Host: Let's start with the basics. What should our audience know about this?",
            "Co-host: Well, first of all, it's important to understand the key concepts.",
            "Host: And what about the practical applications? How can people use this in their daily lives?",
            "Co-host: Great question! There are several ways to apply this knowledge effectively.",
        ],
        "interview": [
            "Interviewer: Thanks for joining us today to talk about {prompt}",
            "Guest: Happy to be here! It's a topic I'm very passionate about.",
            "Interviewer: Could you share some background on how you got involved in this field?",
            "Guest: Absolutely. It all started several years ago when I first discovered this area.",
            "Interviewer: What are the most exciting developments you're seeing right now?",
            "Guest: There are some incredible advancements happening that will change everything.",
        ],
        "debate": [
            "Moderator: Welcome to our debate on {prompt}",
            "Proponent: I believe this is one of the most important issues of our time.",
            "Opponent: While I respect that view, I have some serious concerns about the approach.",
            "Proponent: Let me address those concerns with some concrete evidence.",
            "Opponent: The evidence is compelling, but we must consider the broader implications.",
            "Moderator: Let's hear from both sides about potential solutions.",
        ],
    }
    style_key = style.strip().lower() if isinstance(style, str) else style
    template = templates.get(style_key, templates["conversational"])
    topic = prompt.strip() if isinstance(prompt, str) else prompt
    return "\n".join(line.format(prompt=topic) for line in template)
# ===== MAIN GENERATION FUNCTIONS =====
# One engine shared by all handlers below. Constructing it loads the saved
# voice profiles and triggers the default-sample download.
tts_engine = TTSEngine()
def clone_voice(voice_name, upload_audio, reference_text):
    """Register a new voice profile from an uploaded reference recording.

    The upload is copied into ``SAMPLE_DIR`` under the voice name (keeping
    the upload's file extension) and the profile is stored through
    ``tts_engine.voice_manager``.

    Args:
        voice_name: Name for the new voice. Only its final path component is
            used, with surrounding whitespace removed.
        upload_audio: Filesystem path of the uploaded recording.
        reference_text: Transcript saved with the profile.

    Returns:
        A status message; failures are reported in the message, not raised.
    """
    # The name becomes part of a file path below, so directory parts such as
    # "../" are discarded to keep the copy inside SAMPLE_DIR.
    safe_name = Path(str(voice_name or "").strip().replace("\\", "/")).name
    if safe_name in {".", ".."}:
        safe_name = ""
    if not safe_name or not upload_audio:
        return "β Please provide a voice name and audio file"
    try:
        # Save uploaded audio
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{safe_name}{audio_ext}"
        shutil.copy2(upload_audio, audio_path)
        # Save voice profile
        return tts_engine.voice_manager.add_profile(safe_name, audio_path, reference_text)
    except Exception as e:
        return f"β Error cloning voice: {str(e)}"
def _pick_voice(speaker, speaker_a, speaker_b, assigned):
    """Return the voice for *speaker*, recording choices made for unrecognised labels in *assigned*."""
    key = speaker.strip().lower()
    words = key.replace("-", " ").replace("_", " ").split()
    padded = f" {' '.join(words)} "
    last_word = words[-1] if words else ""
    # Whole words are compared, not substrings: "speaker b" contains the
    # letter "a", and "co-host" contains "host", so B is tested first.
    if " co host " in padded or "cohost" in words or "guest" in words or last_word == "b":
        return speaker_b
    if "host" in words or "interviewer" in words or last_word == "a":
        return speaker_a
    if key not in assigned:
        # Unrecognised labels take turns between the two voices in order of
        # first appearance and then keep that voice for the whole script.
        assigned[key] = speaker_a if len(assigned) % 2 == 0 else speaker_b
    return assigned[key]


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    When no script is given but a prompt is, a script is generated from the
    prompt first. Every script line is synthesised with the voice chosen for
    its speaker label and the clips are joined with a half-second pause.

    Args:
        script_input: Script in ``Speaker: dialogue`` form; may be empty.
        speaker_a: Profile name used for host / interviewer / "A" labels.
        speaker_b: Profile name used for co-host / guest / "B" labels.
        prompt_input: Topic used to generate a script when none is given.
        script_style: Template style forwarded to the script generator.

    Returns:
        Always a 3-tuple ``(audio_path, script_path, status)``; both paths
        are ``None`` when generation fails.
    """
    try:
        # Generate script if prompt is provided
        has_script = bool(script_input and script_input.strip())
        if not has_script and prompt_input:
            script_input = generate_script_from_prompt(prompt_input, script_style)
            has_script = bool(script_input and script_input.strip())
        if not has_script:
            return None, None, "β Please provide either a script or a prompt"

        # Parse conversation; entries without text have nothing to speak.
        conversation = [
            entry for entry in parse_conversation_script(script_input) if entry["text"]
        ]
        if not conversation:
            return None, None, "β Could not parse script"

        # NOTE(review): the rate is hard-coded; it is assumed to match what
        # tts_engine.generate_speech returns - confirm against the model.
        current_sample_rate = 24000
        segments = []
        assigned = {}
        for entry in conversation:
            speaker = entry["speaker"]
            text = entry["text"]
            voice = _pick_voice(speaker, speaker_a, speaker_b, assigned)
            print(f"ποΈ {speaker} ({voice}): {text}")

            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error

            wav = np.asarray(wav)
            if segments:
                # 0.5 second pause between consecutive lines
                segments.append(np.zeros(int(0.5 * current_sample_rate), dtype=wav.dtype))
            segments.append(wav)

        # Joined once at the end instead of re-copying the audio per line.
        combined_audio = np.concatenate(segments)

        # Reserve a file name, then write after the handle is closed so the
        # audio library can reopen the path on every platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, current_sample_rate)

        # Save script next to the audio; only the trailing suffix is swapped.
        script_file = audio_file[: -len(".wav")] + "_script.txt"
        with open(script_file, 'w', encoding="utf-8") as f:
            f.write(script_input)
        return audio_file, script_file, "β Podcast generated successfully!"
    except Exception as e:
        return None, None, f"β Error: {str(e)}"
# ===== GRADIO UI =====
# Stylesheet passed to gr.Blocks(css=...) when the interface is built.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
<div class="header">
<h1>ποΈ 2nd-Host AI - Complete Podcast Studio</h1>
<h3>Voice Cloning β’ 2-Speaker Podcasts β’ Script Generation β’ Export</h3>
</div>
""")

    def _default_voice_pair(voices):
        """Return default (first, second) selections; second falls back to first, both to None."""
        first = voices[0] if voices else None
        second = voices[1] if len(voices) > 1 else first
        return first, second

    # Initialize voice manager: the profile names known at build time seed
    # every voice dropdown.
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()
    initial_a, initial_b = _default_voice_pair(available_voices)

    with gr.Tab("π Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("π― Clone Voice", variant="primary")
            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=initial_a
                )
                refresh_btn = gr.Button("π Refresh Voices")

    with gr.Tab("π¬ Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")
        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )
                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("π Generate Script", variant="secondary")
            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="π€ Speaker A (Host)",
                    value=initial_a
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="π€ Speaker B (Co-host/Guest)",
                    value=initial_b
                )
                generate_btn = gr.Button("π Generate Podcast", variant="primary", size="lg")

    with gr.Tab("π€ Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="π§ Podcast Audio", type="filepath")
                script_output = gr.File(label="π Script File", file_types=[".txt"])
            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button
                # anywhere in the visible code.
                download_btn = gr.Button("πΎ Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Re-read the saved profiles and rebuild all three voice dropdowns.

        Returns updates for, in order: the "Available Voices" list,
        Speaker A and Speaker B.
        """
        voices = VoiceProfileManager().list_profiles()
        first, second = _default_voice_pair(voices)
        return (
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=second),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone a voice, then refresh the dropdowns so it is selectable at once."""
        result = clone_voice(voice_name, audio_path, text)
        return (result, *refresh_voices())

    def handle_generate_script(prompt, style):
        """Fill the script box from the prompt; leave it untouched when the prompt is empty."""
        if not prompt:
            # Shown as a notification instead of being written into the
            # script box, where it would later be synthesised as dialogue.
            gr.Warning("Please enter a prompt")
            return gr.update()
        return generate_script_from_prompt(prompt, style)

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the form values to generate_podcast."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b]
    )
    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b]
    )
    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )
    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )
if __name__ == "__main__":
    # Start the web server only when this file is executed directly;
    # share=True is passed to launch() to request a shareable link.
    demo.launch(share=True)