# 🎤 KittenTTS

import gradio as gr
import soundfile as sf
import numpy as np
from kittentts import KittenTTS
import os

# Load the KittenTTS nano checkpoint once at import time; requests reuse it.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

# Voice identifiers offered in the UI: speakers 2 through 5, each with a
# male ("m") and a female ("f") variant, ordered male-then-female per speaker.
AVAILABLE_VOICES = [
    f"expr-voice-{speaker}-{gender}"
    for speaker in range(2, 6)
    for gender in ("m", "f")
]

# Sample rate (Hz) reported alongside generated audio; the demo footer
# advertises the model output as 24 kHz.
SAMPLE_RATE = 24000


def generate_speech(text, voice, progress=gr.Progress()):
    """
    Generate speech from text using KittenTTS.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a status message.
        voice: Voice identifier forwarded to ``model.generate``.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        An ``(audio, status)`` pair. On success ``audio`` is a
        ``(sample_rate, samples)`` tuple -- the form a ``gr.Audio`` output
        accepts for numpy data -- with ``samples`` peak-normalized. On empty
        input or failure ``audio`` is ``None`` and ``status`` explains why.
    """
    # Guard against None as well as blank text so .strip() cannot raise.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        progress(0.3, desc="Loading model...")

        # Generate audio
        progress(0.6, desc="Generating speech...")
        audio = model.generate(text, voice=voice)

        progress(0.9, desc="Processing audio...")

        # Collapse multi-channel output to mono.
        # NOTE(review): assumes a (samples, channels) layout -- confirm.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Peak-normalize; skip when the signal is all zeros to avoid 0/0.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        progress(1.0, desc="Complete!")

        # gr.Audio cannot process a bare ndarray; it needs the sample rate
        # paired with the samples.
        return (SAMPLE_RATE, audio), f"✅ Successfully generated speech with voice: {voice}"

    except Exception as e:
        # UI boundary: surface the failure in the status box rather than
        # letting the request crash.
        return None, f"❌ Error generating speech: {str(e)}"

def create_demo():
    """
    Assemble the Gradio Blocks interface for the KittenTTS demo.

    The page contains a text box, a voice dropdown and a generate button,
    a side panel listing the voices, an audio player with a status box,
    clickable example inputs, and a footer. Both the button click and
    pressing Enter in the text box invoke ``generate_speech``.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """

    # Page-level stylesheet handed to gr.Blocks
    page_css = """
    .gradio-container {
        max-width: 800px !important;
        margin: auto !important;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2rem;
    }
    .voice-selector {
        margin: 1rem 0;
    }
    .output-audio {
        margin-top: 1rem;
    }
    """

    # (text, voice) pairs shown as one-click examples
    sample_prompts = [
        ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
        ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
        ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
        ["This model works without requiring a GPU.", "expr-voice-3-m"],
    ]

    with gr.Blocks(title="KittenTTS - High Quality Text-to-Speech", css=page_css) as app:

        # Page title and tagline
        gr.HTML("""
        <div class="main-header">
            <h1>🎤 KittenTTS</h1>
            <p><em>High Quality Text-to-Speech Generation</em></p>
            <p>Generate natural-sounding speech from text using the KittenTTS model</p>
        </div>
        """)

        with gr.Row():
            # Left (wider) column: the user's inputs
            with gr.Column(scale=2):
                prompt_box = gr.Textbox(
                    lines=4,
                    max_lines=10,
                    label="Enter your text",
                    placeholder="Type or paste your text here...",
                )

                voice_picker = gr.Dropdown(
                    label="Select Voice",
                    info="Choose from 8 different voices (4 male, 4 female)",
                    choices=AVAILABLE_VOICES,
                    value=AVAILABLE_VOICES[1],  # second entry is the default voice
                )

                speak_button = gr.Button(
                    "🎵 Generate Speech",
                    size="lg",
                    variant="primary",
                )

            # Right (narrower) column: static list of the voices
            with gr.Column(scale=1):
                gr.HTML("""
                <div style="background: #f0f0f0; padding: 1rem; border-radius: 8px;">
                    <h3>Available Voices:</h3>
                    <ul>
                        <li><strong>Male voices:</strong> expr-voice-2-m, expr-voice-3-m, expr-voice-4-m, expr-voice-5-m</li>
                        <li><strong>Female voices:</strong> expr-voice-2-f, expr-voice-3-f, expr-voice-4-f, expr-voice-5-f</li>
                    </ul>
                </div>
                """)

        # Results: audio player followed by a read-only status line
        with gr.Row(), gr.Column():
            audio_player = gr.Audio(type="numpy", label="Generated Audio")
            status_box = gr.Textbox(interactive=False, label="Status")

        # Clicking an example fills in the text box and the voice dropdown
        gr.Examples(examples=sample_prompts, inputs=[prompt_box, voice_picker])

        # Attribution footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f9f9f9; border-radius: 8px;">
            <p><strong>KittenTTS</strong> - Powered by <a href="https://huggingface.co/KittenML/kitten-tts-nano-0.1" target="_blank">KittenML/kitten-tts-nano-0.1</a></p>
            <p>Model: KittenTTS Nano v0.1 | Sample Rate: 24kHz</p>
        </div>
        """)

        # The button click and Enter in the text box run the same handler
        # with the same inputs and outputs.
        for register_event in (speak_button.click, prompt_box.submit):
            register_event(
                fn=generate_speech,
                inputs=[prompt_box, voice_picker],
                outputs=[audio_player, status_box],
            )

    return app

# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # bind on all network interfaces
        "server_port": 7860,
        "share": True,  # also request a public Gradio share link
        "debug": False,
    }
    demo = create_demo()
    demo.launch(**launch_options)