"""Gradio web demo for KittenTTS text-to-speech generation."""

import os

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS

# Initialize the model once at import time so every request reuses it.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

# Output sample rate in Hz (the UI footer advertises 24kHz for this model).
SAMPLE_RATE = 24000

# Available voices
AVAILABLE_VOICES = [
    'expr-voice-2-m',
    'expr-voice-2-f',
    'expr-voice-3-m',
    'expr-voice-3-f',
    'expr-voice-4-m',
    'expr-voice-4-f',
    'expr-voice-5-m',
    'expr-voice-5-f',
]

# Custom CSS for better styling.
_CSS = (
    " .gradio-container { max-width: 800px !important; margin: auto !important; }"
    " .main-header { text-align: center; margin-bottom: 2rem; }"
    " .voice-selector { margin: 1rem 0; }"
    " .output-audio { margin-top: 1rem; } "
)

# Static HTML fragments rendered by the interface.
_HEADER_HTML = (
    "\n\n"
    "🎤 KittenTTS\n\n"
    "High Quality Text-to-Speech Generation\n\n"
    "Generate natural-sounding speech from text using the KittenTTS model\n\n"
)
_VOICES_HTML = "\n\nAvailable Voices:\n\n"
_FOOTER_HTML = (
    "\n\n"
    "KittenTTS - Powered by KittenML/kitten-tts-nano-0.1\n\n"
    "Model: KittenTTS Nano v0.1 | Sample Rate: 24kHz\n\n"
)


def _prepare_audio(audio):
    """Return *audio* as a mono, peak-normalized float32 array.

    Raises:
        ValueError: If the model produced no samples.
    """
    samples = np.asarray(audio, dtype=np.float32)

    # Down-mix multi-channel audio.
    # NOTE(review): assumes a (samples, channels) layout - confirm against
    # the model's actual output shape.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if samples.size == 0:
        raise ValueError("model returned no audio samples")

    # Compute the peak once; skip the division for pure silence.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak
    return samples


# gr.Progress() as a default argument is the Gradio convention for
# requesting a progress tracker; it is not a shared mutable default.
def generate_speech(text, voice, progress=gr.Progress()):
    """Generate speech from text using KittenTTS.

    Args:
        text: The text to synthesize. ``None`` or whitespace-only input is
            rejected with a hint message.
        voice: Voice identifier passed to the model (see AVAILABLE_VOICES).
        progress: Gradio progress tracker.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is a
        ``(sample_rate, samples)`` tuple, the form ``gr.Audio`` expects for
        numpy output; on failure it is ``None``. ``status`` is a
        human-readable message.
    """
    # Guard against None as well as blank strings.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        progress(0.3, desc="Loading model...")

        # Generate audio
        progress(0.6, desc="Generating speech...")
        raw_audio = model.generate(text, voice=voice)

        progress(0.9, desc="Processing audio...")
        samples = _prepare_audio(raw_audio)

        progress(1.0, desc="Complete!")
        # A bare ndarray is not accepted by gr.Audio; it needs the sample
        # rate alongside the samples.
        return (
            (SAMPLE_RATE, samples),
            f"✅ Successfully generated speech with voice: {voice}",
        )
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return None, f"❌ Error generating speech: {str(e)}"


def create_demo():
    """Create the Gradio demo interface.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks(
        css=_CSS,
        title="KittenTTS - High Quality Text-to-Speech",
    ) as demo:
        # Header
        gr.HTML(_HEADER_HTML)

        with gr.Row():
            with gr.Column(scale=2):
                # Text input
                text_input = gr.Textbox(
                    label="Enter your text",
                    placeholder="Type or paste your text here...",
                    lines=4,
                    max_lines=10,
                )

                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=AVAILABLE_VOICES,
                    value=AVAILABLE_VOICES[1],  # Default to female voice
                    label="Select Voice",
                    info="Choose from 8 different voices (4 male, 4 female)",
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=1):
                # Voice info
                gr.HTML(_VOICES_HTML)

        # Output section
        with gr.Row():
            with gr.Column():
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                )

                # Status message
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                )

        # Example texts
        gr.Examples(
            examples=[
                ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
                ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
                ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
                ["This model works without requiring a GPU.", "expr-voice-3-m"],
            ],
            inputs=[text_input, voice_dropdown],
        )

        # Footer
        gr.HTML(_FOOTER_HTML)

        # Connect the generate button
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, voice_dropdown],
            outputs=[audio_output, status_output],
        )

        # Auto-generate when text is entered and Enter is pressed
        text_input.submit(
            fn=generate_speech,
            inputs=[text_input, voice_dropdown],
            outputs=[audio_output, status_output],
        )

    return demo


# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=False,
    )