"""Gradio web demo that turns text into speech with the KittenTTS model."""

from typing import Optional, Tuple

import gradio as gr
import numpy as np
from kittentts import KittenTTS

# Initialize the model once at import time so every request reuses it.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

# Sample rate reported to the Gradio Audio component alongside the samples.
# NOTE(review): 24 kHz is taken from the original code; confirm it matches
# the model's actual output rate.
SAMPLE_RATE = 24000

# Available voices
AVAILABLE_VOICES = [
    'expr-voice-2-m',
    'expr-voice-2-f',
    'expr-voice-3-m',
    'expr-voice-3-f',
    'expr-voice-4-m',
    'expr-voice-4-f',
    'expr-voice-5-m',
    'expr-voice-5-f',
]


def generate_speech(
    text: Optional[str], voice: str
) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
    """Generate speech from text using KittenTTS.

    Args:
        text: The text to synthesize. ``None``, empty, or whitespace-only
            input is rejected with a prompt message instead of being sent
            to the model.
        voice: Voice identifier passed through to ``model.generate``.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(SAMPLE_RATE, samples)``
        on success and ``None`` on failure; ``status`` is a human-readable
        message for the status textbox.
    """
    # Guard against None as well as blank input so the check itself
    # cannot raise outside the try block below.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # should surface as a status message rather than crash the request.
    try:
        # Generate audio; asarray is a no-op for ndarrays and lets the
        # shape/arithmetic below work if the model returns an array-like.
        audio = np.asarray(model.generate(text, voice=voice))

        if audio.size == 0:
            return None, "❌ Error generating speech: model returned no audio"

        # Collapse multi-dimensional output to one channel.
        # NOTE(review): assumes a (samples, channels) layout - confirm
        # against the KittenTTS output shape.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Peak-normalize; skip all-zero (silent) audio to avoid dividing by 0.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # Gradio's Audio component accepts (sample_rate, audio_data).
        return (
            (SAMPLE_RATE, audio),
            f"✅ Successfully generated speech with voice: {voice}",
        )
    except Exception as e:
        return None, f"❌ Error generating speech: {e}"


# Create the interface using Interface instead of Blocks
demo = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(
            label="Enter your text",
            placeholder="Type your text here...",
            lines=3,
        ),
        gr.Dropdown(
            choices=AVAILABLE_VOICES,
            value=AVAILABLE_VOICES[1],
            label="Select Voice",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Status", interactive=False),
    ],
    title="🎤 KittenTTS - High Quality Text-to-Speech",
    description="Generate natural-sounding speech from text using the KittenTTS model",
    examples=[
        ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
        ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
        ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
    ],
)

# Launch the demo
if __name__ == "__main__":
    # Binds to all interfaces on port 7860 and requests a Gradio share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )