# KittenTTSNano / app.py
# fxPracht's picture
# updated text box
# 22a2bf5 verified
import gradio as gr
import numpy as np
import tempfile
import os
from kittentts import KittenTTS
import soundfile as sf
# Initialize the TTS model once at import time; generate_speech() reuses this
# module-level instance for every request.
print("Loading KittenTTS model from Hugging Face...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    # Log a hint for the operator, then re-raise: the app cannot run without
    # a model, so startup should fail loudly.
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    raise
else:
    print("✅ KittenTTS model loaded successfully!")
# Friendly voice names (shown in the UI dropdown) mapped to the voice
# identifiers that are handed to the model.
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

# Available voice identifiers. Derived from VOICE_MAPPING (same contents and
# order as before) so the two definitions cannot drift apart.
AVAILABLE_VOICES = list(VOICE_MAPPING.values())

print(f"✅ Available voices: {AVAILABLE_VOICES}")

# Maximum accepted input length in characters. The exact model limit is not
# known at this point; this value works experimentally.
MAX_CHARS = 420
def generate_speech(text, voice_choice):
    """Synthesize speech for ``text`` with KittenTTS using the chosen voice.

    Args:
        text (str): The text to convert to speech. Missing or blank text is
            rejected, as is text longer than ``MAX_CHARS`` characters.
        voice_choice (str): The selected voice option (a key of
            ``VOICE_MAPPING``). Unknown options fall back to the model's
            default voice.

    Returns:
        tuple: ``(audio, status)``. ``audio`` is ``(sample_rate, audio_array)``
        for the Gradio audio component, or ``None`` when nothing was
        generated. ``status`` is a human-readable message. Errors raised
        during generation are caught and reported through ``status``.
    """
    # Treat a missing value like blank input instead of failing on .strip().
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Measure the user's own text, before the padding below is appended, so
    # the limit check and the status message both reflect what was typed.
    char_count = len(text)

    # Check text length - KittenTTS nano model has context limitations
    if char_count > MAX_CHARS:
        return None, (
            f"Text too long! Please limit to {MAX_CHARS} characters. "
            f"Current length: {char_count} characters."
        )

    # Added because the model cuts off the audio sometimes.
    padded_text = text + " ..."

    try:
        # Resolve the friendly name to a voice identifier (None if unknown).
        voice_id = VOICE_MAPPING.get(voice_choice)

        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is float32 and within [-1.0, 1.0] for Gradio.
        if isinstance(audio, np.ndarray):
            audio = audio.astype(np.float32)
            if audio.size > 0:
                peak = np.max(np.abs(audio))
                if peak > 1.0:
                    audio = audio / peak

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (
            (sample_rate, audio),
            f"Speech generated successfully{voice_msg}! ({char_count} characters)",
        )
    except Exception as e:
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        if "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        return None, f"Error generating speech: {error_msg}"
def create_interface():
    """Create the Gradio interface.

    Builds a Blocks layout with a header, a two-column row (voice dropdown,
    text box, generate button and status box in the wider column; the audio
    player in the other), clickable examples and a footer. Both clicking the
    button and submitting the text box call ``generate_speech``.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    voice_names = list(VOICE_MAPPING)

    # The multi-line Markdown literals below are intentionally flush with
    # column 0 so that no source indentation ends up inside the text.
    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:
        gr.Markdown("""
# 🐱 KittenTTS - High Quality Text-to-Speech
Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1),
a lightweight TTS model that works without GPU!
Choose from multiple voice options and enter your text to hear the synthesized speech.
""")

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                # NOTE(review): the label glyph may be extraction damage
                # (likely a microphone emoji upstream) - confirm.
                voice_dropdown = gr.Dropdown(
                    choices=voice_names,
                    value=voice_names[0],
                    label="🎀 Select Voice",
                    info="Choose between different male and female voices",
                )

                # Text input
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model",
                )

                # Generate button
                # NOTE(review): the button glyph may be extraction damage
                # (likely a musical-note emoji upstream) - confirm.
                generate_btn = gr.Button(
                    "🎡 Generate Speech",
                    variant="primary",
                    size="lg",
                )

                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True,
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        examples = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"],
        ]
        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out",
        )

        # Event handlers: the button click and the Enter key in the text box
        # trigger the same generation call with the same wiring.
        handler_config = {
            "fn": generate_speech,
            "inputs": [text_input, voice_dropdown],
            "outputs": [audio_output, status_msg],
            "show_progress": True,
        }
        generate_btn.click(**handler_config)
        text_input.submit(**handler_config)

        # Footer. The blank line before "Usage Tips" keeps that heading from
        # being rendered as a continuation of the preceding list item.
        gr.Markdown("""
---
**About KittenTTS Nano:**
- Lightweight 15M parameter text-to-speech model
- Works without GPU - optimized for efficiency
- Multiple voice options (male and female variants)
- 24kHz output sample rate
- **Best with short texts (under 400 characters)**
- Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
- Built by [KittenML](https://github.com/KittenML/KittenTTS)

**Usage Tips for Nano Model:**
- ✅ Keep text short and simple (about 400 characters)
- ✅ Use common words and standard punctuation
- ✅ Break long content into shorter sentences
- ❌ Avoid very long sentences or complex punctuation
- ❌ Avoid technical jargon or unusual words
""")

    return demo
# Script entry point: build the interface and serve it.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }
    demo = create_interface()
    demo.launch(**launch_options)