# Hugging Face Spaces page residue (Space status: Sleeping) - not part of the program.
import gradio as gr | |
import numpy as np | |
import tempfile | |
import os | |
from kittentts import KittenTTS | |
import soundfile as sf | |
# Initialize the TTS model once at import time; generate_speech() reads the
# module-level `tts_model` created here.
print("Loading KittenTTS model from Hugging Face...")
try:
    # Only the constructor is guarded, so a failure reported below is
    # always a model-loading failure.
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    # Re-raise: the app cannot work without a model.
    raise
else:
    print("✅ KittenTTS model loaded successfully!")
# Friendly voice names (shown in the UI) mapped to the model's voice ids.
# This mapping is the single source of truth for the supported voices.
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

# Available voice ids, derived from the mapping so the two can never drift
# apart (same values and order as the previously hand-written list).
AVAILABLE_VOICES = list(VOICE_MAPPING.values())

print(f"✅ Available voices: {AVAILABLE_VOICES}")

# Maximum accepted input length in characters.
MAX_CHARS = 420  # we don't know the exact limit at this point - works experimentally
def generate_speech(text, voice_choice):
    """
    Generate speech from text using KittenTTS with voice selection

    Args:
        text (str): The text to convert to speech. ``None`` or blank text is
            rejected with a status message instead of raising.
        voice_choice (str): The selected voice option (a key of
            ``VOICE_MAPPING``). Any other value falls back to the model's
            default voice.

    Returns:
        tuple: ``(audio, status)``. ``audio`` is ``(sample_rate, audio_array)``
        for the Gradio audio component, or ``None`` when nothing was
        generated; ``status`` is a human-readable message for the status box.
    """
    # Guard against None as well as empty/whitespace-only input; calling
    # .strip() on None would raise before any error handling is in place.
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Check text length - KittenTTS nano model has context limitations
    char_count = len(text)
    if char_count > MAX_CHARS:
        return None, f"Text too long! Please limit to {MAX_CHARS} characters. Current length: {char_count} characters."

    # Added because the model cuts off the audio sometimes. Kept in a separate
    # variable so the reported character count reflects the user's text only.
    padded_text = text + " ..."

    try:
        # Get voice identifier (None when the label is not a known voice)
        voice_id = VOICE_MAPPING.get(voice_choice)

        # Generate audio using KittenTTS
        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            # Use specific voice
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is in the right format for Gradio
        if isinstance(audio, np.ndarray):
            # Make sure audio is float32 and in the [-1.0, 1.0] range
            audio = audio.astype(np.float32)
            if audio.size > 0:
                peak = np.max(np.abs(audio))
                if peak > 1.0:
                    audio = audio / peak

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (sample_rate, audio), f"Speech generated successfully{voice_msg}! ({char_count} characters)"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        if "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        return None, f"Error generating speech: {error_msg}"
def create_interface():
    """Create the Gradio interface

    Builds a Blocks layout with a voice dropdown, a text box limited to
    ``MAX_CHARS`` characters, a generate button, a read-only status box and
    an audio player. Both the button click and the text box submit event
    call ``generate_speech``.

    Returns:
        The assembled ``gr.Blocks`` instance (not launched).
    """
    voice_labels = list(VOICE_MAPPING)

    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:
        # NOTE(review): the indented Markdown literals below rely on Gradio
        # dedenting Markdown values - confirm for the pinned Gradio version.
        gr.Markdown("""
        # 🐱 KittenTTS - High Quality Text-to-Speech
        Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1),
        a lightweight TTS model that works without GPU!
        Choose from multiple voice options and enter your text to hear the synthesized speech.
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=voice_labels,
                    value=voice_labels[0],
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices"
                )

                # Text input
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model"
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )

                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        examples = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"]
        ]
        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out"
        )

        # Event handlers: the button click and the Enter key in the text box
        # trigger the same generation call, registered in that order.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_msg],
                show_progress=True
            )

        # Footer. The blank line before "Usage Tips" keeps that heading from
        # being parsed as a continuation of the preceding list item.
        gr.Markdown("""
        ---
        **About KittenTTS Nano:**
        - Lightweight 15M parameter text-to-speech model
        - Works without GPU - optimized for efficiency
        - Multiple voice options (male and female variants)
        - 24kHz output sample rate
        - **Best with short texts (under 400 characters)**
        - Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
        - Built by [KittenML](https://github.com/KittenML/KittenTTS)

        **Usage Tips for Nano Model:**
        - ✅ Keep text short and simple (about 400 characters)
        - ✅ Use common words and standard punctuation
        - ✅ Break long content into shorter sentences
        - ❌ Avoid very long sentences or complex punctuation
        - ❌ Avoid technical jargon or unusual words
        """)

    return demo
# Script entry point: build the interface and serve it.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }

    demo = create_interface()
    # Launch the app
    demo.launch(**launch_options)