Spaces:

Vishwas1
/

KittenTTSDemo

Running

App Files Files Community

KittenTTSDemo / app.py

Vishwas1

Upload 5 files

e327671 verified 29 days ago

raw

history blame contribute delete

5.96 kB

	import gradio as gr
	import soundfile as sf
	import numpy as np
	from kittentts import KittenTTS
	import os

	# Load the TTS model a single time at import; the repo id is named so it is
	# easy to spot and swap.
	MODEL_REPO_ID = "KittenML/kitten-tts-nano-0.1"
	model = KittenTTS(MODEL_REPO_ID)

	# Voice identifiers offered in the UI, listed as male/female pairs per voice number.
	AVAILABLE_VOICES = [
	    "expr-voice-2-m",
	    "expr-voice-2-f",
	    "expr-voice-3-m",
	    "expr-voice-3-f",
	    "expr-voice-4-m",
	    "expr-voice-4-f",
	    "expr-voice-5-m",
	    "expr-voice-5-f",
	]

	# Sample rate paired with the generated samples, in Hz. The demo footer
	# advertises 24kHz for this model.
	_SAMPLE_RATE_HZ = 24000


	def generate_speech(text, voice, progress=gr.Progress()):
	    """
	    Generate speech from text using KittenTTS.

	    Args:
	        text: Text to synthesize. None, empty, or whitespace-only input is
	            rejected with a prompt instead of being sent to the model.
	        voice: Voice identifier forwarded to ``model.generate``.
	        progress: Gradio progress tracker used to report coarse stages.

	    Returns:
	        An ``(audio, status)`` pair. On success ``audio`` is a
	        ``(sample_rate, samples)`` tuple, which is the form a Gradio Audio
	        output accepts for in-memory data; on failure it is ``None``.
	        ``status`` is always a human-readable message for the status box.
	    """
	    # Guard against None as well as blank text; None has no .strip().
	    if not text or not text.strip():
	        return None, "Please enter some text to generate speech."

	    try:
	        progress(0.3, desc="Loading model...")

	        # Generate audio
	        progress(0.6, desc="Generating speech...")
	        audio = np.asarray(model.generate(text, voice=voice))

	        progress(0.9, desc="Processing audio...")

	        # Collapse multi-channel output to mono.
	        # NOTE(review): assumes a (samples, channels) layout - confirm
	        # against the model's actual output shape.
	        if audio.ndim > 1:
	            audio = audio.mean(axis=1)

	        # np.max raises on an empty array; report that case explicitly
	        # rather than leaking a NumPy reduction error to the user.
	        if audio.size == 0:
	            return None, "❌ Error generating speech: the model returned no audio."

	        # Peak-normalize. Pure silence (peak 0) is left untouched to avoid
	        # dividing by zero.
	        peak = np.max(np.abs(audio))
	        if peak > 0:
	            audio = audio / peak

	        progress(1.0, desc="Complete!")

	        # A bare ndarray is not a valid Audio output value; it must be
	        # paired with its sample rate.
	        return (_SAMPLE_RATE_HZ, audio), f"✅ Successfully generated speech with voice: {voice}"

	    except Exception as e:
	        # UI boundary: show the failure in the status box instead of crashing
	        # the request.
	        return None, f"❌ Error generating speech: {str(e)}"

	def create_demo():
	    """
	    Build the Gradio Blocks interface for the KittenTTS demo.

	    The layout has a header, a text box with voice selector and generate
	    button next to a voice reference panel, an audio player with a status
	    box, a set of example inputs, and a footer. Both the button click and
	    pressing Enter in the text box call ``generate_speech``.

	    Returns:
	        The constructed ``gr.Blocks`` app. It is not launched here.
	    """

	    # Custom CSS for better styling.
	    # NOTE(review): only .main-header is referenced by markup built in this
	    # function; .voice-selector and .output-audio are not attached to any
	    # component here.
	    css = """
	.gradio-container {
	max-width: 800px !important;
	margin: auto !important;
	}
	.main-header {
	text-align: center;
	margin-bottom: 2rem;
	}
	.voice-selector {
	margin: 1rem 0;
	}
	.output-audio {
	margin-top: 1rem;
	}
	"""

	    # Derive the per-gender listings from AVAILABLE_VOICES so the help text
	    # cannot drift from the dropdown choices.
	    male_voices = [name for name in AVAILABLE_VOICES if name.endswith("-m")]
	    female_voices = [name for name in AVAILABLE_VOICES if name.endswith("-f")]
	    male_listing = ", ".join(male_voices)
	    female_listing = ", ".join(female_voices)
	    voice_count_info = (
	        f"Choose from {len(AVAILABLE_VOICES)} different voices "
	        f"({len(male_voices)} male, {len(female_voices)} female)"
	    )

	    with gr.Blocks(css=css, title="KittenTTS - High Quality Text-to-Speech") as demo:

	        # Header
	        gr.HTML("""
	<div class="main-header">
	<h1>🎤 KittenTTS</h1>
	<p><em>High Quality Text-to-Speech Generation</em></p>
	<p>Generate natural-sounding speech from text using the KittenTTS model</p>
	</div>
	""")

	        with gr.Row():
	            with gr.Column(scale=2):
	                # Text input
	                text_input = gr.Textbox(
	                    label="Enter your text",
	                    placeholder="Type or paste your text here...",
	                    lines=4,
	                    max_lines=10,
	                )

	                # Voice selection
	                voice_dropdown = gr.Dropdown(
	                    choices=AVAILABLE_VOICES,
	                    value=AVAILABLE_VOICES[1],  # Default to female voice
	                    label="Select Voice",
	                    info=voice_count_info,
	                )

	                # Generate button
	                generate_btn = gr.Button(
	                    "🎵 Generate Speech",
	                    variant="primary",
	                    size="lg",
	                )

	            with gr.Column(scale=1):
	                # Voice info
	                gr.HTML(f"""
	<div style="background: #f0f0f0; padding: 1rem; border-radius: 8px;">
	<h3>Available Voices:</h3>
	<ul>
	<li><strong>Male voices:</strong> {male_listing}</li>
	<li><strong>Female voices:</strong> {female_listing}</li>
	</ul>
	</div>
	""")

	        # Output section
	        with gr.Row():
	            with gr.Column():
	                # Audio output
	                audio_output = gr.Audio(
	                    label="Generated Audio",
	                    type="numpy",
	                )

	                # Status message
	                status_output = gr.Textbox(
	                    label="Status",
	                    interactive=False,
	                )

	        # Example texts
	        gr.Examples(
	            examples=[
	                ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
	                ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
	                ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
	                ["This model works without requiring a GPU.", "expr-voice-3-m"],
	            ],
	            inputs=[text_input, voice_dropdown],
	        )

	        # Footer. The separator is a plain "|"; "\|" is not a valid Python
	        # escape sequence and would also render a stray backslash.
	        gr.HTML("""
	<div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f9f9f9; border-radius: 8px;">
	<p><strong>KittenTTS</strong> - Powered by <a href="https://huggingface.co/KittenML/kitten-tts-nano-0.1" target="_blank">KittenML/kitten-tts-nano-0.1</a></p>
	<p>Model: KittenTTS Nano v0.1 | Sample Rate: 24kHz</p>
	</div>
	""")

	        # Clicking the button and pressing Enter in the text box run the same
	        # handler with the same inputs and outputs, so wire them in one place.
	        for register_listener in (generate_btn.click, text_input.submit):
	            register_listener(
	                fn=generate_speech,
	                inputs=[text_input, voice_dropdown],
	                outputs=[audio_output, status_output],
	            )

	    return demo

	# Create and launch the demo
	if __name__ == "__main__":
	    demo = create_demo()

	    # Hugging Face Spaces exposes a SPACE_ID environment variable. Share
	    # tunnels are not supported there (the Space already has a public URL),
	    # so only request one when running elsewhere.
	    running_on_spaces = "SPACE_ID" in os.environ

	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=not running_on_spaces,
	        debug=False,
	    )