#!/usr/bin/env python3
"""
Piper TTS Gradio Demo for Hugging Face Spaces

Supports Japanese and English text-to-speech using ONNX models
"""

import json
import logging

import gradio as gr
import numpy as np
import onnxruntime

from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE

# Download models if not present
from download_models import download_models

# Ensure models are downloaded
download_models()

# Import optional dependencies
if PYOPENJTALK_AVAILABLE:
    import pyopenjtalk

if ESPEAK_AVAILABLE:
    from espeak_phonemizer import Phonemizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model configurations
MODELS = {
    "Japanese (Medium)": {
        "path": "models/ja_JP-test-medium.onnx",
        "config": "models/ja_JP-test-medium.onnx.json",
        "language": "ja",
    },
    "English (Test)": {
        "path": "models/test_voice.onnx",
        "config": "models/test_voice.onnx.json",
        "language": "en",
    },
}

# Basic English word to IPA mapping for common words
# This is a simplified fallback when espeak-ng is not available
ENGLISH_IPA_MAP = {
    "hello": "hɛloʊ",
    "world": "wɜrld",
    "this": "ðɪs",
    "is": "ɪz",
    "a": "ə",
    "test": "tɛst",
    "text": "tɛkst",
    "to": "tu",
    "speech": "spitʃ",
    "demo": "dɛmoʊ",
    "welcome": "wɛlkəm",
    "piper": "paɪpər",
    "tts": "titiɛs",
    "enjoy": "ɛndʒɔɪ",
    "high": "haɪ",
    "quality": "kwɑləti",
    "synthesis": "sɪnθəsɪs",
    "the": "ðə",
    "and": "ænd",
    "for": "fɔr",
    "with": "wɪð",
    "you": "ju",
    "can": "kæn",
    "it": "ɪt",
    "that": "ðæt",
    "have": "hæv",
    "from": "frʌm",
    "or": "ɔr",
    "which": "wɪtʃ",
    "one": "wʌn",
    "would": "wʊd",
    "all": "ɔl",
    "will": "wɪl",
    "there": "ðɛr",
    "say": "seɪ",
    "who": "hu",
    "make": "meɪk",
    "when": "wɛn",
    "time": "taɪm",
    "if": "ɪf",
    "no": "noʊ",
    "way": "weɪ",
    "has": "hæz",
    "yes": "jɛs",
    "good": "gʊd",
    "very": "vɛri",
}
# Japanese multi-character phoneme to Unicode PUA mapping
# This mapping must match the C++ implementation and training data
PHONEME_TO_PUA = {
    # Long vowels
    "a:": "\ue000",
    "i:": "\ue001",
    "u:": "\ue002",
    "e:": "\ue003",
    "o:": "\ue004",
    # Special consonants
    "cl": "\ue005",  # Geminate/glottal stop
    # Palatalized consonants
    "ky": "\ue006",
    "kw": "\ue007",
    "gy": "\ue008",
    "gw": "\ue009",
    "ty": "\ue00a",
    "dy": "\ue00b",
    "py": "\ue00c",
    "by": "\ue00d",
    # Affricates and special sounds
    "ch": "\ue00e",
    "ts": "\ue00f",
    "sh": "\ue010",
    "zy": "\ue011",
    "hy": "\ue012",
    # Palatalized nasals/liquids
    "ny": "\ue013",
    "my": "\ue014",
    "ry": "\ue015",
}


def load_model_config(config_path: str) -> dict:
    """Load model configuration from JSON file"""
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)
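
# For reference, the fields this script reads from a Piper-style *.onnx.json
# config look roughly like the sketch below (illustrative, not the full schema;
# in Piper configs "^"/"$" are typically the BOS/EOS symbols and 0 the pad ID):
#
#   {
#     "audio": {"sample_rate": 22050},
#     "num_speakers": 1,
#     "phoneme_id_map": {"^": [1], "$": [2], ...}
#   }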


def map_phonemes(phonemes: list[str]) -> list[str]:
    """Map multi-character phonemes to Unicode PUA characters"""
    result = []
    for phoneme in phonemes:
        if phoneme in PHONEME_TO_PUA:
            result.append(PHONEME_TO_PUA[phoneme])
        else:
            result.append(phoneme)
    return result
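
# Illustrative example of the mapping above (values taken from PHONEME_TO_PUA;
# single-character phonemes and the sentence markers pass through unchanged):
#   map_phonemes(["^", "ky", "o:", "$"]) -> ["^", "\ue006", "\ue004", "$"]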


def text_to_phonemes(text: str, language: str) -> list[str]:
    """Convert text to phoneme strings based on language"""
    if language == "ja":
        if PYOPENJTALK_AVAILABLE:
            # Get full-context labels from OpenJTalk
            labels = pyopenjtalk.extract_fullcontext(text)
            phonemes = []
            for label in labels:
                # Extract the current phoneme, which sits between "-" and "+"
                # in the full-context label
                if "-" in label and "+" in label:
                    phoneme = label.split("-")[1].split("+")[0]
                    if phoneme not in ["sil", "pau"]:
                        phonemes.append(phoneme)
            # Add sentence markers
            phonemes = ["^"] + phonemes + ["$"]
            # Convert multi-character phonemes to Unicode PUA
            phonemes = map_phonemes(phonemes)
        else:
            logger.warning("pyopenjtalk not available, using fallback")
            # Simple fallback - just use dummy phonemes
            phonemes = ["^"] + list("aiueo") * 5 + ["$"]
    elif ESPEAK_AVAILABLE:  # English
        phonemizer = Phonemizer("en-us")
        phoneme_str = phonemizer.phonemize(text)
        # Convert phoneme string to list
        phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
    else:
        logger.warning("espeak_phonemizer not available, using IPA fallback")
        # IPA-based fallback for better English pronunciation
        words = text.lower().split()
        phonemes = ["^"]
        for i, word in enumerate(words):
            # Add space between words
            if i > 0:
                phonemes.append(" ")
            # Remove punctuation from word
            clean_word = "".join(c for c in word if c.isalpha())
            if clean_word in ENGLISH_IPA_MAP:
                # Use IPA mapping if available
                ipa = ENGLISH_IPA_MAP[clean_word]
                phonemes.extend(list(ipa))
            else:
                # Fall back to character-by-character for unknown words
                phonemes.extend(list(clean_word))
        phonemes.append("$")
    return phonemes
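
# Illustrative output of the IPA fallback path (espeak_phonemizer unavailable),
# derived from ENGLISH_IPA_MAP above:
#   text_to_phonemes("hello world", "en")
#   -> ["^", "h", "ɛ", "l", "o", "ʊ", " ", "w", "ɜ", "r", "l", "d", "$"]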


def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
    """Convert phonemes to model input IDs"""
    phoneme_id_map = config.get("phoneme_id_map", {})
    ids = []
    for phoneme in phonemes:
        if phoneme in phoneme_id_map:
            ids.extend(phoneme_id_map[phoneme])
        else:
            # Use pad token for unknown phonemes
            ids.append(0)
    return ids


def synthesize_speech(
    text: str,
    model_name: str,
    speaker_id: int = 0,
    length_scale: float = 1.0,
    noise_scale: float = 0.667,
    noise_w: float = 0.8,
) -> tuple[int, np.ndarray]:
    """Generate speech from text using selected model"""
    if not text.strip():
        raise gr.Error("Please enter some text")
    if model_name not in MODELS:
        raise gr.Error("Invalid model selected")

    model_info = MODELS[model_name]
    config = load_model_config(model_info["config"])

    # Convert text to phoneme IDs
    phonemes = text_to_phonemes(text, model_info["language"])
    phoneme_ids = phonemes_to_ids(phonemes, config)
    if not phoneme_ids:
        raise gr.Error("Failed to convert text to phonemes")

    # Load ONNX model
    sess_options = onnxruntime.SessionOptions()
    sess_options.inter_op_num_threads = 1
    sess_options.intra_op_num_threads = 1
    try:
        model = onnxruntime.InferenceSession(
            model_info["path"],
            sess_options=sess_options,
            providers=["CPUExecutionProvider"],
        )
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise gr.Error(f"Failed to load model: {str(e)}") from e

    # Prepare inputs
    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)

    # Handle speaker ID for multi-speaker models
    sid = None
    if config.get("num_speakers", 1) > 1:
        sid = np.array([speaker_id], dtype=np.int64)

    # Run inference
    try:
        inputs = {
            "input": text_array,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        if sid is not None:
            inputs["sid"] = sid
        audio = model.run(None, inputs)[0]
        # Remove batch and channel dimensions
        audio = audio.squeeze()
        # Convert to int16
        audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
        sample_rate = config.get("audio", {}).get("sample_rate", 22050)
        return sample_rate, audio
    except Exception as e:
        logger.error(f"Inference failed: {e}")
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e


def create_interface():
    """Create Gradio interface"""
    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("""
        # 🎙️ Piper TTS Demo

        High-quality text-to-speech synthesis supporting Japanese and English.
        This demo uses ONNX models for fast CPU inference.
        """)
        with gr.Row():
            with gr.Column(scale=2):
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    label="Select Model",
                    value=list(MODELS.keys())[0],
                )
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                )
                # Advanced Settings without Accordion (flattened)
                gr.Markdown("### Advanced Settings")
                speaker_id = gr.Number(
                    label="Speaker ID (for multi-speaker models)",
                    value=0,
                    precision=0,
                    minimum=0,
                    maximum=10,
                )
                length_scale = gr.Slider(
                    label="Speed (Lower = faster speech)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                )
                noise_scale = gr.Slider(
                    label="Expressiveness",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.667,
                    step=0.01,
                )
                noise_w = gr.Slider(
                    label="Phoneme Duration Variance",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.8,
                    step=0.01,
                )
                synthesize_btn = gr.Button("Generate Speech", variant="primary")
            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    autoplay=True,
                )
                gr.Markdown("""
                ### Tips:
                - Japanese model expects hiragana/kanji text
                - English model works with standard text
                - Adjust speed for faster/slower speech
                - Higher expressiveness = more natural variation
                """)
        # Examples
        gr.Examples(
            examples=[
                ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
                [
                    "おはようございます。本日の会議は午後3時から始まります。",
                    "Japanese (Medium)",
                ],
                ["Hello world! This is a text to speech demo.", "English (Test)"],
                [
                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
                    "English (Test)",
                ],
            ],
            inputs=[text_input, model_dropdown],
        )
        # Event handlers
        synthesize_btn.click(
            fn=synthesize_speech,
            inputs=[
                text_input,
                model_dropdown,
                speaker_id,
                length_scale,
                noise_scale,
                noise_w,
            ],
            outputs=audio_output,
        )
    return interface


def create_minimal_interface():
    """Create a minimal fallback interface if main interface fails"""
    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("# 🎙️ Piper TTS Demo")
        text_input = gr.Textbox(
            label="Text to synthesize",
            placeholder="Enter text here...",
            lines=3,
        )
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value=list(MODELS.keys())[0],
        )
        synthesize_btn = gr.Button("Generate Speech", variant="primary")
        audio_output = gr.Audio(
            label="Generated Speech",
            type="numpy",
        )
        synthesize_btn.click(
            fn=lambda text, model: synthesize_speech(text, model, 0, 1.0, 0.667, 0.8),
            inputs=[text_input, model_dropdown],
            outputs=audio_output,
        )
    return interface
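
# Note: create_minimal_interface() is not wired up below. One possible pattern
# (a sketch, not part of the original wiring) would be to fall back to it if
# building the full UI fails:
#
#     try:
#         interface = create_interface()
#     except Exception:
#         interface = create_minimal_interface()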


# Create and launch the app
# Move interface creation inside main block to avoid context issues
interface = None

if __name__ == "__main__":
    # Create and launch interface
    interface = create_interface()
    # Launch with minimal configuration for Hugging Face Spaces
    interface.launch()