#!/usr/bin/env python3
"""
Piper TTS Gradio Demo for Hugging Face Spaces
Supports Japanese and English text-to-speech using ONNX models
"""
import json
import logging
import gradio as gr
import numpy as np
import onnxruntime
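# Availability flags for the optional phonemizer backends (espeak-ng and OpenJTalk)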
from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE
# Download models if not present
from download_models import download_models
# Ensure models are downloaded
download_models()
# Import optional dependencies
if PYOPENJTALK_AVAILABLE:
import pyopenjtalk
if ESPEAK_AVAILABLE:
from espeak_phonemizer import Phonemizer
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model configurations
MODELS = {
"Japanese (Medium)": {
"path": "models/ja_JP-test-medium.onnx",
"config": "models/ja_JP-test-medium.onnx.json",
"language": "ja",
},
"English (Test)": {
"path": "models/test_voice.onnx",
"config": "models/test_voice.onnx.json",
"language": "en",
},
}
# Basic English word to IPA mapping for common words
# This is a simplified fallback when espeak-ng is not available
ENGLISH_IPA_MAP = {
"hello": "hɛloʊ",
"world": "wɜrld",
"this": "ðɪs",
"is": "ɪz",
"a": "ə",
"test": "tɛst",
"text": "tɛkst",
"to": "tu",
"speech": "spitʃ",
"demo": "dɛmoʊ",
"welcome": "wɛlkəm",
"piper": "paɪpər",
"tts": "titiɛs",
"enjoy": "ɛndʒɔɪ",
"high": "haɪ",
"quality": "kwɑləti",
"synthesis": "sɪnθəsɪs",
"the": "ðə",
"and": "ænd",
"for": "fɔr",
"with": "wɪð",
"you": "ju",
"can": "kæn",
"it": "ɪt",
"that": "ðæt",
"have": "hæv",
"from": "frʌm",
"or": "ɔr",
"which": "wɪtʃ",
"one": "wʌn",
"would": "wʊd",
"all": "ɔl",
"will": "wɪl",
"there": "ðɛr",
"say": "seɪ",
"who": "hu",
"make": "meɪk",
"when": "wɛn",
"time": "taɪm",
"if": "ɪf",
"no": "noʊ",
"way": "weɪ",
"has": "hæz",
"yes": "jɛs",
"good": "gʊd",
"very": "vɛri",
}
# Japanese multi-character phoneme to Unicode PUA mapping
# This mapping must match the C++ implementation and training data
PHONEME_TO_PUA = {
# Long vowels
"a:": "\ue000",
"i:": "\ue001",
"u:": "\ue002",
"e:": "\ue003",
"o:": "\ue004",
# Special consonants
"cl": "\ue005", # Geminate/glottal stop
# Palatalized consonants
"ky": "\ue006",
"kw": "\ue007",
"gy": "\ue008",
"gw": "\ue009",
"ty": "\ue00a",
"dy": "\ue00b",
"py": "\ue00c",
"by": "\ue00d",
# Affricates and special sounds
"ch": "\ue00e",
"ts": "\ue00f",
"sh": "\ue010",
"zy": "\ue011",
"hy": "\ue012",
# Palatalized nasals/liquids
"ny": "\ue013",
"my": "\ue014",
"ry": "\ue015",
}
def load_model_config(config_path: str) -> dict:
"""Load model configuration from JSON file"""
with open(config_path, encoding="utf-8") as f:
return json.load(f)
def map_phonemes(phonemes: list[str]) -> list[str]:
"""Map multi-character phonemes to Unicode PUA characters"""
result = []
for phoneme in phonemes:
if phoneme in PHONEME_TO_PUA:
result.append(PHONEME_TO_PUA[phoneme])
else:
result.append(phoneme)
return result
def text_to_phonemes(text: str, language: str) -> list[str]:
"""Convert text to phoneme strings based on language"""
if language == "ja":
if PYOPENJTALK_AVAILABLE:
# Get phonemes from OpenJTalk
labels = pyopenjtalk.extract_fullcontext(text)
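# Each label is an HTS full-context string; the current phoneme sits between "-" and "+"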
phonemes = []
for label in labels:
# Extract phoneme from label
if "-" in label and "+" in label:
phoneme = label.split("-")[1].split("+")[0]
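# Skip silence (sil) and pause (pau) segments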
if phoneme not in ["sil", "pau"]:
phonemes.append(phoneme)
# Add Piper's sentence-start (^) and sentence-end ($) markers
phonemes = ["^"] + phonemes + ["$"]
# Convert multi-character phonemes to Unicode PUA
phonemes = map_phonemes(phonemes)
else:
logger.warning("pyopenjtalk not available, using fallback")
# Simple fallback - just use dummy phonemes
phonemes = ["^"] + list("aiueo") * 5 + ["$"]
elif ESPEAK_AVAILABLE:  # English text with espeak-ng available
phonemizer = Phonemizer("en-us")
phoneme_str = phonemizer.phonemize(text)
# Flatten the phoneme string into individual characters (spaces removed)
phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
else:
logger.warning("espeak_phonemizer not available, using IPA fallback")
# IPA-based fallback for better English pronunciation
words = text.lower().split()
phonemes = ["^"]
for i, word in enumerate(words):
# Add space between words
if i > 0:
phonemes.append(" ")
# Remove punctuation from word
clean_word = "".join(c for c in word if c.isalpha())
if clean_word in ENGLISH_IPA_MAP:
# Use IPA mapping if available
ipa = ENGLISH_IPA_MAP[clean_word]
phonemes.extend(list(ipa))
else:
# Fall back to character-by-character for unknown words
phonemes.extend(list(clean_word))
phonemes.append("$")
return phonemes
def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
"""Convert phonemes to model input IDs"""
phoneme_id_map = config.get("phoneme_id_map", {})
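# phoneme_id_map maps each phoneme string to a list of integer IDs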
ids = []
for phoneme in phonemes:
if phoneme in phoneme_id_map:
ids.extend(phoneme_id_map[phoneme])
else:
# Use pad token for unknown phonemes
ids.append(0)
return ids
def synthesize_speech(
text: str,
model_name: str,
speaker_id: int = 0,
length_scale: float = 1.0,
noise_scale: float = 0.667,
noise_w: float = 0.8,
) -> tuple[int, np.ndarray]:
"""Generate speech from text using selected model"""
if not text.strip():
raise gr.Error("Please enter some text")
if model_name not in MODELS:
raise gr.Error("Invalid model selected")
model_info = MODELS[model_name]
config = load_model_config(model_info["config"])
# Convert text to phoneme IDs
phonemes = text_to_phonemes(text, model_info["language"])
phoneme_ids = phonemes_to_ids(phonemes, config)
if not phoneme_ids:
raise gr.Error("Failed to convert text to phonemes")
# Load ONNX model
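# Restrict ONNX Runtime to single-threaded, CPU-only execution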
sess_options = onnxruntime.SessionOptions()
sess_options.inter_op_num_threads = 1
sess_options.intra_op_num_threads = 1
try:
model = onnxruntime.InferenceSession(
model_info["path"],
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
except Exception as e:
logger.error(f"Failed to load model: {e}")
raise gr.Error(f"Failed to load model: {str(e)}") from e
# Prepare inputs
text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
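# Piper models expect the scales input in the order [noise_scale, length_scale, noise_w]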
scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)
# Handle speaker ID for multi-speaker models
sid = None
if config.get("num_speakers", 1) > 1:
sid = np.array([speaker_id], dtype=np.int64)
# Run inference
try:
inputs = {
"input": text_array,
"input_lengths": text_lengths,
"scales": scales,
}
if sid is not None:
inputs["sid"] = sid
audio = model.run(None, inputs)[0]
# Remove batch and channel dimensions
audio = audio.squeeze()
# Convert to int16
audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
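# Fall back to 22050 Hz if the config does not specify a sample rate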
sample_rate = config.get("audio", {}).get("sample_rate", 22050)
return sample_rate, audio
except Exception as e:
logger.error(f"Inference failed: {e}")
raise gr.Error(f"Failed to generate speech: {str(e)}") from e
def create_interface():
"""Create Gradio interface"""
with gr.Blocks(title="Piper TTS Demo") as interface:
gr.Markdown("""
# 🎙️ Piper TTS Demo
High-quality text-to-speech synthesis supporting Japanese and English.
This demo uses ONNX models for fast CPU inference.
""")
with gr.Row():
with gr.Column(scale=2):
model_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
label="Select Model",
value=list(MODELS.keys())[0],
)
text_input = gr.Textbox(
label="Text to synthesize",
placeholder="Enter text here...",
lines=3,
)
# Advanced settings, rendered inline rather than in an accordion
gr.Markdown("### Advanced Settings")
speaker_id = gr.Number(
label="Speaker ID (for multi-speaker models)",
value=0,
precision=0,
minimum=0,
maximum=10,
)
length_scale = gr.Slider(
label="Speed (Lower = faster speech)",
minimum=0.5,
maximum=2.0,
value=1.0,
step=0.1,
)
noise_scale = gr.Slider(
label="Expressiveness",
minimum=0.0,
maximum=1.0,
value=0.667,
step=0.01,
)
noise_w = gr.Slider(
label="Phoneme Duration Variance",
minimum=0.0,
maximum=1.0,
value=0.8,
step=0.01,
)
synthesize_btn = gr.Button("Generate Speech", variant="primary")
with gr.Column(scale=2):
audio_output = gr.Audio(
label="Generated Speech",
type="numpy",
autoplay=True,
)
gr.Markdown("""
### Tips:
- Japanese model expects hiragana/kanji text
- English model works with standard text
- Adjust speed for faster/slower speech
- Higher expressiveness = more natural variation
""")
# Examples
gr.Examples(
examples=[
["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
[
"おはようございます。本日の会議は午後3時から始まります。",
"Japanese (Medium)",
],
["Hello world! This is a text to speech demo.", "English (Test)"],
[
"Welcome to Piper TTS. Enjoy high quality speech synthesis.",
"English (Test)",
],
],
inputs=[text_input, model_dropdown],
)
# Event handlers
synthesize_btn.click(
fn=synthesize_speech,
inputs=[
text_input,
model_dropdown,
speaker_id,
length_scale,
noise_scale,
noise_w,
],
outputs=audio_output,
)
return interface
def create_minimal_interface():
"""Create a minimal fallback interface if main interface fails"""
with gr.Blocks(title="Piper TTS Demo") as interface:
gr.Markdown("# 🎙️ Piper TTS Demo")
text_input = gr.Textbox(
label="Text to synthesize",
placeholder="Enter text here...",
lines=3,
)
model_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
label="Select Model",
value=list(MODELS.keys())[0],
)
synthesize_btn = gr.Button("Generate Speech", variant="primary")
audio_output = gr.Audio(
label="Generated Speech",
type="numpy",
)
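# Use default synthesis parameters (speaker 0, default scales) in the minimal interface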
synthesize_btn.click(
fn=lambda text, model: synthesize_speech(text, model, 0, 1.0, 0.667, 0.8),
inputs=[text_input, model_dropdown],
outputs=audio_output,
)
return interface
# Create and launch the app
# Interface creation happens inside the main block to avoid Gradio context issues
interface = None
if __name__ == "__main__":
# Create and launch interface
interface = create_interface()
# Launch with minimal configuration for Hugging Face Spaces
interface.launch()