#!/usr/bin/env python3
"""
Piper TTS Gradio Demo for Hugging Face Spaces

Supports Japanese and English text-to-speech using ONNX models
"""

import json
import logging

import gradio as gr
import numpy as np
import onnxruntime

from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE

# Download models if not present
from download_models import download_models

# Ensure models are downloaded
download_models()

# Import optional dependencies
if PYOPENJTALK_AVAILABLE:
    import pyopenjtalk

if ESPEAK_AVAILABLE:
    from espeak_phonemizer import Phonemizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model configurations
MODELS = {
    "Japanese (Medium)": {
        "path": "models/ja_JP-test-medium.onnx",
        "config": "models/ja_JP-test-medium.onnx.json",
        "language": "ja",
    },
    "English (Test)": {
        "path": "models/test_voice.onnx",
        "config": "models/test_voice.onnx.json",
        "language": "en",
    },
}

# Basic English word to IPA mapping for common words
# This is a simplified fallback when espeak-ng is not available
ENGLISH_IPA_MAP = {
    "hello": "hɛloʊ",
    "world": "wɜrld",
    "this": "ðɪs",
    "is": "ɪz",
    "a": "ə",
    "test": "tɛst",
    "text": "tɛkst",
    "to": "tu",
    "speech": "spitʃ",
    "demo": "dɛmoʊ",
    "welcome": "wɛlkəm",
    "piper": "paɪpər",
    "tts": "titiɛs",
    "enjoy": "ɛndʒɔɪ",
    "high": "haɪ",
    "quality": "kwɑləti",
    "synthesis": "sɪnθəsɪs",
    "the": "ðə",
    "and": "ænd",
    "for": "fɔr",
    "with": "wɪð",
    "you": "ju",
    "can": "kæn",
    "it": "ɪt",
    "that": "ðæt",
    "have": "hæv",
    "from": "frʌm",
    "or": "ɔr",
    "which": "wɪtʃ",
    "one": "wʌn",
    "would": "wʊd",
    "all": "ɔl",
    "will": "wɪl",
    "there": "ðɛr",
    "say": "seɪ",
    "who": "hu",
    "make": "meɪk",
    "when": "wɛn",
    "time": "taɪm",
    "if": "ɪf",
    "no": "noʊ",
    "way": "weɪ",
    "has": "hæz",
    "yes": "jɛs",
    "good": "gʊd",
    "very": "vɛri",
}
# Japanese multi-character phoneme to Unicode PUA mapping
# This mapping must match the C++ implementation and training data
PHONEME_TO_PUA = {
    # Long vowels
    "a:": "\ue000",
    "i:": "\ue001",
    "u:": "\ue002",
    "e:": "\ue003",
    "o:": "\ue004",
    # Special consonants
    "cl": "\ue005",  # Geminate/glottal stop
    # Palatalized consonants
    "ky": "\ue006",
    "kw": "\ue007",
    "gy": "\ue008",
    "gw": "\ue009",
    "ty": "\ue00a",
    "dy": "\ue00b",
    "py": "\ue00c",
    "by": "\ue00d",
    # Affricates and special sounds
    "ch": "\ue00e",
    "ts": "\ue00f",
    "sh": "\ue010",
    "zy": "\ue011",
    "hy": "\ue012",
    # Palatalized nasals/liquids
    "ny": "\ue013",
    "my": "\ue014",
    "ry": "\ue015",
}


def load_model_config(config_path: str) -> dict:
    """Load model configuration from JSON file"""
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)
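
# For reference, the fields this script reads from a Piper-style *.onnx.json
# config look roughly like the sketch below (illustrative, not the full schema;
# in Piper configs "^"/"$" are typically the BOS/EOS symbols and 0 the pad ID):
#
#   {
#     "audio": {"sample_rate": 22050},
#     "num_speakers": 1,
#     "phoneme_id_map": {"^": [1], "$": [2], ...}
#   }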


def map_phonemes(phonemes: list[str]) -> list[str]:
    """Map multi-character phonemes to Unicode PUA characters"""
    result = []
    for phoneme in phonemes:
        if phoneme in PHONEME_TO_PUA:
            result.append(PHONEME_TO_PUA[phoneme])
        else:
            result.append(phoneme)
    return result
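
# Illustrative example of the mapping above (values taken from PHONEME_TO_PUA;
# single-character phonemes and the sentence markers pass through unchanged):
#   map_phonemes(["^", "ky", "o:", "$"]) -> ["^", "\ue006", "\ue004", "$"]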


def text_to_phonemes(text: str, language: str) -> list[str]:
    """Convert text to phoneme strings based on language"""
    if language == "ja":
        if PYOPENJTALK_AVAILABLE:
            # Get full-context labels from OpenJTalk
            labels = pyopenjtalk.extract_fullcontext(text)
            phonemes = []
            for label in labels:
                # Extract the current phoneme, which sits between "-" and "+"
                # in the full-context label
                if "-" in label and "+" in label:
                    phoneme = label.split("-")[1].split("+")[0]
                    if phoneme not in ["sil", "pau"]:
                        phonemes.append(phoneme)
            # Add sentence markers
            phonemes = ["^"] + phonemes + ["$"]
            # Convert multi-character phonemes to Unicode PUA
            phonemes = map_phonemes(phonemes)
        else:
            logger.warning("pyopenjtalk not available, using fallback")
            # Simple fallback - just use dummy phonemes
            phonemes = ["^"] + list("aiueo") * 5 + ["$"]
    elif ESPEAK_AVAILABLE:  # English
        phonemizer = Phonemizer("en-us")
        phoneme_str = phonemizer.phonemize(text)
        # Convert phoneme string to list
        phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
    else:
        logger.warning("espeak_phonemizer not available, using IPA fallback")
        # IPA-based fallback for better English pronunciation
        words = text.lower().split()
        phonemes = ["^"]
        for i, word in enumerate(words):
            # Add space between words
            if i > 0:
                phonemes.append(" ")
            # Remove punctuation from word
            clean_word = "".join(c for c in word if c.isalpha())
            if clean_word in ENGLISH_IPA_MAP:
                # Use IPA mapping if available
                ipa = ENGLISH_IPA_MAP[clean_word]
                phonemes.extend(list(ipa))
            else:
                # Fall back to character-by-character for unknown words
                phonemes.extend(list(clean_word))
        phonemes.append("$")
    return phonemes
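
# Illustrative output of the IPA fallback path (espeak_phonemizer unavailable),
# derived from ENGLISH_IPA_MAP above:
#   text_to_phonemes("hello world", "en")
#   -> ["^", "h", "ɛ", "l", "o", "ʊ", " ", "w", "ɜ", "r", "l", "d", "$"]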


def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
    """Convert phonemes to model input IDs"""
    phoneme_id_map = config.get("phoneme_id_map", {})
    ids = []
    for phoneme in phonemes:
        if phoneme in phoneme_id_map:
            ids.extend(phoneme_id_map[phoneme])
        else:
            # Use pad token for unknown phonemes
            ids.append(0)
    return ids


def synthesize_speech(
    text: str,
    model_name: str,
    speaker_id: int = 0,
    length_scale: float = 1.0,
    noise_scale: float = 0.667,
    noise_w: float = 0.8,
) -> tuple[int, np.ndarray]:
    """Generate speech from text using selected model"""
    if not text.strip():
        raise gr.Error("Please enter some text")
    if model_name not in MODELS:
        raise gr.Error("Invalid model selected")

    model_info = MODELS[model_name]
    config = load_model_config(model_info["config"])

    # Convert text to phoneme IDs
    phonemes = text_to_phonemes(text, model_info["language"])
    phoneme_ids = phonemes_to_ids(phonemes, config)
    if not phoneme_ids:
        raise gr.Error("Failed to convert text to phonemes")

    # Load ONNX model
    sess_options = onnxruntime.SessionOptions()
    sess_options.inter_op_num_threads = 1
    sess_options.intra_op_num_threads = 1
    try:
        model = onnxruntime.InferenceSession(
            model_info["path"],
            sess_options=sess_options,
            providers=["CPUExecutionProvider"],
        )
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise gr.Error(f"Failed to load model: {str(e)}") from e

    # Prepare inputs
    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)

    # Handle speaker ID for multi-speaker models
    sid = None
    if config.get("num_speakers", 1) > 1:
        sid = np.array([speaker_id], dtype=np.int64)

    # Run inference
    try:
        inputs = {
            "input": text_array,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        if sid is not None:
            inputs["sid"] = sid
        audio = model.run(None, inputs)[0]
        # Remove batch and channel dimensions
        audio = audio.squeeze()
        # Convert to int16
        audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
        sample_rate = config.get("audio", {}).get("sample_rate", 22050)
        return sample_rate, audio
    except Exception as e:
        logger.error(f"Inference failed: {e}")
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e


def create_interface():
    """Create Gradio interface"""
    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("""
        # 🎙️ Piper TTS Demo

        High-quality text-to-speech synthesis supporting Japanese and English.
        This demo uses ONNX models for fast CPU inference.
        """)
        with gr.Row():
            with gr.Column(scale=2):
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    label="Select Model",
                    value=list(MODELS.keys())[0],
                )
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                )
                # Advanced Settings without Accordion (flattened)
                gr.Markdown("### Advanced Settings")
                speaker_id = gr.Number(
                    label="Speaker ID (for multi-speaker models)",
                    value=0,
                    precision=0,
                    minimum=0,
                    maximum=10,
                )
                length_scale = gr.Slider(
                    label="Speed (Lower = faster speech)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                )
                noise_scale = gr.Slider(
                    label="Expressiveness",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.667,
                    step=0.01,
                )
                noise_w = gr.Slider(
                    label="Phoneme Duration Variance",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.8,
                    step=0.01,
                )
                synthesize_btn = gr.Button("Generate Speech", variant="primary")
            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    autoplay=True,
                )
                gr.Markdown("""
                ### Tips:
                - Japanese model expects hiragana/kanji text
                - English model works with standard text
                - Adjust speed for faster/slower speech
                - Higher expressiveness = more natural variation
                """)
        # Examples
        gr.Examples(
            examples=[
                ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
                [
                    "おはようございます。本日の会議は午後3時から始まります。",
                    "Japanese (Medium)",
                ],
                ["Hello world! This is a text to speech demo.", "English (Test)"],
                [
                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
                    "English (Test)",
                ],
            ],
            inputs=[text_input, model_dropdown],
        )
        # Event handlers
        synthesize_btn.click(
            fn=synthesize_speech,
            inputs=[
                text_input,
                model_dropdown,
                speaker_id,
                length_scale,
                noise_scale,
                noise_w,
            ],
            outputs=audio_output,
        )
    return interface


def create_minimal_interface():
    """Create a minimal fallback interface if main interface fails"""
    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("# 🎙️ Piper TTS Demo")
        text_input = gr.Textbox(
            label="Text to synthesize",
            placeholder="Enter text here...",
            lines=3,
        )
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value=list(MODELS.keys())[0],
        )
        synthesize_btn = gr.Button("Generate Speech", variant="primary")
        audio_output = gr.Audio(
            label="Generated Speech",
            type="numpy",
        )
        synthesize_btn.click(
            fn=lambda text, model: synthesize_speech(text, model, 0, 1.0, 0.667, 0.8),
            inputs=[text_input, model_dropdown],
            outputs=audio_output,
        )
    return interface
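
# Note: create_minimal_interface() is not wired up below. One possible pattern
# (a sketch, not part of the original wiring) would be to fall back to it if
# building the full UI fails:
#
#     try:
#         interface = create_interface()
#     except Exception:
#         interface = create_minimal_interface()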


# Create and launch the app
# Move interface creation inside main block to avoid context issues
interface = None

if __name__ == "__main__":
    # Create and launch interface
    interface = create_interface()
    # Launch with minimal configuration for Hugging Face Spaces
    interface.launch()