# Tttt / app.py
# Athspi's picture
# Update app.py
# e567eaf verified
import os
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer
import onnxruntime
import scipy.io.wavfile
from huggingface_hub import hf_hub_download
# Hugging Face Hub repository that provides both the tokenizer files and the
# exported ONNX model used below.
repo_id = "Athspi/Gg"

# Sampling rate reported alongside every generated waveform
# (change this if your model produces audio at a different rate).
sampling_rate = 16000

# Fetch the ONNX model file from the repository; the returned value is the
# path that is handed to ONNX Runtime further down.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng.onnx")

# The tokenizer comes from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Create the inference session once at import time, restricted to the CPU
# execution provider.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: The sampling rate (int) and the synthesized
        audio waveform as a float32 array with singleton dimensions removed.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front so the user gets a readable message in the
    # UI rather than audio synthesized from a prompt that carries no text.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Ask the tokenizer for NumPy arrays directly: ONNX Runtime is fed NumPy
    # arrays, so a round trip through torch tensors is unnecessary.
    inputs = tokenizer(text, return_tensors="np")

    # Cast explicitly to int64, mirroring the torch.long cast that was applied
    # before, so the dtype does not depend on the platform's default integer.
    input_ids = np.asarray(inputs["input_ids"], dtype=np.int64)

    # Passing None as the output list requests every model output; the first
    # one is used as the waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})

    # One call covers both "make it an ndarray" and "make it float32".
    waveform = np.asarray(onnx_outputs[0], dtype=np.float32)

    # Drop singleton dimensions before handing the audio to Gradio.
    waveform = np.squeeze(waveform)

    return (sampling_rate, waveform)
# Assemble the Gradio UI: a two-line text box as input, an audio player as
# output, with tts_inference doing the work in between.
iface = gr.Interface(
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
    fn=tts_inference,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.Audio(type="numpy", label="Generated Speech"),
    # Each example is a one-element row because the interface has one input.
    examples=[
        [sentence]
        for sentence in (
            "Hello, this is an example of text-to-speech.",
            "This model uses ONNX Runtime for fast inference.",
            "You can try your own sentences here.",
        )
    ],
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()