import gradio as gr
import numpy as np
import onnxruntime
import scipy.io.wavfile
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Define the Hugging Face repository/model ID.
repo_id = "Athspi/Gg"

# Download the ONNX model file from the repository.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng.onnx")

# Load the tokenizer from the repository.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Initialize the ONNX Runtime session for CPU inference.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=['CPUExecutionProvider']
)
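
# Optional sanity check (a sketch, not required for the app): print the graph's
# declared input/output names and shapes so the feed dict used in tts_inference
# below can be verified against the actual ONNX export.
for model_input in ort_session.get_inputs():
    print(f"ONNX model input:  {model_input.name} {model_input.shape}")
for model_output in ort_session.get_outputs():
    print(f"ONNX model output: {model_output.name} {model_output.shape}")
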
# Define the fixed sampling rate (adjust if your model uses a different rate)
sampling_rate = 16000

def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: The sampling rate (int) and the synthesized
        audio waveform (np.ndarray in float32 format).
    """
    # Tokenize the input text.
    inputs = tokenizer(text, return_tensors="pt")

    # Prepare inputs for the ONNX model: a 2-D int64 array of token IDs.
    input_ids = inputs.input_ids.cpu().to(torch.long).numpy()
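    # Note (assumption, not in the original code): the torch dependency could be
    # dropped by using tokenizer(text, return_tensors="np") and then
    # inputs.input_ids.astype(np.int64), which should produce an equivalent feed.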

    # Run inference; the first output is the synthesized waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})
    waveform = onnx_outputs[0]

    # Ensure the output is a NumPy array.
    if not isinstance(waveform, np.ndarray):
        waveform = np.array(waveform)

    # Convert the waveform to float32 (required by Gradio's Audio component).
    waveform = waveform.astype(np.float32)

    # Remove batch/channel dimensions of size one.
    waveform = np.squeeze(waveform)

    return (sampling_rate, waveform)
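

# Optional helper (a sketch, not used by the Gradio app): save a synthesized
# waveform to disk as a 32-bit float WAV file, which gives the otherwise-unused
# scipy.io.wavfile import a purpose. The name save_wav is hypothetical.
def save_wav(text: str, path: str = "output.wav") -> None:
    rate, waveform = tts_inference(text)
    scipy.io.wavfile.write(path, rate, waveform)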


# Build the Gradio interface.
iface = gr.Interface(
    fn=tts_inference,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.Audio(type="numpy", label="Generated Speech"),
    title="ONNX TTS Demo",
    description="Text-to-speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
    examples=[
        ["Hello, this is an example of text-to-speech."],
        ["This model uses ONNX Runtime for fast inference."],
        ["You can try your own sentences here."],
    ],
)

if __name__ == "__main__":
    iface.launch()