import gradio as gr
import numpy as np
import onnxruntime
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer
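
# Hugging Face Hub repo that hosts both the exported ONNX model and the tokenizer files.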
repo_id = "Athspi/Gg"
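
# Download the ONNX model from the Hub; the file is cached locally after the first call.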
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng.onnx")
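
# Load the tokenizer that maps raw text to the input IDs the model expects.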
tokenizer = AutoTokenizer.from_pretrained(repo_id)
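
# Create an ONNX Runtime inference session pinned to the CPU execution provider.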
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=["CPUExecutionProvider"]
)
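
# MMS-TTS models generate speech at a fixed 16 kHz sampling rate.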
sampling_rate = 16000


def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: The sampling rate (int) and the synthesized
            audio waveform (np.ndarray, float32).
    """
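
    # Tokenize the input text into PyTorch tensors.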
    inputs = tokenizer(text, return_tensors="pt")
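
    # ONNX Runtime consumes NumPy arrays, so convert the input IDs to int64.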
    input_ids = inputs.input_ids.to(torch.long).numpy()
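
    # Run the model; the first output is the synthesized waveform.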
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})
    waveform = onnx_outputs[0]
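
    # Normalize the output to a 1-D float32 NumPy array, the format
    # gr.Audio expects when type="numpy".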
    if not isinstance(waveform, np.ndarray):
        waveform = np.array(waveform)
    waveform = waveform.astype(np.float32)
    waveform = np.squeeze(waveform)

    return (sampling_rate, waveform)
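

# Gradio UI: a text box in, a playable audio clip out.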
iface = gr.Interface(
    fn=tts_inference,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.Audio(type="numpy", label="Generated Speech"),
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
    examples=[
        ["Hello, this is an example of text-to-speech."],
        ["This model uses ONNX Runtime for fast inference."],
        ["You can try your own sentences here."],
    ],
)
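
# Launch the web app when the script is run directly.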
if __name__ == "__main__":
    iface.launch()