File size: 2,415 Bytes
6417dc9
c21b225
6417dc9
 
4368215
 
 
1f4b6af
c21b225
1f4b6af
 
4368215
1f4b6af
 
4368215
1f4b6af
 
4368215
6417dc9
 
 
 
c21b225
6417dc9
 
c21b225
6417dc9
 
 
 
 
 
 
 
e567eaf
 
6417dc9
 
 
 
 
 
 
 
 
 
 
6c54982
 
 
 
 
b6b39ee
6c54982
 
6417dc9
 
e567eaf
 
c21b225
6c54982
4368215
6417dc9
 
e567eaf
6417dc9
e567eaf
 
 
 
 
 
c21b225
 
 
4368215
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer
import onnxruntime
import scipy.io.wavfile
from huggingface_hub import hf_hub_download

# Sample rate (Hz) reported with every synthesized waveform. Change this if
# the exported model produces audio at a different rate.
sampling_rate = 16000

# Hugging Face Hub repository that provides both the ONNX export and the
# tokenizer files.
repo_id = "Athspi/Gg"

# Fetch the ONNX graph from the Hub and keep the local path to it.
onnx_model_path = hf_hub_download(filename="mms_tts_eng.onnx", repo_id=repo_id)

# The tokenizer is loaded from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# One shared ONNX Runtime session, restricted to the CPU execution provider.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: A tuple containing the sampling rate (int) and the synthesized
                                 audio waveform (np.ndarray in float32 format, with all
                                 size-1 dimensions removed).

    Raises:
        gr.Error: If ``text`` is empty, whitespace-only, or None, so the UI shows a
                  readable message instead of a tokenizer/runtime failure.
    """
    # Reject blank input up front: there is nothing to synthesize, and passing
    # it on would only surface as an opaque downstream error (or junk audio).
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Tokenize straight to NumPy; ONNX Runtime consumes NumPy arrays, so there
    # is no need to go through torch tensors first.
    encoded = tokenizer(text, return_tensors="np")

    # Force int64 explicitly: NumPy's default integer width is platform
    # dependent, and the original code fed the model torch.long (int64) ids.
    input_ids = np.asarray(encoded["input_ids"], dtype=np.int64)

    # Run inference; the first model output is taken as the waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})

    # Coerce to a float32 ndarray (the format handed to Gradio's Audio
    # component) and drop any size-1 dimensions.
    waveform = np.squeeze(np.asarray(onnx_outputs[0], dtype=np.float32))

    return (sampling_rate, waveform)


# Assemble the Gradio UI: a text box feeds tts_inference and the result is
# rendered by an audio component.
_text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
_audio_output = gr.Audio(type="numpy", label="Generated Speech")

# Sample prompts offered in the UI (one single-field row per example).
_example_prompts = [
    ["Hello, this is an example of text-to-speech."],
    ["This model uses ONNX Runtime for fast inference."],
    ["You can try your own sentences here."],
]

iface = gr.Interface(
    fn=tts_inference,
    inputs=_text_input,
    outputs=_audio_output,
    examples=_example_prompts,
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    iface.launch()