# truyen-ngontinh / app.py
# NOTE(review): lines below were Hugging Face page chrome ("tu / update 16-4 /
# b23490f / raw / history blame / 2.31 kB") accidentally scraped into the
# source; kept here as a comment so the file is valid Python.
import argparse
import gradio as gr
from gradio import components
import os
import torch
import commons
import utils
import numpy as np
from text import text_to_sequence
from scipy.io.wavfile import write
from preprocess import preprocess
import onnxruntime
def get_text(texts, hps):
    """Convert a comma-separated string into a list of token-id tensors.

    Each comma-separated segment is preprocessed, split into chunks of at
    most ``chunk_len`` words (bounding per-inference sequence length), and
    each chunk is converted to a tensor of symbol ids.

    Args:
        texts: Raw input string; segments are separated by commas.
        hps: Hyperparameter namespace; reads ``hps.data.text_cleaners`` and
            ``hps.data.add_blank``.

    Returns:
        list[torch.LongTensor]: one tensor of token ids per chunk.
    """
    chunk_len = 30  # max words per chunk
    text_norm_list = []
    for text in texts.split(","):
        # Hoisted: the original called text.split() twice per loop iteration;
        # split once per segment and chunk in a single pass.
        words = preprocess(text).split()
        for i in range(0, len(words), chunk_len):
            chunk = " ".join(words[i:i + chunk_len])
            text_norm = text_to_sequence(chunk, hps.data.text_cleaners)
            if hps.data.add_blank:
                # Interleave blank (0) tokens between symbols — required when
                # the model was trained with add_blank=True.
                text_norm = commons.intersperse(text_norm, 0)
            text_norm_list.append(torch.LongTensor(text_norm))
    return text_norm_list
def tts(text):
    """Synthesize speech for *text* with the ONNX VITS model.

    Loads the ONNX session and hyperparameters, runs inference chunk by
    chunk (see get_text), concatenates the audio, and writes it to disk.

    Args:
        text: Input text; comma-separated segments are synthesized per chunk.

    Returns:
        str: Path of the generated WAV file ("output.wav").
    """
    model_path = "model.onnx"
    config_path = "config.json"
    sid = 6  # fixed speaker id baked into this demo
    output_wav_path = "output.wav"

    sess_options = onnxruntime.SessionOptions()
    model = onnxruntime.InferenceSession(
        str(model_path),
        sess_options=sess_options,
        providers=["CPUExecutionProvider"],
    )
    hps = utils.get_hparams_from_file(config_path)

    # BUG FIX: build the speaker-id array ONCE, outside the loop. The
    # original re-ran `sid = np.array([int(sid)])` inside the loop, so from
    # the second chunk onward `sid` was already an ndarray and `int(sid)`
    # relied on deprecated 1-element-array-to-scalar conversion (an error on
    # NumPy >= 1.25) — multi-chunk input crashed.
    sid_arr = np.array([int(sid)], dtype=np.int64) if sid is not None else None

    # Loop-invariant inference scales; presumably
    # [noise_scale, length_scale, noise_scale_w] per the usual VITS ONNX
    # export — TODO confirm against the export script.
    scales = np.array([0.667, 1.1, 0.85], dtype=np.float32)

    audios = []
    for stn_tst in get_text(text, hps):
        # Use a distinct name instead of shadowing the `text` parameter.
        text_ids = np.expand_dims(np.array(stn_tst, dtype=np.int64), 0)
        text_lengths = np.array([text_ids.shape[1]], dtype=np.int64)
        audio = model.run(
            None,
            {
                "input": text_ids,
                "input_lengths": text_lengths,
                "scales": scales,
                "sid": sid_arr,
            },
        )[0].squeeze((0, 1))  # drop the (batch, channel) axes
        audios.append(audio)

    audio_out = np.concatenate(audios, axis=0)
    write(data=audio_out, rate=hps.data.sampling_rate, filename=output_wav_path)
    return output_wav_path
if __name__ == "__main__":
    # Expose the TTS function through a minimal Gradio web UI on port 7860.
    demo = gr.Interface(
        fn=tts,
        inputs=[components.Textbox(label="Text Input")],
        outputs=components.Audio(type='filepath', label="Generated Speech"),
        live=False,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)