import gradio as gr
from gradio import components
import numpy as np
import onnxruntime
import torch
from scipy.io.wavfile import write

import commons
import utils
from preprocess import preprocess
from text import text_to_sequence


def get_text(texts, hps):
    """Split the input on commas, chunk long sentences, and convert each chunk to a token-id tensor."""
    text_norm_list = []
    for text in texts.split(","):
        text = preprocess(text)
        # Break long inputs into chunks of at most 30 words so each model call stays short.
        chunk_strings = []
        chunk_len = 30
        words = text.split()
        for i in range(0, len(words), chunk_len):
            chunk_strings.append(" ".join(words[i:i + chunk_len]))
        for chunk_string in chunk_strings:
            text_norm = text_to_sequence(chunk_string, hps.data.text_cleaners)
            if hps.data.add_blank:
                # Interleave blank tokens between symbols, as expected when the model was trained with add_blank.
                text_norm = commons.intersperse(text_norm, 0)
            text_norm_list.append(torch.LongTensor(text_norm))
    return text_norm_list


def tts(text):
    model_path = "model.onnx"
    config_path = "config.json"
    sid = 4  # speaker id for multi-speaker models
    output_wav_path = "output.wav"

    sess_options = onnxruntime.SessionOptions()
    model = onnxruntime.InferenceSession(
        str(model_path), sess_options=sess_options, providers=["CPUExecutionProvider"]
    )

    hps = utils.get_hparams_from_file(config_path)

    audios = []
    stn_tst_list = get_text(text, hps)
    for stn_tst in stn_tst_list:
        # Prepare ONNX inputs: token ids, their length, and the noise/length/noise_w scales.
        input_ids = np.expand_dims(np.array(stn_tst, dtype=np.int64), 0)
        input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
        scales = np.array([0.667, 1.1, 0.85], dtype=np.float32)
        speaker_id = np.array([int(sid)], dtype=np.int64) if sid is not None else None

        audio = model.run(
            None,
            {
                "input": input_ids,
                "input_lengths": input_lengths,
                "scales": scales,
                "sid": speaker_id,
            },
        )[0].squeeze((0, 1))
        audios.append(audio)

    # Concatenate the per-chunk waveforms and write a single WAV file.
    audios = np.concatenate(audios, axis=0)
    write(filename=output_wav_path, rate=hps.data.sampling_rate, data=audios)
    return output_wav_path


if __name__ == "__main__":
    gr.Interface(
        fn=tts,
        inputs=[components.Textbox(label="Text Input")],
        outputs=components.Audio(type="filepath", label="Generated Speech"),
        live=False,
    ).launch(server_name="0.0.0.0", server_port=7860)
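
# Usage note (illustrative): besides serving the Gradio UI above, the synthesis
# function can be called directly, assuming "model.onnx" and "config.json"
# sit next to this script:
#
#   wav_path = tts("hello world, this is a test")
#   print(wav_path)  # -> "output.wav"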