import re

import gradio as gr
from gradio import components
import numpy as np
import onnxruntime
import torch
from scipy.io.wavfile import write

import commons
import utils
from preprocess import preprocess
from text import text_to_sequence
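# Gradio front-end for ONNX text-to-speech inference. The script expects
# "model.onnx" and "config.json" next to it and writes the synthesized audio
# to "output.wav".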

def get_text(texts, hps):
    """Split the input into sentences, then into word chunks, and convert each
    chunk into a tensor of symbol IDs."""
    text_norm_list = []
    # Normalize curly quotes and ellipses before sentence splitting.
    texts = texts.replace("“", "").replace("”", "")
    texts = texts.replace("…", ".")
    # Split on sentence-ending punctuation and newlines.
    texts = re.split(r'[!:;.\n]', texts)
    sentences = [t.strip() for t in texts if t.strip()]

    # Each sentence is followed by a standalone "." entry, which is synthesized
    # separately (re.split strips the original delimiter).
    texts = []
    for t in sentences:
        texts.append(t)
        texts.append(".")

    for text in texts:
        text = preprocess(text)
        print(text)
        # Cap each model input at 30 words so no single sequence gets too long.
        words = text.split()
        chunk_len = 30
        for i in range(0, len(words), chunk_len):
            chunk_string = " ".join(words[i:i + chunk_len])
            text_norm = text_to_sequence(chunk_string, hps.data.text_cleaners)
            if hps.data.add_blank:
                # Interleave blank (0) tokens, as expected by models trained
                # with add_blank=True.
                text_norm = commons.intersperse(text_norm, 0)
            text_norm_list.append(torch.LongTensor(text_norm))
    return text_norm_list

def tts(text):
    """Run ONNX inference for the given text and return the path to the
    generated WAV file."""
    model_path = "model.onnx"
    config_path = "config.json"
    sid = 9  # speaker ID of the voice to use
    output_wav_path = "output.wav"

    sess_options = onnxruntime.SessionOptions()
    model = onnxruntime.InferenceSession(
        str(model_path), sess_options=sess_options, providers=["CPUExecutionProvider"]
    )
    hps = utils.get_hparams_from_file(config_path)

    # The fixed inference scales and the speaker ID are constant, so build the
    # arrays once outside the loop.
    scales = np.array([0.667, 1.05, 0.8], dtype=np.float32)
    sid = np.array([int(sid)], dtype=np.int64) if sid is not None else None

    audios = []
    for stn_tst in get_text(text, hps):
        # Add a batch dimension: shape (1, sequence_length).
        phoneme_ids = np.expand_dims(np.array(stn_tst, dtype=np.int64), 0)
        phoneme_lengths = np.array([phoneme_ids.shape[1]], dtype=np.int64)

        audio = model.run(
            None,
            {
                "input": phoneme_ids,
                "input_lengths": phoneme_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audios.append(audio)

    # Join the per-chunk waveforms and write a single WAV file.
    audios = np.concatenate(audios, axis=0)
    write(filename=output_wav_path, rate=hps.data.sampling_rate, data=audios)
    return output_wav_path

if __name__ == "__main__":
    # Launch the Gradio UI on all interfaces at port 7860.
    gr.Interface(
        fn=tts,
        inputs=[components.Textbox(label="Text Input")],
        outputs=components.Audio(type="filepath", label="Generated Speech"),
        live=False,
    ).launch(server_name="0.0.0.0", server_port=7860)
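
# Usage sketch (assumption: the gradio_client package is installed and the app
# above is running locally; "/predict" is the default endpoint name for a
# single-function gr.Interface and may differ across Gradio versions):
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860/")
#   wav_path = client.predict("Hello world.", api_name="/predict")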