Spaces:
Running
Running
File size: 2,723 Bytes
6f6918a c17bcc7 6f6918a c17bcc7 6f6918a c17bcc7 6f6918a ec47503 c17bcc7 6f6918a ca36ae3 6f6918a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import argparse
import gradio as gr
from gradio import components
import os
import torch
import commons
import utils
import numpy as np
from text import text_to_sequence
from scipy.io.wavfile import write
from preprocess import preprocess
import onnxruntime
import re
def get_text(texts, hps):
    """Normalize raw input text into a list of symbol-ID tensors.

    Splits the input on sentence-like delimiters, interleaves a
    standalone "." after each sentence (kept from the original code;
    presumably to force a pause between sentences — confirm against the
    cleaner), runs each piece through ``preprocess``, chunks long pieces
    into 30-word spans to bound per-inference length, and converts each
    chunk to IDs via ``text_to_sequence``.

    Args:
        texts: raw input string, possibly containing several sentences.
        hps: hyperparameter namespace; reads ``hps.data.text_cleaners``
            and ``hps.data.add_blank``.

    Returns:
        list[torch.LongTensor]: one tensor per <=30-word chunk.
    """
    chunk_len = 30  # max words per inference chunk
    text_norm_list = []

    # Normalize curly quotes / ellipsis before splitting.
    texts = texts.replace("“", "").replace("”", "").replace("…", ".")

    # Raw string fixes the invalid '\.' escape of the original pattern;
    # '.' needs no escaping inside a character class.
    sentences = [t.strip() for t in re.split(r"[!:;.\n]", texts)]
    # Drop empties, then interleave a standalone "." after each sentence
    # (each "." becomes its own tensor downstream, as in the original).
    pieces = []
    for sentence in sentences:
        if sentence:
            pieces.append(sentence)
            pieces.append(".")

    for piece in pieces:
        processed = preprocess(piece)
        print(processed)  # kept: debug trace of the cleaned text
        words = processed.split()  # hoisted: split once, not per chunk
        for i in range(0, len(words), chunk_len):
            chunk = " ".join(words[i:i + chunk_len])
            seq = text_to_sequence(chunk, hps.data.text_cleaners)
            if hps.data.add_blank:
                # Intersperse blank symbol (id 0) between tokens.
                seq = commons.intersperse(seq, 0)
            text_norm_list.append(torch.LongTensor(seq))
    return text_norm_list
# Memoized (session, hparams) pairs keyed by (model_path, config_path),
# so the ONNX model is not re-read from disk on every request.
_SESSION_CACHE = {}


def _load_session(model_path, config_path):
    """Load and memoize the ONNX inference session and hyperparameters."""
    key = (model_path, config_path)
    if key not in _SESSION_CACHE:
        sess_options = onnxruntime.SessionOptions()
        model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=sess_options,
            providers=["CPUExecutionProvider"],
        )
        hps = utils.get_hparams_from_file(config_path)
        _SESSION_CACHE[key] = (model, hps)
    return _SESSION_CACHE[key]


def tts(text):
    """Synthesize speech for ``text`` and return the output WAV path.

    Args:
        text: raw input text to synthesize.

    Returns:
        str: path of the written WAV file ("output.wav").
    """
    model_path = "model.onnx"
    config_path = "config.json"
    sid = 9  # fixed speaker id
    output_wav_path = "output.wav"

    model, hps = _load_session(model_path, config_path)

    # Inference controls (noise scale, length scale, noise_w) — fixed.
    scales = np.array([0.667, 1.05, 0.8], dtype=np.float32)
    # Hoisted out of the loop: the original rebound `sid` inside the
    # loop, calling int() on an ndarray from the second chunk onward.
    # Explicit int64 dtype matches the other integer model inputs.
    sid_arr = np.array([int(sid)], dtype=np.int64) if sid is not None else None

    audios = []
    for stn_tst in get_text(text, hps):
        seq = np.expand_dims(np.array(stn_tst, dtype=np.int64), 0)
        seq_lengths = np.array([seq.shape[1]], dtype=np.int64)
        audio = model.run(
            None,
            {
                "input": seq,
                "input_lengths": seq_lengths,
                "scales": scales,
                "sid": sid_arr,
            },
        )[0].squeeze((0, 1))
        audios.append(audio)

    # Concatenate all chunk waveforms and write a single WAV file.
    write(
        data=np.concatenate(audios, axis=0),
        rate=hps.data.sampling_rate,
        filename=output_wav_path,
    )
    return output_wav_path
if __name__ == "__main__":
gr.Interface(
fn=tts,
inputs=[components.Textbox(label="Text Input")],
outputs=components.Audio(type='filepath', label="Generated Speech"),
live=False
).launch(server_name="0.0.0.0", server_port=7860)
|