import json
import os
import re

import torch
from torch import no_grad, LongTensor
import gradio as gr

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
# When True, tts_fn rejects inputs longer than the allowed maximum (see create_tts_fn).
limitation = False
# All inference runs on CPU.
device = torch.device("cpu")
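# JavaScript snippet (filled in with .format(audio_id=...)) that downloads the rendered
# audio element from the page as a randomly named .wav file.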
download_audio_js = """
() =>{{
let root = document.querySelector("body > gradio-app");
if (root.shadowRoot != null)
root = root.shadowRoot;
let audio = root.querySelector("#{audio_id}").querySelector("audio");
if (audio == undefined)
return;
audio = audio.src;
let oA = document.createElement("a");
oA.download = Math.floor(Math.random()*100000000)+'.wav';
oA.href = audio;
document.body.appendChild(oA);
oA.click();
oA.remove();
}}
"""
# download = gr.Button("Download Audio")
tts_input1 = gr.TextArea(label="inputText", value="あなたと一緒にいると、とても興奮します", elem_id=f"tts-input{0}")
tts_output2 = gr.Audio(label="outputAudio", elem_id=f"tts-audio{0}")
def get_text(text, hps, is_symbol):
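    """Convert input text (or raw symbols) into the LongTensor of token IDs expected by the model."""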
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
if hps.data.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = LongTensor(text_norm)
return text_norm
def create_tts_fn(model, hps, speaker_ids):
def tts_fn(text, speaker, speed, is_symbol):
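        """Synthesize `text` for the given speaker and speed; returns a status string and (sampling_rate, waveform)."""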
if limitation:
            text_len = len(re.sub(r"\[([A-Z]{2})\]", "", text))
max_len = 150
if is_symbol:
max_len *= 3
if text_len > max_len:
return "Error: Text is too long", None
speaker_id = speaker_ids[speaker]
stn_tst = get_text(text, hps, is_symbol)
with no_grad():
x_tst = stn_tst.unsqueeze(0).to(device)
x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
sid = LongTensor([speaker_id]).to(device)
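            # noise_scale / noise_scale_w control synthesis variability; length_scale = 1/speed controls speaking rate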
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
del stn_tst, x_tst, x_tst_lengths, sid
return "Success", (hps.data.sampling_rate, audio)
return tts_fn
def create_to_symbol_fn(hps):
def to_symbol_fn(is_symbol_input, input_text, temp_text):
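        """If symbol input is enabled, show the cleaned (phonemized) text and cache the original; otherwise restore the cached text."""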
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
else (temp_text, temp_text)
return to_symbol_fn
def main(input, authorization):
    """Synthesize `input` with the first saved model; `authorization` must match the API_TOKEN environment variable."""
    # simple shared-secret check: reject the request if the token does not match
    if os.environ.get("API_TOKEN") != authorization:
        return
models_tts = []
models_vc = []
models_soft_vc = []
device = torch.device("cpu")
global result
with open("saved_model/info.json", "r", encoding="utf-8") as f:
models_info = json.load(f)
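    # info.json maps model indices ("0", "1", ...) to metadata: title, author, lang, example and cover image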
for i, info in models_info.items():
if int(i) == 0:
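            # only the first model entry ("0") in info.json is loaded and used for synthesis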
name = info["title"]
author = info["author"]
lang = info["lang"]
example = info["example"]
config_path = f"saved_model/{i}/config.json"
model_path = f"saved_model/{i}/model.pth"
cover = info["cover"]
cover_path = f"saved_model/{i}/{cover}" if cover else None
hps = utils.get_hparams_from_file(config_path)
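            # construct the synthesizer from the saved hyper-parameters and load its checkpoint for CPU inference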
model = SynthesizerTrn(
len(hps.symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model)
utils.load_checkpoint(model_path, model, None)
model.eval().to(device)
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
# input_text = get_text("ヨスガノソラ", hps, True)
print(speaker_ids[0])
vtts = create_tts_fn(model, hps, speaker_ids)
symbol = create_to_symbol_fn(hps)
            # pass the list index (0 = first speaker); tts_fn looks up speaker_ids[speaker] itself
            result = vtts(input, 0, 1, False)
# wf.write('anime_girl3.wav', result[1][0], result[1][1])
# print(type(result[1][0]), result[1][0])
# download.click(None, [], [], _js=download_audio_js.format(audio_id=f"tts-audio{0}"))
            # result is ("Success", (sampling_rate, waveform)); return the (sr, audio) tuple for the Audio output
            return result[1]
print(models_tts)
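# Expose main() as a minimal Gradio app: two text inputs (the text to speak and the API token) and one audio output.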
demo = gr.Interface(fn=main, inputs=["text", "text"], outputs="audio")
# outputs=gr.outputs.Textbox(label="outputAudio"))
if __name__ == "__main__":
demo.launch(debug=True)
# main(input = "あなたと一緒にいると、とても興奮します")