"""Gradio web demo that runs multi-speaker VITS text-to-speech inference."""

import json
import math
import os
import re
import time
import unicodedata

import gradio as gr
import IPython.display as ipd
import matplotlib.pyplot as plt
import torch
from scipy.io.wavfile import write
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Display name shown in the UI -> speaker index passed to the model as `sid`.
# NOTE(review): "派蒙" maps to 16 rather than 13; presumably this matches the
# speaker table of the trained checkpoint -- confirm against the config.
_SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "派蒙": 16,
}

# UI language name -> (tag wrapped around the text, replacement for newlines).
_LANGUAGE_MARKUP = {
    "中文": ("[ZH]", "。"),
    "英文": ("[EN]", "."),
    "日文": ("[JA]", "。"),
}


def get_text(text, hps):
    """Convert *text* into a ``torch.LongTensor`` of symbol ids.

    The text is run through ``text_to_sequence`` with the cleaners listed in
    ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy, the id
    ``0`` is interspersed into the sequence via ``commons.intersperse``.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


def get_label(text, label):
    """Report whether ``[label]`` occurs in *text* and strip it.

    Returns:
        ``(True, text_without_marker)`` when the marker ``[label]`` is
        present, otherwise ``(False, text)`` with the text unchanged.
    """
    marker = f'[{label}]'
    if marker in text:
        return True, text.replace(marker, '')
    return False, text


def selection(speaker):
    """Return the speaker index for a display name, or ``None`` if unknown."""
    return _SPEAKER_IDS.get(speaker)


def sle(language, tts_input0):
    """Wrap *tts_input0* in the language tag expected by the text cleaners.

    Newlines are replaced by the language's sentence terminator and spaces
    by commas, then the result is enclosed in ``[ZH]``/``[EN]``/``[JA]``.

    Returns:
        The tagged string, or ``None`` when *language* is not one of
        "中文", "英文" or "日文".
    """
    markup = _LANGUAGE_MARKUP.get(language)
    if markup is None:
        return None
    tag, sentence_end = markup
    body = tts_input0.replace('\n', sentence_end).replace(' ', ',')
    return tag + body + tag


def infer(language, text, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
    """Synthesize speech for *text* and return it in Gradio's audio format.

    Args:
        language: UI language name ("中文", "英文" or "日文").
        text: Raw input text to speak.
        speaker_id: Speaker display name (a key of the speaker table).
        n_scale: Forwarded to the model as ``noise_scale``.
        n_scale_w: Forwarded to the model as ``noise_scale_w``.
        l_scale: Forwarded to the model as ``length_scale``.

    Returns:
        ``((sampling_rate, audio), image)`` where ``audio`` is a NumPy array
        and ``image`` is the path ``'1.png'``.

    Raises:
        ValueError: If the speaker name or the language is not recognised.
    """
    speaker_index = selection(speaker_id)
    if speaker_index is None:
        # Previously this surfaced as an opaque TypeError from int(None).
        raise ValueError(f"Unknown speaker: {speaker_id!r}")
    speaker_index = int(speaker_index)

    answer = sle(language, text)
    if answer is None:
        # Previously None was handed straight to the text cleaners.
        raise ValueError(f"Unsupported language: {language!r}")

    stn_tst = get_text(answer, hps_ms)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(dev)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
        sid = torch.LongTensor([speaker_index]).to(dev)
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments.
        t1 = time.perf_counter()
        audio = net_g_ms.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=n_scale,
            noise_scale_w=n_scale_w,
            length_scale=l_scale,
        )[0][0, 0].data.cpu().float().numpy()
        t2 = time.perf_counter()
    spending_time = "推理时间:" + str(t2 - t1) + "s"
    image = '1.png'
    print(spending_time)
    return (hps_ms.data.sampling_rate, audio), image


# Choices offered by the two dropdowns.
lan = ["中文", "日文", "英文"]
idols = ["高咲侑", "歩夢", "かすみ", "しずく", "果林", "愛", "彼方", "せつ菜", "璃奈", "栞子", "エマ", "ランジュ", "ミア", "派蒙"]

# Model setup: everything runs on the CPU.
dev = torch.device("cpu")
hps_ms = utils.get_hparams_from_file("config.json")
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).to(dev)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("G_701000.pth", net_g_ms, None)

# UI definition: one tab with text/language/speaker inputs, three sliders
# feeding the inference parameters, and audio + image outputs.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            tts_input1 = gr.TextArea(label="VITS模型,绝赞训练中", value="一次審査、二次審査、それぞれの欄に記入をお願いします。")
            language = gr.Dropdown(label="选择语言", choices=lan, value="日文", interactive=True)
            para_input1 = gr.Slider(minimum=0.01, maximum=1.0, label="更改噪声比例", value=0.667)
            para_input2 = gr.Slider(minimum=0.01, maximum=1.0, label="更改噪声偏差", value=0.8)
            para_input3 = gr.Slider(minimum=0.1, maximum=10, label="更改时间比例", value=1)
            tts_submit = gr.Button("Generate", variant="primary")
            speaker1 = gr.Dropdown(label="选择说话人", choices=idols, value="かすみ", interactive=True)
            tts_output2 = gr.Audio(label="Output")
            tts_output3 = gr.Image(label="Model")
            # Input order must match infer's positional parameters.
            tts_submit.click(
                infer,
                [language, tts_input1, speaker1, para_input1, para_input2, para_input3],
                [tts_output2, tts_output3],
            )
# Pass share=True to launch() to expose a public link.
app.launch()