Mahiruoshi's picture
Update app.py
0bc5811
raw
history blame
8.11 kB
import ONNXVITS_models
import utils
from text import text_to_sequence
import torch
import commons
def get_text(text, hps):
    """Turn raw text into a tensor of symbol ids.

    The text is converted by ``text_to_sequence`` using ``hps.symbols`` and
    the cleaners listed in ``hps.data.text_cleaners``.  When
    ``hps.data.add_blank`` is truthy the sequence is additionally passed
    through ``commons.intersperse`` with ``0`` as the filler value
    (presumably a blank token between symbols -- confirm in ``commons``).

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    token_ids = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        token_ids = commons.intersperse(token_ids, 0)
    return torch.LongTensor(token_ids)
# Read the hyper-parameters and build the synthesizer from them.
hps = utils.get_hparams_from_file("lovelive/config.json")
symbols = hps.symbols
net_g = ONNXVITS_models.SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
# Switch to evaluation mode, then load the trained generator weights.
_ = net_g.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g)

# Run a single forward pass on a fixed Japanese sentence with speaker id 0.
# NOTE(review): the result `o` is not used afterwards in this listing;
# presumably the call matters for a side effect inside ONNXVITS_models
# (e.g. an ONNX export) -- confirm there.
text1 = get_text("[JA]ありがとうございます。[JA]", hps)
stn_tst = text1
with torch.no_grad():
    # Add a leading batch dimension of size 1 and record the sequence length.
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    sid = torch.tensor([0])
    o = net_g(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1)
'''
import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
import ONNXVITS_infer
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
def get_text(text, hps):
    """Encode ``text`` as a ``torch.LongTensor`` of symbol ids.

    ``text_to_sequence`` does the conversion using the module-level
    ``symbols`` table and the cleaners in ``hps.data.text_cleaners``.  If
    ``hps.data.add_blank`` is truthy, the id list is then run through
    ``commons.intersperse`` with ``0`` as the filler value.
    """
    sequence = text_to_sequence(text, symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# Speaker display name -> integer speaker id.
# The values are exactly those of the former if/elif chain; id 14 is not
# assigned to any name here.
_SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "三色绘恋1": 13,
    "三色绘恋2": 15,
    "派蒙": 16,
}


def selection(speaker):
    """Return the integer speaker id for a speaker display name.

    Fixes the former if/elif chain, whose "三色绘恋2" branch assigned 15 but
    never returned it, so that speaker produced ``None``.

    Args:
        speaker: Display name of the speaker.

    Returns:
        The speaker id, or ``None`` when the name is not in the table
        (the same result the original chain gave for unknown names).
    """
    return _SPEAKER_IDS.get(speaker)
def is_japanese(string):
    """Return True if any character of ``string`` has a code point strictly
    between U+3040 and U+30FF, otherwise False.

    An empty string yields False.
    """
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
def is_english(string):
    """Report whether ``string`` is made up only of ASCII letters, digits,
    spaces and a fixed set of punctuation marks (period, comma, colon,
    semicolon, !, ?, parentheses, underscore, asterisk, double quote and
    single quote).

    At least one character is required, so an empty string yields False.
    """
    import re
    matcher = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return matcher.fullmatch(string) is not None
def sle(language, tts_input0):
    """Wrap ``tts_input0`` in the language tags expected by the text cleaner.

    * "自动": the text is wrapped unchanged in ``[JA]`` tags when
      ``is_japanese`` accepts it, otherwise in ``[ZH]`` tags.
    * "中文" / "日文": newlines become '。' and spaces become ',', then the
      text is wrapped in ``[ZH]`` / ``[JA]`` tags respectively.
    * Any other value: returns ``None``.
    """
    if language == "自动":
        tag = "JA" if is_japanese(tts_input0) else "ZH"
        return f"[{tag}]{tts_input0}[{tag}]"

    tag = {"中文": "ZH", "日文": "JA"}.get(language)
    if tag is None:
        return None

    normalized = tts_input0.replace('\n', '。').replace(' ', ',')
    return "[" + tag + "]" + normalized + "[" + tag + "]"
# Split raw input text into a list of short chunks to be synthesized one by one.
# NOTE(review): this listing has lost its indentation, so the nesting of the
# statements below -- in particular the last `final_list.append(i)` before the
# filter -- cannot be determined from here. Confirm against the original file
# before reformatting or changing this function.
def extrac(text):
# Remove everything enclosed in angle brackets (<...>).
text = re.sub("<[^>]*>","",text)
# One candidate entry per input line.
result_list = re.split(r'\n', text)
final_list = []
for i in result_list:
# Lines accepted by is_english() are replaced by the `.katakana` attribute of
# romajitable.to_kana(i).
if is_english(i):
i = romajitable.to_kana(i).katakana
# Strip newlines and spaces from the entry.
i = i.replace('\n','').replace(' ','')
if len(i)>1:
# Entries longer than 20 characters are split further on the '。' full stop.
if len(i) > 20:
try:
cur_list = re.split(r'。', i)
# The inner loop reuses the name `i`, rebinding the outer loop variable.
for i in cur_list:
if len(i)>1:
# Re-attach the full stop that the split removed.
final_list.append(i+'。')
# Bare except: any error raised in the try body is silently ignored.
except:
pass
final_list.append(i)
# Drop empty strings, echo the list to stdout, and return it.
final_list = [x for x in final_list if x != '']
print(final_list)
return final_list
def infer(language, text, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
    """Synthesize ``text`` sentence by sentence and return the joined audio.

    Args:
        language: Language choice forwarded to ``sle`` for every sentence.
        text: Raw input text.  Bracketed spans are removed before synthesis.
        speaker_id: Speaker display name; converted to an integer id with
            ``selection``.
        n_scale: Passed to the model as ``noise_scale``.
        n_scale_w: Passed to the model as ``noise_scale_w``.
        l_scale: Passed to the model as ``length_scale``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``sampling_rate`` is
        ``hps_ms.data.sampling_rate`` and ``samples`` is the concatenation of
        the per-sentence arrays.

    Raises:
        TypeError: If ``selection`` returns ``None`` for ``speaker_id``.
        ValueError: If no sentence could be synthesized (previously raised
            by ``np.concatenate`` with a less helpful message).
    """
    speaker_id = int(selection(speaker_id))

    # Rewrite every bracket style to <...>; extrac() strips <...> spans.
    for opener in ['【', '[', '(', '(']:
        text = text.replace(opener, '<')
    for closer in ['】', ']', ')', ')']:
        text = text.replace(closer, '>')

    final_list = extrac(text.replace('“', '').replace('”', ''))
    audio_fin = []
    for index, sentence in enumerate(final_list, start=1):
        try:
            stn_tst = get_text(sle(language, sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                audio = net_g_ms.infer(
                    x_tst,
                    x_tst_lengths,
                    sid=sid,
                    noise_scale=n_scale,
                    noise_scale_w=n_scale_w,
                    length_scale=l_scale,
                )[0][0, 0].data.cpu().float().numpy()
                t2 = time.time()
            spending_time = "第" + str(index) + "句的推理时间为:" + str(t2 - t1) + "s"
            print(spending_time)
            audio_fin.append(audio)
        except Exception as exc:
            # Best effort: skip a sentence that fails, but no longer swallow
            # KeyboardInterrupt/SystemExit, and show what actually went wrong.
            print('存在非法字符')
            print(repr(exc))

    if not audio_fin:
        # np.concatenate([]) would raise an opaque ValueError here.
        raise ValueError("no audio was synthesized from the given text")
    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin))
# Choices offered by the language dropdown in the UI below.
lan = ["中文","日文","自动"]
# Choices offered by the speaker dropdown in the UI below.
# NOTE(review): selection() also maps "彼方", "三色绘恋1" and "三色绘恋2", which
# are not listed here -- confirm whether that omission is intentional.
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
# All tensors created in infer() are moved to this device (CPU).
dev = torch.device("cpu")
# Hyper-parameters read from the model's JSON config.
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
# Build the synthesizer from the symbol count and values derived from the config.
net_g_ms = ONNXVITS_infer.SynthesizerTrn(
len(symbols),
hps_ms.data.filter_length // 2 + 1,
hps_ms.train.segment_size // hps_ms.data.hop_length,
n_speakers=hps_ms.data.n_speakers,
**hps_ms.model)
# Switch to evaluation mode, then load the trained generator checkpoint.
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g_ms)
# Build the Gradio UI: one tab holding the text box, language and speaker
# dropdowns, three synthesis sliders, a submit button and the audio output.
# NOTE(review): indentation is lost in this listing, so the exact nesting of
# the `with` blocks and the position of app.launch() relative to them should
# be confirmed against the original file.
app = gr.Blocks()
with app:
with gr.Tabs():
with gr.TabItem("虹团vits模型,现可按句分割实现长文本合成,onnx导出后存在质量损失,建议本地运行vits模型"):
# Input text area, pre-filled with a sample passage.
tts_input1 = gr.TextArea(label="去标贝新模型,老版本在lovelive文件夹中", value="数千怀言者已经为你集结,列队在通往主舰桥的过道上。他们歌唱着你们名字,高声呼喊,以一种原始的、咆哮般的合唱作为对你的致敬。你从他们中间走过,一边点头,一边接受他们的赞美,你沉溺其中,几乎被他们巨大的音量所震撼。\n他们之中没有一个胆敢直视你。没有一个能够承受。你对他们超人类的眼睛来说都太过光辉。从他们中间走过时,你巨大的影子从他们身上掠过,他们立时将目光挪开,眼含泪水,吟诵你的大名时甚至不敢看你一眼。他们的吟唱中含有愤怒。几乎是疯狂的绝望。那感觉就好像他们害怕停下来,害怕自己会喘息停顿,好像尖叫出你的名字是唯一能让他们活着的事情。\n或许确实如此。作为对他们崇拜的回应,你谦虚地抬抬手,随后走进主舰桥。\nI In a word, Horus is a joker.")
# Language selector; defaults to "自动".
language = gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True)
# Sliders passed to infer() as n_scale, n_scale_w and l_scale respectively
# (see the inputs list of tts_submit.click below).
# The second slider defaults to 0.7, whereas infer()'s own default for
# n_scale_w is 0.8.
para_input1 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声比例,以控制情感", value=0.667)
para_input2 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7)
para_input3 = gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)
tts_submit = gr.Button("Generate", variant="primary")
# Speaker selector; defaults to "歩夢".
speaker1 = gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True)
tts_output2 = gr.Audio(label="Output")
# Clicking the button calls infer() with the widgets' values, in infer()'s
# parameter order, and sends the result to the audio output.
tts_submit.click(infer, [language,tts_input1,speaker1,para_input1,para_input2,para_input3], [tts_output2])
#app.launch(share=True)
app.launch()
'''