import time

import torch
import gradio as gr
import openai

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

def get_text(text, hps):
    # Convert raw text into a tensor of symbol IDs for the synthesizer.
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

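# Usage sketch (hypothetical input): with hps_ms loaded from config.json below,
# get_text("[JA]こんにちは[JA]", hps_ms) yields a 1-D LongTensor of symbol IDs,
# batched via unsqueeze(0) before being passed to SynthesizerTrn.infer.
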
def get_label(text, label):
    # Report whether the [label] tag is present, and return the text with the tag stripped.
    if f'[{label}]' in text:
        return True, text.replace(f'[{label}]', '')
    else:
        return False, text

# Display name -> speaker ID in the model. The original mapping skips ID 14:
# "三色绘恋2" maps to 15 and "派蒙" to 16.
speaker_ids = {
    "高咲侑(误)": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "三色绘恋1": 13,
    "三色绘恋2": 15,
    "派蒙": 16,
}

def selection(speaker):
    # Look up the speaker ID for a display name.
    return speaker_ids[speaker]

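# Example: selection("かすみ") returns 2. An unknown name now raises KeyError,
# rather than falling through and returning None as the original if/elif chain did.
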
def friend_chat(text, key, call_name, tts_input3):
    # Build a role-play prompt from the persona text and ask the OpenAI
    # completions endpoint for the character's reply.
    openai.api_key = key
    identity = tts_input3
    start_sequence = '\n' + str(call_name) + ':'
    if text == 'quit':
        return text
    prompt = identity + text + start_sequence
    # Legacy Completions API (openai<1.0); text-davinci-003 has since been retired.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.5,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.5,
        presence_penalty=0.0,
        stop=["\nYou:"]
    )
    return response['choices'][0]['text'].strip()

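# Migration sketch, not part of the original script: under openai>=1.0 the call
# above would instead go through a client object, roughly
#     from openai import OpenAI
#     client = OpenAI(api_key=key)
#     resp = client.completions.create(model=..., prompt=prompt, stop=["\nYou:"])
#     reply = resp.choices[0].text.strip()
# with the model name and sampling parameters carried over from the legacy call.
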
def is_japanese(string):
    for ch in string:
        if 0x3040 < ord(ch) < 0x30FF:  # hiragana/katakana blocks
            return True
    return False

def sle(language, text, tts_input2, call_name, tts_input3):
    # Wrap the text in the language tags the text cleaners expect.
    # "中文" = Chinese, "对话" = chat mode, "日文" = Japanese.
    if language == "中文":
        return "[ZH]" + text.replace('\n', '。').replace(' ', ',') + "[ZH]"
    elif language == "对话":
        # Chat mode: generate a reply first, then tag it by detected language.
        text = friend_chat(text, tts_input2, call_name, tts_input3).replace('\n', '。').replace(' ', ',')
        return f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
    elif language == "日文":
        return "[JA]" + text.replace('\n', '。').replace(' ', ',') + "[JA]"

def infer(language, text, tts_input2, tts_input3, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
    speaker_name = speaker_id  # the dropdown passes the display name
    speaker_id = int(selection(speaker_id))
    stn_tst = get_text(sle(language, text, tts_input2, speaker_name, tts_input3), hps_ms)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(dev)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
        sid = torch.LongTensor([speaker_id]).to(dev)
        t1 = time.time()
        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                               noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
        t2 = time.time()
        print("Inference time: " + str(t2 - t1) + "s")
    return (hps_ms.data.sampling_rate, audio)

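# Example call (hypothetical inputs): infer("日文", "こんにちは", "", "", "かすみ")
# returns a (sampling_rate, waveform) tuple in the format gr.Audio expects.
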
lan = ["中文", "日文", "对话"]
idols = ["高咲侑(误)", "歩夢", "かすみ", "しずく", "果林", "愛", "彼方", "せつ菜",
         "璃奈", "栞子", "エマ", "ランジュ", "ミア", "三色绘恋1", "三色绘恋2", "派蒙"]

# Load hyperparameters and build the synthesizer on CPU.
dev = torch.device("cpu")
hps_ms = utils.get_hparams_from_file("config.json")
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).to(dev)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("G_1049000.pth", net_g_ms, None)

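# "config.json" and "G_1049000.pth" are assumed to sit next to this script;
# load_checkpoint restores the generator weights (optimizer state is skipped
# by passing None), leaving net_g_ms ready for inference.
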
app = gr.Blocks()

with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            # Dropdown values must match the entries in `lan` and `idols`,
            # so those stay in their original language.
            tts_input1 = gr.TextArea(label="Enter your text", value="一次審査、二次審査、それぞれの欄に記入をお願いします。")
            tts_input2 = gr.TextArea(label="Enter your OpenAI API key to use chat mode", value="官网")
            tts_input3 = gr.TextArea(label="Persona you give the character", value="恶魔系学妹。")
            language = gr.Dropdown(label="Synthesis mode", choices=lan, value="对话", interactive=True)
            para_input1 = gr.Slider(minimum=0.01, maximum=1.0, label="Noise scale", value=0.667)
            para_input2 = gr.Slider(minimum=0.01, maximum=1.0, label="Noise scale w (duration noise)", value=0.8)
            para_input3 = gr.Slider(minimum=0.1, maximum=10, label="Length scale (speaking rate)", value=1)
            tts_submit = gr.Button("Generate", variant="primary")
            speaker1 = gr.Dropdown(label="Speaker", choices=idols, value="かすみ", interactive=True)
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(infer,
                             [language, tts_input1, tts_input2, tts_input3, speaker1,
                              para_input1, para_input2, para_input3],
                             [tts_output2])

app.launch()