Spaces:
Build error
Build error
File size: 5,662 Bytes
04749f1 84d6588 44847f5 74ec31e 5a39f1e 84d6588 5a39f1e 2e910d9 72aa6e6 2e910d9 04749f1 5a39f1e 84d6588 450e414 52dc9ce 84d6588 6b2fd4d 84d6588 5a39f1e 84d6588 2e910d9 84d6588 5a39f1e 84d6588 6b2fd4d 84d6588 2e910d9 6b2fd4d 84d6588 15a5722 84d6588 80be904 6b2fd4d 84d6588 6b2fd4d 0e939b1 84d6588 0e939b1 6b2fd4d 84d6588 6b2fd4d 84d6588 205d21f 2d38eef 205d21f 2d38eef 205d21f 5a39f1e 205d21f 04749f1 205d21f 81470ae 205d21f 81470ae 84d6588 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import gradio as gr
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
def get_text(text, hps):
    """Turn raw text into a tensor of symbol ids.

    The text is passed through ``text_to_sequence`` together with the cleaner
    names found in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is
    truthy, the id sequence is additionally run through
    ``commons.intersperse(..., 0)`` (presumably inserting id 0 between
    symbols -- confirm against ``commons``).

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    symbol_ids = (
        commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    )
    return torch.LongTensor(symbol_ids)
# ---------------------------------------------------------------------------
# One-time setup executed at import: configuration, model, emotion vectors.
# ---------------------------------------------------------------------------

# Hyper-parameters come from the JSON config file.
hps = utils.get_hparams_from_file("./configs/leo.json")

# Build the synthesizer from the symbol count and sizes derived from the
# config; every entry of ``hps.model`` is forwarded as a keyword argument.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)

# Put the network in eval mode first, then load the generator checkpoint
# into it (no optimizer is passed).
_ = net_g.eval()
_ = utils.load_checkpoint("./logs/leo/G_4000.pth", net_g, None)

# Array of emotion vectors; rows are addressed by integer index.
all_emotions = np.load("all_emotions.npy")

# Preset name -> row index into ``all_emotions``. The names are user-facing
# dropdown choices, so they must stay exactly as written.
emotion_dict = {
    preset_name: row_index
    for row_index, preset_name in enumerate(
        ("小声(目前没区分)", "激动", "平静1", "平静2")
    )
}
import random
def tts(txt, emotion, ns, nsw, ls):
    """Synthesize audio for *txt* conditioned on an emotion vector.

    Args:
        txt: Text to synthesize; converted to symbol ids by ``get_text``.
        emotion: Selects the vector passed to the model as ``emo``:

            * an integer (Python or NumPy) -- row index into ``all_emotions``;
            * ``"random"`` -- a fresh ``torch.randn([1, 1024])`` vector;
            * ``"random_sample"`` -- a uniformly chosen row of
              ``all_emotions``;
            * a string ending in ``"wav"`` -- handed to
              ``emotion_extract.extract_wav``;
            * anything else -- looked up in ``emotion_dict`` to obtain a row
              index.
        ns: Forwarded to ``net_g.infer`` as ``noise_scale``.
        nsw: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
        ls: Forwarded to ``net_g.infer`` as ``length_scale``.

    Returns:
        ``(audio, randsample)``: ``audio`` is a NumPy array taken from
        element ``[0][0, 0]`` of the model output; ``randsample`` is the row
        index drawn for ``"random_sample"`` and ``None`` for every other mode.

    Raises:
        IndexError: If an integer ``emotion`` is outside ``all_emotions``.
        KeyError: If ``emotion`` is a string that matches no mode and is not
            a key of ``emotion_dict``.
    """
    stn_tst = get_text(txt, hps)
    randsample = None
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])  # speaker id is fixed to 0

        if isinstance(emotion, (int, np.integer)):
            emo = torch.FloatTensor(all_emotions[emotion]).unsqueeze(0)
        elif emotion == "random":
            emo = torch.randn([1, 1024])
        elif emotion == "random_sample":
            # randrange excludes its upper bound. randint(0, n) includes n
            # and would occasionally index one past the last row.
            randsample = random.randrange(all_emotions.shape[0])
            emo = torch.FloatTensor(all_emotions[randsample]).unsqueeze(0)
        elif emotion.endswith("wav"):
            # Imported lazily so the module is only needed for this mode.
            import emotion_extract

            # NOTE(review): no unsqueeze here, unlike the other branches --
            # presumably extract_wav already returns a batched array; confirm.
            emo = torch.FloatTensor(emotion_extract.extract_wav(emotion))
        else:
            emo = torch.FloatTensor(
                all_emotions[emotion_dict[emotion]]
            ).unsqueeze(0)

        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=ns,
            noise_scale_w=nsw,
            length_scale=ls,
            emo=emo,
        )[0][0, 0].data.float().numpy()
    return audio, randsample
def tts1(text, emotion, ns, nsw, ls):
    """UI handler: synthesize *text* with the given emotion selector.

    Input longer than 150 characters is rejected before the model is run;
    otherwise all arguments are forwarded unchanged to ``tts``.

    Returns:
        ``("Success", (sampling_rate, audio))`` on success, or
        ``("Error: Text is too long", None)`` when the text is too long.
    """
    if len(text) <= 150:
        waveform, _ = tts(text, emotion, ns, nsw, ls)
        return "Success", (hps.data.sampling_rate, waveform)
    return "Error: Text is too long", None
def tts2(text, ns, nsw, ls):
    """UI handler: synthesize *text* using a randomly drawn emotion sample.

    Input longer than 150 characters is rejected before the model is run;
    otherwise ``tts`` is called in ``"random_sample"`` mode.

    Returns:
        ``(sample_id_as_str, (sampling_rate, audio))`` on success, where the
        first element is the drawn sample index converted with ``str``; or
        ``("Error: Text is too long", None)`` when the text is too long.
    """
    if len(text) <= 150:
        waveform, picked = tts(text, "random_sample", ns, nsw, ls)
        return str(picked), (hps.data.sampling_rate, waveform)
    return "Error: Text is too long", None
def tts3(text, sample, ns, nsw, ls):
    """UI handler: synthesize *text* using an emotion sample chosen by id.

    Args:
        text: Text to synthesize; rejected if longer than 150 characters.
        sample: Emotion sample id; converted with ``int`` before use.
        ns: Forwarded to ``tts`` (noise scale).
        nsw: Forwarded to ``tts`` (noise scale w).
        ls: Forwarded to ``tts`` (length scale).

    Returns:
        ``("Success", (sampling_rate, audio))`` on success;
        ``("Error: Text is too long", None)`` when the text is too long; or
        the generic failure message with ``None`` when the id cannot be
        converted to an integer or synthesis raises an error.
    """
    if len(text) > 150:
        return "Error: Text is too long", None
    try:
        sample_id = int(sample)
        audio, _ = tts(text, sample_id, ns, nsw, ls)
        return "Success", (hps.data.sampling_rate, audio)
    except Exception:
        # Best-effort boundary for the UI: any ordinary failure becomes a
        # message. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return "输入参数不为整数或其他错误", None
# Gradio user interface: one tab per way of choosing the emotion vector.
# Component variables (tts_input1, ns, nsw, ls, ...) are rebound in every
# tab; each click handler is wired right after its own tab's components are
# created, so it captures that tab's widgets.
app = gr.Blocks()
with app:
    with gr.Tabs():
        # Tab 1 -- "synthesize with a preset emotion": the dropdown offers
        # the keys of emotion_dict and the request is handled by tts1.
        with gr.TabItem("使用预制情感合成"):
            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            tts_input2 = gr.Dropdown(label="情感", choices=list(emotion_dict.keys()), value="平静1")
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="Message")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts1, [tts_input1, tts_input2, ns, nsw, ls], [tts_output1, tts_output2])
        # Tab 2 -- "draw a random training-set sample as the emotion":
        # handled by tts2; the first output box shows the drawn sample id
        # (its label says the id can be reused in the third tab).
        with gr.TabItem("随机抽取训练集样本作为情感参数"):
            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="随机样本id(可用于第三个tab中合成)")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts2, [tts_input1, ns, nsw, ls], [tts_output1, tts_output2])
        # Tab 3 -- "use an emotion sample id as the emotion": the numeric
        # input is passed to tts3, which converts it with int().
        with gr.TabItem("使用情感样本id作为情感参数"):
            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            tts_input2 = gr.Number(label="情感样本id", value=0)
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="Message")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts3, [tts_input1, tts_input2, ns, nsw, ls], [tts_output1, tts_output2])
        # Tab 4 -- "use reference audio as the emotion": placeholder only;
        # the text area's default value reads "not implemented yet" and no
        # handler is attached.
        with gr.TabItem("使用参考音频作为情感参数"):
            tts_input1 = gr.TextArea(label="text", value="暂未实现")
# NOTE(review): the original indentation was lost, so whether launch() sat
# inside or after the `with app:` block cannot be recovered -- confirm.
app.launch()
|