File size: 5,662 Bytes
04749f1
84d6588
44847f5
74ec31e
5a39f1e
 
 
84d6588
 
5a39f1e
2e910d9
 
 
 
 
 
72aa6e6
2e910d9
 
 
 
 
04749f1
5a39f1e
 
84d6588
 
 
450e414
52dc9ce
 
 
84d6588
 
6b2fd4d
84d6588
 
5a39f1e
 
 
 
84d6588
 
 
 
2e910d9
84d6588
 
 
5a39f1e
 
 
 
84d6588
 
6b2fd4d
84d6588
 
2e910d9
6b2fd4d
84d6588
 
15a5722
84d6588
80be904
6b2fd4d
84d6588
 
6b2fd4d
0e939b1
84d6588
0e939b1
6b2fd4d
84d6588
 
 
6b2fd4d
84d6588
 
 
 
 
205d21f
 
 
2d38eef
205d21f
 
 
 
 
 
 
 
 
2d38eef
205d21f
 
 
 
 
 
 
5a39f1e
205d21f
04749f1
205d21f
 
 
 
 
 
 
 
 
81470ae
205d21f
 
81470ae
84d6588
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np


def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids for the synthesizer.

    Runs the configured text cleaners, then optionally interleaves a blank
    token (id 0) between every symbol when ``hps.data.add_blank`` is set.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# Load model/training hyperparameters for the "leo" experiment.
hps = utils.get_hparams_from_file("./configs/leo.json")
# Build the VITS synthesizer; spectral/segment dimensions derive from the config.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
# Inference only: put the generator into eval mode (disables dropout etc.).
_ = net_g.eval()

# Restore generator weights; no optimizer state is needed for inference (None).
_ = utils.load_checkpoint("./logs/leo/G_4000.pth", net_g, None)
# Precomputed per-sample emotion embeddings from the training set.
# NOTE(review): assumed shape (n_samples, 1024) based on the torch.randn([1,1024])
# fallback in tts() below — confirm against the extraction script.
all_emotions = np.load("all_emotions.npy")
# Maps UI emotion labels (Chinese, shown in the dropdown) to row indices
# in all_emotions. Keys are runtime strings; do not translate.
emotion_dict = {
    "小声(目前没区分)": 0,
    "激动": 1,
    "平静1": 2,
    "平静2": 3
}
import random
def tts(txt, emotion, ns, nsw, ls):
    """Synthesize speech for *txt* with the given emotion specification.

    Args:
        txt: input text (Japanese, per the UI).
        emotion: one of
            - int: row index into ``all_emotions``;
            - "random": a fresh random 1x1024 embedding;
            - "random_sample": a random row of ``all_emotions``;
            - a path ending in "wav": extract the embedding from that file;
            - otherwise: a preset label looked up in ``emotion_dict``.
        ns: noise_scale passed to the model.
        nsw: noise_scale_w passed to the model.
        ls: length_scale passed to the model.

    Returns:
        (audio, randsample): the waveform as a float numpy array, and the
        sampled row index when emotion == "random_sample" (else None).
    """
    stn_tst = get_text(txt, hps)
    randsample = None
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])  # single-speaker checkpoint: speaker id 0
        if isinstance(emotion, int):  # isinstance instead of type(...) == int
            emo = torch.FloatTensor(all_emotions[emotion]).unsqueeze(0)
        elif emotion == "random":
            emo = torch.randn([1, 1024])
        elif emotion == "random_sample":
            # randrange is exclusive of the upper bound; the previous
            # randint(0, n) was inclusive and could index one past the
            # end of all_emotions, raising IndexError.
            randint = random.randrange(all_emotions.shape[0])
            emo = torch.FloatTensor(all_emotions[randint]).unsqueeze(0)
            randsample = randint
        elif emotion.endswith("wav"):
            # Lazy import: emotion_extract pulls in heavy dependencies
            # that are only needed for this branch.
            import emotion_extract
            emo = torch.FloatTensor(emotion_extract.extract_wav(emotion))
        else:
            emo = torch.FloatTensor(all_emotions[emotion_dict[emotion]]).unsqueeze(0)

        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=ns, noise_scale_w=nsw, length_scale=ls, emo=emo)[0][0,0].data.float().numpy()
    return audio, randsample


def tts1(text, emotion, ns, nsw, ls):
    """Tab-1 handler: synthesize using a preset emotion embedding.

    Returns a (message, audio) pair for the Gradio outputs; audio is
    (sampling_rate, waveform) on success, None on rejection.
    """
    if len(text) <= 150:
        audio, _ = tts(text, emotion, ns, nsw, ls)
        return "Success", (hps.data.sampling_rate, audio)
    return "Error: Text is too long", None

def tts2(text, ns, nsw, ls):
    """Tab-2 handler: synthesize with a randomly drawn training-set embedding.

    Returns (sample_id_as_str, (sampling_rate, waveform)) on success so the
    user can reuse the id in tab 3; rejects texts longer than 150 chars.
    """
    if len(text) <= 150:
        audio, randsample = tts(text, "random_sample", ns, nsw, ls)
        return str(randsample), (hps.data.sampling_rate, audio)
    return "Error: Text is too long", None

def tts3(text, sample, ns, nsw, ls):
    """Tab-3 handler: synthesize using an explicit emotion-sample id.

    *sample* comes from a gr.Number widget (float); it is coerced to int
    before indexing into all_emotions. Returns (message, audio) where audio
    is (sampling_rate, waveform) on success, None on any failure.
    """
    if len(text) > 150:
        return "Error: Text is too long", None
    try:
        audio, _ = tts(text, int(sample), ns, nsw, ls)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the UI-friendly fallback without that.
        return "输入参数不为整数或其他错误", None
    return "Success", (hps.data.sampling_rate, audio)
# Build the Gradio UI: four tabs sharing the same slider layout, each wired
# to one of the tts* handlers above. Widget labels are user-facing Chinese
# strings and are left untouched.
app = gr.Blocks()
with app:

    with gr.Tabs():
        # Tab 1: preset emotion chosen from emotion_dict.
        with gr.TabItem("使用预制情感合成"):
            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            tts_input2 = gr.Dropdown(label="情感", choices=list(emotion_dict.keys()),  value="平静1")
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="Message")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts1, [tts_input1, tts_input2, ns, nsw, ls], [tts_output1, tts_output2])
        # Tab 2: random training-set sample; shows the drawn id for reuse in tab 3.
        with gr.TabItem("随机抽取训练集样本作为情感参数"):
            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="随机样本id(可用于第三个tab中合成)")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts2, [tts_input1, ns, nsw, ls], [tts_output1, tts_output2])

        # Tab 3: explicit sample id (integer index into all_emotions).
        with gr.TabItem("使用情感样本id作为情感参数"):

            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。")
            tts_input2 = gr.Number(label="情感样本id", value=0)
            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
            tts_submit = gr.Button("合成音频", variant="primary")
            tts_output1 = gr.Textbox(label="Message")
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(tts3, [tts_input1, tts_input2, ns, nsw, ls], [tts_output1, tts_output2])

        # Tab 4: reference-audio emotion — placeholder only, not implemented yet.
        with gr.TabItem("使用参考音频作为情感参数"):
            tts_input1 = gr.TextArea(label="text", value="暂未实现")

    # NOTE(review): launch() is called inside the `with app:` context; it
    # works, but is conventionally placed after the block — confirm intent.
    app.launch()