File size: 2,744 Bytes
5a39f1e
72aa6e6
 
 
 
 
5a39f1e
72aa6e6
 
 
5a39f1e
 
72aa6e6
5a39f1e
 
 
72aa6e6
5a39f1e
 
72aa6e6
 
5a39f1e
72aa6e6
 
 
5a39f1e
72aa6e6
5a39f1e
72aa6e6
 
3980d4c
a15140f
 
 
72aa6e6
5a39f1e
 
 
 
72aa6e6
 
 
 
f6fdc84
 
 
 
 
 
5a39f1e
72aa6e6
5a39f1e
 
 
f6fdc84
72aa6e6
 
 
 
5a39f1e
72aa6e6
 
 
5a39f1e
72aa6e6
 
 
 
 
5a39f1e
72aa6e6
5a39f1e
5fed7f2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import commons
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write
import numpy as np

# Load the JSON config into a plain dict. The file is opened with a context
# manager so the handle is closed deterministically instead of being leaked.
# NOTE(review): despite the name, this reads the same configs/leo.json file
# that `hps` is built from below, and `emotion_dict` is not referenced in the
# visible part of this file — confirm whether it is still needed.
with open("configs/leo.json", "r") as config_file:
    emotion_dict = json.load(config_file)

# Build the synthesizer from the hyper-parameters, switch it to eval mode and
# load the pretrained generator weights from the checkpoint.
hps = utils.get_hparams_from_file("./configs/leo.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
_ = net_g.eval()
_ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)

# Text-to-speech synthesis
def _pick_random_emotion_wav(root):
    """Return the name of a random wav in ``root`` that has a ``.emo.npy`` file.

    Raises:
        FileNotFoundError: If no file in ``root`` ends with "wav" and has a
            sibling ``<name>.emo.npy`` file (the original rejection-sampling
            loop would spin forever in that situation).
    """
    import random  # local import: `random` is not imported at module level

    candidates = [
        name
        for name in os.listdir(root)
        if name.endswith("wav") and os.path.exists(f"{root}/{name}.emo.npy")
    ]
    if not candidates:
        raise FileNotFoundError(
            f"no wav file with a matching .emo.npy embedding found in {root!r}"
        )
    return random.choice(candidates)


def _load_emotion_embedding(emotion):
    """Turn the ``emotion`` argument of :func:`tts` into an embedding tensor."""
    if emotion == "random_sample":
        # Use the precomputed embedding of a randomly chosen reference audio.
        random_emotion_root = "wavs"
        rand_wav = _pick_random_emotion_wav(random_emotion_root)
        emo = torch.FloatTensor(
            np.load(f"{random_emotion_root}/{rand_wav}.emo.npy")
        ).unsqueeze(0)
        print(f"{random_emotion_root}/{rand_wav}")
        return emo

    # Otherwise extract the emotion features from the provided audio file.
    # Imported lazily, as in the original code, so the module is only loaded
    # when this branch is actually used.
    import emotion_extract

    return torch.FloatTensor(emotion_extract.extract_wav(emotion))


def tts(txt, emotion, roma=False, length_scale=1):
    """Synthesize speech for ``txt`` conditioned on an emotion embedding.

    Args:
        txt: Input text. It is converted with ``get_text_byroma`` when
            ``roma`` is true and with ``get_text`` otherwise.
        emotion: Either the literal ``"random_sample"`` (pick a random
            precomputed ``.emo.npy`` embedding from the ``wavs`` directory)
            or a path ending in ``"wav"`` whose embedding is extracted with
            ``emotion_extract.extract_wav``.
        roma: Selects the text front-end, see ``txt``.
        length_scale: Forwarded to ``net_g.infer``. Previously this argument
            was ignored and a hard-coded value of 1.2 was used instead.

    Returns:
        A ``(sampling_rate, audio)`` tuple, where ``audio`` is the NumPy array
        produced by the model and ``sampling_rate`` is
        ``hps.data.sampling_rate``.

    Raises:
        ValueError: If ``emotion`` is neither ``"random_sample"`` nor a string
            ending in ``"wav"``.
        FileNotFoundError: If ``"random_sample"`` is requested but no usable
            reference embedding exists.
    """
    # Fail fast on a bad emotion argument. The original only printed a message
    # and then crashed with a NameError on the unbound `emo` variable.
    is_wav_path = isinstance(emotion, str) and emotion.endswith("wav")
    if emotion != "random_sample" and not is_wav_path:
        raise ValueError(
            f"emotion参数不正确: {emotion!r} "
            "(expected 'random_sample' or a path ending in 'wav')"
        )

    # NOTE(review): get_text / get_text_byroma are neither defined nor imported
    # in the visible part of this file — confirm they are provided elsewhere.
    if roma:
        stn_tst = get_text_byroma(txt, hps)
    else:
        stn_tst = get_text(txt, hps)

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])  # speaker id 0 is always used
        emo = _load_emotion_embedding(emotion)

        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=length_scale,
            emo=emo,
        )[0][0, 0].data.float().numpy()

    # Kept from the original so that notebook users still get an inline player.
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

    # Return the audio as well so non-notebook callers can use the result.
    return hps.data.sampling_rate, audio


# GUI callback
def run_tts(text, emotion, roma=False):
    """Gradio callback: synthesize ``text`` and return ``(sampling_rate, audio)``.

    The result of :func:`tts` is returned so the Audio output component
    receives the synthesized waveform (the original returned ``None``).
    """
    return tts(text, emotion, roma)

# Input widgets, in the positional order expected by run_tts(text, emotion, roma):
# two free-text boxes followed by one checkbox.
# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio namespaces —
# confirm the installed Gradio version still provides them.
inputs = [
    gr.inputs.Textbox(label=box_label)
    for box_label in ("请输入文本", "请输入参考音频路径或选择'random_sample'随机选择")
]
inputs.append(gr.inputs.Checkbox(label="是否使用音素合成"))

# Single audio widget that displays the callback's result.
outputs = gr.outputs.Audio(label="合成音频")

# Wire the callback to the widgets and start the web UI.
interface = gr.Interface(
    fn=run_tts,
    inputs=inputs,
    outputs=outputs,
    title="中文文本转语音",
)
interface.launch()