Spaces:
Build error
Build error
File size: 2,744 Bytes
5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 3980d4c a15140f 72aa6e6 5a39f1e 72aa6e6 f6fdc84 5a39f1e 72aa6e6 5a39f1e f6fdc84 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 72aa6e6 5a39f1e 5fed7f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import json
import math
import os
import random

import gradio as gr
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.io.wavfile import write
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols
# Load the emotion dictionary.
# NOTE(review): this reads the same JSON file as the hyper-parameters below and
# emotion_dict is not referenced anywhere in the visible code — confirm the
# intended path.
# The context manager closes the file handle deterministically; JSON is read
# as UTF-8 so the result does not depend on the platform's locale encoding.
with open("configs/leo.json", "r", encoding="utf-8") as config_file:
    emotion_dict = json.load(config_file)

# Load the hyper-parameters and build the synthesizer from them.
hps = utils.get_hparams_from_file("./configs/leo.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
# Switch to evaluation mode, then restore the pretrained generator weights
# (no optimizer is passed to the checkpoint loader).
_ = net_g.eval()
_ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)
# Text-to-speech function and the Gradio callback that wraps it.
def _load_emotion_embedding(emotion):
    """Return the emotion embedding tensor selected by *emotion*.

    Args:
        emotion: Either the literal string ``"random_sample"`` or a path
            ending in ``"wav"`` that points at a reference recording.

    Returns:
        A ``torch.FloatTensor`` that is passed to ``net_g.infer`` as ``emo``.

    Raises:
        FileNotFoundError: ``"random_sample"`` was requested but the ``wavs``
            directory holds no ``*wav`` file with a ``.emo.npy`` sidecar.
        ValueError: *emotion* is neither ``"random_sample"`` nor a wav path.
    """
    if emotion == "random_sample":
        # Pick one reference recording at random among those that already
        # have a precomputed "<name>.emo.npy" embedding next to them.
        random_emotion_root = "wavs"
        candidates = [
            name
            for name in os.listdir(random_emotion_root)
            if name.endswith('wav')
            and os.path.exists(f"{random_emotion_root}/{name}.emo.npy")
        ]
        if not candidates:
            # Fail loudly instead of resampling forever.
            raise FileNotFoundError(
                f"no wav file with a matching .emo.npy found in {random_emotion_root!r}"
            )
        rand_wav = random.choice(candidates)
        emo = torch.FloatTensor(
            np.load(f"{random_emotion_root}/{rand_wav}.emo.npy")
        ).unsqueeze(0)
        print(f"{random_emotion_root}/{rand_wav}")
        return emo

    if isinstance(emotion, str) and emotion.endswith("wav"):
        # Extract the emotion features from the supplied recording.
        # Imported lazily so the extractor is only loaded when it is needed.
        import emotion_extract

        # NOTE(review): no unsqueeze here, unlike the .npy path — presumably
        # extract_wav already returns a batched array; confirm.
        return torch.FloatTensor(emotion_extract.extract_wav(emotion))

    raise ValueError("emotion参数不正确")


def tts(txt, emotion, roma=False, length_scale=1):
    """Synthesize speech for *txt* using the emotion reference *emotion*.

    Args:
        txt: Text to synthesize.
        emotion: ``"random_sample"`` or a path ending in ``"wav"``; see
            ``_load_emotion_embedding``.
        roma: When true the text is converted with ``get_text_byroma``,
            otherwise with ``get_text``.
        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.

    Returns:
        The synthesized waveform as a float NumPy array. The audio is also
        shown through ``IPython.display``.

    Raises:
        FileNotFoundError: No usable random emotion reference exists.
        ValueError: *emotion* is not a supported value.
    """
    with torch.no_grad():
        # Resolve the emotion first so an invalid argument fails before any
        # text processing or inference work is done.
        emo = _load_emotion_embedding(emotion)

        # NOTE(review): get_text / get_text_byroma are not defined or imported
        # in the visible part of this file — confirm where they come from.
        stn_tst = get_text_byroma(txt, hps) if roma else get_text(txt, hps)

        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])
        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=length_scale,
            emo=emo,
        )[0][0, 0].data.float().numpy()

    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    return audio


# Gradio callback.
def run_tts(text, emotion, roma=False):
    """Run ``tts`` and return ``(sampling_rate, waveform)`` for the Audio output."""
    audio = tts(text, emotion, roma)
    return hps.data.sampling_rate, audio
# Input widgets, listed in the positional order run_tts receives them:
# text, emotion reference, romanized-input flag.
# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio namespaces —
# confirm they exist in the Gradio version this app is pinned to.
_text_box = gr.inputs.Textbox(label="请输入文本")
_emotion_box = gr.inputs.Textbox(label="请输入参考音频路径或选择'random_sample'随机选择")
_roma_box = gr.inputs.Checkbox(label="是否使用音素合成")
inputs = [_text_box, _emotion_box, _roma_box]

# Single audio player for the synthesized result.
outputs = gr.outputs.Audio(label="合成音频")

# Wire the callback to the widgets and start the web UI.
interface = gr.Interface(
    fn=run_tts,
    inputs=inputs,
    outputs=outputs,
    title="中文文本转语音",
)
interface.launch()
|