chilge committed
Commit 72aa6e6
1 Parent(s): e708b8b

Update app.py

Files changed (1)
app.py +38 -79
app.py CHANGED
@@ -1,51 +1,46 @@
 import gradio as gr
+import matplotlib.pyplot as plt
+import IPython.display as ipd
+import os
+import json
+import math
 import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
 import commons
 import utils
-import os
+from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
 from models import SynthesizerTrn
 from text.symbols import symbols
 from text import text_to_sequence
+from scipy.io.wavfile import write
 import numpy as np
 
+# Load the emotion dictionary
+emotion_dict = json.load(open("configs/leo.json", "r"))
 
-def get_text(text, hps):
-    text_norm = text_to_sequence(text, hps.data.text_cleaners)
-    if hps.data.add_blank:
-        text_norm = commons.intersperse(text_norm, 0)
-    text_norm = torch.LongTensor(text_norm)
-    return text_norm
-hps = utils.get_hparams_from_file("./configs/vtubers.json")
-net_g = SynthesizerTrn(
-    len(symbols),
-    hps.data.filter_length // 2 + 1,
-    hps.train.segment_size // hps.data.hop_length,
-    n_speakers=hps.data.n_speakers,
-    **hps.model)
+# Load the pretrained model
+hps = utils.get_hparams_from_file("./configs/leo.json")
+net_g = SynthesizerTrn(len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, **hps.model)
 _ = net_g.eval()
+_ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)
 
-_ = utils.load_checkpoint("./nene_final.pth", net_g, None)
-file_names = os.listdir("./wavs")
-
-# Build the emotion dict
-emotion = {}
-for file_name in file_names:
-    emotion_name = os.path.splitext(file_name)[0]
-    emotion[emotion_name] = os.path.join("./wavs", file_name)
-import random
-def tts(txt, emotion):
+# Define the text-to-speech function
+def tts(txt, emotion, roma=False, length_scale=1):
     if roma:
         stn_tst = get_text_byroma(txt, hps)
     else:
         stn_tst = get_text(txt, hps)
-    randsample = None
+
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
         sid = torch.LongTensor([0])
-        if os.path.exists(f"{emotion}.emo.npy"):
-            emo = torch.FloatTensor(np.load(f"{emotion}.emo.npy")).unsqueeze(0)
-        elif emotion == "random_sample":
+
+        if emotion == "random_sample":
+            # Randomly pick an emotion reference audio
+            random_emotion_root = "wavs"
             while True:
                 rand_wav = random.sample(os.listdir(random_emotion_root), 1)[0]
                 if rand_wav.endswith('wav') and os.path.exists(f"{random_emotion_root}/{rand_wav}.emo.npy"):
@@ -53,63 +48,27 @@ def tts(txt, emotion):
             emo = torch.FloatTensor(np.load(f"{random_emotion_root}/{rand_wav}.emo.npy")).unsqueeze(0)
             print(f"{random_emotion_root}/{rand_wav}")
         elif emotion.endswith("wav"):
+            # Extract emotion features from the provided audio
            import emotion_extract
             emo = torch.FloatTensor(emotion_extract.extract_wav(emotion))
         else:
             print("emotion参数不正确")
+
+        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2, emo=emo)[0][0, 0].data.float().numpy()
+
+        ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
 
-        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1, emo=emo)[0][0,0].data.float().numpy()
-    return audio, randsample
-
-
-def tts1(text, emotion):
-    if len(text) > 150:
-        return "Error: Text is too long", None
-    audio, _ = tts(text, emotion)
-    return "Success", (hps.data.sampling_rate, audio)
-
-def tts2(text):
-    if len(text) > 150:
-        return "Error: Text is too long", None
-    audio, randsample = tts(text, "random_sample")
-
-    return str(randsample), (hps.data.sampling_rate, audio)
-
-def tts3(text, emotion):
-    if len(text) > 150:
-        return "Error: Text is too long", None
-    try:
-        audio, _ = tts(text, int(sample))
-        return "Success", (hps.data.sampling_rate, audio)
-    except:
-        return "输入参数不为整数或其他错误", None
-app = gr.Blocks()
-with app:
-    with gr.Tabs():
-        # with gr.TabItem("使用预制情感合成"):
-        #     tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
-        #     tts_input2 = gr.Dropdown(label="情感", choices=list(emotion_dict.keys()), value="平静1")
-        #     tts_submit = gr.Button("合成音频", variant="primary")
-        #     tts_output1 = gr.Textbox(label="Message")
-        #     tts_output2 = gr.Audio(label="Output")
-        #     tts_submit.click(tts1, [tts_input1, tts_input2], [tts_output1, tts_output2])
-        # with gr.TabItem("随机抽取训练集样本作为情感参数"):
-        #     tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
-        #     tts_submit = gr.Button("合成音频", variant="primary")
-        #     tts_output1 = gr.Textbox(label="随机样本id(可用于第三个tab中合成)")
-        #     tts_output2 = gr.Audio(label="Output")
-        #     tts_submit.click(tts2, [tts_input1], [tts_output1, tts_output2])
+# Build the GUI
+def run_tts(text, emotion, roma=False):
+    tts(text, emotion, roma)
 
-        with gr.TabItem("使用指定情感样本作为情感参数"):
+inputs = [
+    gr.inputs.Textbox(label="请输入文本"),
+    gr.inputs.Textbox(label="请输入参考音频路径或选择'random_sample'随机选择"),
+    gr.inputs.Checkbox(label="是否使用音素合成")
+]
 
-            tts_input1 = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
-            tts_input2 = gr.TextArea(label="情感样本名字", value="")
-            tts_submit = gr.Button("合成音频", variant="primary")
-            tts_output1 = gr.Textbox(label="Message")
-            tts_output2 = gr.Audio(label="Output")
-            tts_submit.click(tts3, [tts_input1, tts_input2], [tts_output1, tts_output2])
+outputs = gr.outputs.Audio(label="合成音频")
 
-        with gr.TabItem("使用参考音频作为情感参数"):
-            tts_input1 = gr.TextArea(label="text", value="暂未实现")
+interface = gr.Interface(fn=run_tts, inputs=inputs, outputs=outputs, title="中文文本转
 
-app.launch()
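Note: as committed, the new app.py still calls get_text / get_text_byroma and random.sample, but this update deletes the get_text definition and the import random statement without replacing them (import os is re-added at the top; import random is not). A minimal sketch of what would have to be restored for the new version to run, with get_text taken from the old version above; get_text_byroma is assumed to be defined elsewhere in the repo:

import random

import torch
import commons
from text import text_to_sequence

# Restored from the old app.py: convert raw text into a LongTensor of symbol ids
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # interleave blank tokens between symbols when the config requests it
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)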
 
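The Interface wiring also looks inconsistent: outputs is a gr.outputs.Audio component, but run_tts returns nothing, and tts only plays the waveform through IPython.display instead of returning it. A sketch of a wrapper that would satisfy the Audio output, assuming tts is changed to return the numpy waveform it synthesizes:

def run_tts(text, emotion, roma=False):
    # Gradio Audio outputs accept a (sample_rate, numpy_waveform) tuple
    audio = tts(text, emotion, roma)  # assumes tts() now returns the waveform
    return hps.data.sampling_rate, audio

An app built this way is normally started with interface.launch(), mirroring the app.launch() call removed from the old version.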