Ritori committed on
Commit
b99e56b
1 Parent(s): 234ff35

Upload Yue_gradio_cpu.py

Browse files
Files changed (1) hide show
  1. Yue_gradio_cpu.py +245 -0
Yue_gradio_cpu.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #好用的
2
+
3
+ import os
4
# Runtime dependency bootstrap.
#
# The packages are installed when the script starts.  Every step is
# best-effort: a failing command is ignored and the next one still runs, and
# the steps execute in the order listed.
import subprocess
import sys


def _run_best_effort(command):
    """Run *command* (a list of arguments) and return its exit code.

    Returns None when the executable cannot be started at all, so a missing
    tool never aborts the bootstrap.
    """
    try:
        return subprocess.run(command, check=False).returncode
    except OSError:
        return None


def _pip(*args):
    """Invoke pip for the interpreter that is running this script."""
    return _run_best_effort([sys.executable, "-m", "pip", *args])


_pip("install", "-U", "tensorflow")
_pip("install", "-q", "unidecode", "tensorboardX")
_pip("install", "librosa==0.8.0")
_pip("install", "pysoundfile==0.9.0.post1")
_pip("install", "unidecode==1.3.4")
_pip("install", "pyopenjtalk", "--no-build-isolation")
_pip("install", "inflect==5.6.2")
_pip("install", "janome==0.4.2")
_pip("install", "tqdm", "-q")
_pip("install", "gdown")
_pip("install", "-q", "librosa", "unidecode")

_pip("install", "ipython")
_pip("install", "--upgrade", "jupyter", "ipywidgets")
_run_best_effort(["jupyter", "nbextension", "enable", "--py", "widgetsnbextension"])
# "-y" is required: without it pip asks for confirmation, which blocks (or
# aborts) when no terminal is attached.
_pip("uninstall", "-y", "tqdm")
_pip("install", "tqdm")
21
+
22
+ import time
23
+ import pyopenjtalk
24
+ import soundfile as sf
25
+ import gradio as gr
26
+ import torch
27
+ import IPython.display as ipd
28
+ import numpy as np
29
+ import torch
30
+ import json
31
+ from hparams import create_hparams
32
+ from model import Tacotron2
33
+ from layers import TacotronSTFT
34
+ from audio_processing import griffin_lim
35
+ from text import text_to_sequence
36
+ from env import AttrDict
37
+ from meldataset import MAX_WAV_VALUE
38
+ from models import Generator
39
+
40
+ #@title 配置并运行
41
+
42
+ #国际 HiFi-GAN 模型(有点机器音): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
43
+ #@markdown 你训练好的tacotron2模型的路径填在`Tacotron2_Model`这里
44
+ Tacotron2_Model = 'Yui_TrapGenesis'#@param {type:"string"}
45
+ TACOTRON2_ID = Tacotron2_Model
46
+ HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
47
+ #@markdown 选择预处理文本的cleaner
48
+ text_cleaner = 'japanese_phrase_cleaners'#@param {type:"string"}
49
+ import pyopenjtalk
50
+ import soundfile as sf
51
+ import gradio as gr
52
+
53
+ # 全局变量声明
54
+ model = None
55
+ hparams = None
56
+ hifigan = None
57
+ thisdict = None
58
+ pronounciation_dictionary = False
59
+ show_graphs = False # 添加show_graphs变量,并赋予默认值
60
+
61
+ # 初始化函数
62
def initialize():
    """Set up the models and publish them as module-level globals.

    Steps (each one advances the progress bar by one tick):
      1. clone the Tacotron2 and HiFi-GAN code repositories if they are missing;
      2. extend ``sys.path`` and import the model code;
      3. build the pronunciation dictionary ``thisdict``;
      4. download and load the HiFi-GAN vocoder into ``hifigan``;
      5. load the Tacotron2 checkpoint named by ``TACOTRON2_ID`` into ``model``
         and ``hparams``.

    The helpers ``ARPA`` and ``plot_data`` are published as globals as well,
    because ``end_to_end_infer`` calls them by name.  A second call is a no-op.

    Raises:
        Exception: if the HiFi-GAN download fails, the Tacotron2 checkpoint is
            missing, or the checkpoint is an MMI model.
    """
    global model, hparams, hifigan, thisdict, pronounciation_dictionary
    global ARPA, plot_data, initialized

    # Run the expensive setup only once per process.
    if globals().get("initialized"):
        return

    print("Setting up, please wait.\n")

    from tqdm.notebook import tqdm
    with tqdm(total=5, leave=False) as pbar:
        import os
        from os.path import exists, basename, splitext
        git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git'
        project_name = splitext(basename(git_repo_url))[0]
        # Clone each repository only when its directory is missing.
        if not exists(project_name):
            os.system(f'git clone -q --recursive {git_repo_url}')
        if not exists('hifi-gan'):
            os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')

        pbar.update(1)  # downloaded TT2 and HiFi-GAN
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import matplotlib.pylab as plt
        import gdown
        d = 'https://drive.google.com/uc?id='

        import torch
        import json
        from hparams import create_hparams
        from model import Tacotron2
        from env import AttrDict
        from models import Generator

        pbar.update(1)  # initialized dependencies

        graph_width = 900
        graph_height = 360

        def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
            """Show every 2-D array in *data* side by side as an image."""
            # squeeze=False keeps `axes` two-dimensional even for a single image.
            fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
            for axis, image in zip(axes[0], data):
                axis.imshow(image, aspect='auto', origin='upper',
                            interpolation='none', cmap='inferno')
            fig.canvas.draw()
            plt.show()

        # Set up the pronunciation dictionary.
        dict_path = 'merged.dict.txt'
        if not exists(dict_path):
            # wget would store a second copy as "merged.dict.txt.1" if the
            # file were already present, so only fetch it when missing.
            os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')
        thisdict = {}
        with open(dict_path, "r") as dict_file:
            dict_lines = dict_file.read().splitlines()
        # Walk the file backwards so the first occurrence of a word wins.
        for line in reversed(dict_lines):
            word, separator, pronunciation = line.partition(" ")
            if not separator:
                continue  # blank or malformed line: no pronunciation part
            thisdict[word] = pronunciation.strip()

        pbar.update(1)  # downloaded and set up the pronunciation dictionary

        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
            """Replace each word found in ``thisdict`` by its "{...}" entry.

            Trailing punctuation is detached before the lookup and re-attached
            afterwards.  With *EOS_Token* a ";" is appended when missing.
            """
            out = ''
            for token in text.split(" "):
                word = token
                end_chars = ''
                # Peel trailing punctuation, but always keep one character.
                while len(word) > 1 and word[-1] in punctuation:
                    end_chars = word[-1] + end_chars
                    word = word[:-1]
                word_arpa = thisdict.get(word.upper())
                if word_arpa is not None:
                    word = "{" + str(word_arpa) + "}"
                out = (out + " " + word + end_chars).strip()
            # endswith() is safe for an empty result, unlike out[-1].
            if EOS_Token and not out.endswith(";"):
                out += ";"
            return out

        def get_hifigan(MODEL_ID):
            """Download the HiFi-GAN checkpoint and return ``(generator, config)``."""
            hifigan_pretrained_model = 'hifimodel'
            gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
            if not exists(hifigan_pretrained_model):
                raise Exception("HiFI-GAN model failed to download!")

            conf = os.path.join("hifi-gan", "config_v1.json")
            with open(conf) as f:
                json_config = json.load(f)
            h = AttrDict(json_config)
            torch.manual_seed(h.seed)
            cpu = torch.device("cpu")
            generator = Generator(h).to(cpu)
            # NOTE(review): torch.load unpickles the downloaded file; only use
            # checkpoints from a trusted source.
            state_dict_g = torch.load(hifigan_pretrained_model, map_location=cpu)
            generator.load_state_dict(state_dict_g["generator"])
            generator.eval()
            generator.remove_weight_norm()
            return generator, h

        hifigan, _ = get_hifigan(HIFIGAN_ID)
        pbar.update(1)  # downloaded and set up HiFi-GAN

        def has_MMI(STATE_DICT):
            """Return True when any key of *STATE_DICT* contains "mi."."""
            return any("mi." in key for key in STATE_DICT)

        def get_Tactron2(MODEL_ID):
            """Load the Tacotron2 checkpoint at path *MODEL_ID*; return ``(model, hparams)``."""
            tacotron2_pretrained_model = MODEL_ID
            if not exists(tacotron2_pretrained_model):
                raise Exception("Tacotron2 model failed to download!")
            tt2_hparams = create_hparams()
            tt2_hparams.sampling_rate = 22050
            tt2_hparams.max_decoder_steps = 2000  # max duration
            tt2_hparams.gate_threshold = 0.80  # model must be 80% sure the clip is over before ending generation
            tt2_model = Tacotron2(tt2_hparams)
            # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
            state_dict = torch.load(tacotron2_pretrained_model,
                                    map_location=torch.device('cpu'))['state_dict']

            if has_MMI(state_dict):
                raise Exception("ERROR: This notebook does not currently support MMI models.")
            tt2_model.load_state_dict(state_dict)
            tt2_model.cpu().eval().float()
            return tt2_model, tt2_hparams

        model, hparams = get_Tactron2(TACOTRON2_ID)

        pbar.update(1)  # downloaded and set up Tacotron2

    initialized = True
191
+
192
+ # 初始化
193
+ initialize()
194
+
195
+ import soundfile as sf
196
+
197
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    """Synthesize every non-blank line of *text* and return the joined audio.

    Each line is synthesized on its own, written to a timestamped WAV file and
    displayed through IPython; the per-line waveforms are then concatenated
    in order so the caller receives the whole utterance.

    Args:
        text: input text, one utterance per line.
        pronounciation_dictionary: when true each line is passed through
            ``ARPA``; otherwise a trailing ";" is appended if it is missing.
        show_graphs: when true, plot the post-net mel output and the
            alignment of every line.

    Returns:
        A 1-D tensor holding all lines scaled by ``MAX_WAV_VALUE``, or None
        when *text* contains no non-blank line.
    """
    segments = []
    # splitlines() also handles "\r\n"; blank / whitespace-only lines are skipped.
    for line in (raw.strip() for raw in text.splitlines()):
        if not line:
            continue
        if pronounciation_dictionary:
            line = ARPA(line)
        elif not line.endswith(";"):
            line += ";"

        with torch.no_grad():
            sequence = np.array(text_to_sequence(line, [text_cleaner]))[None, :]
            sequence = torch.from_numpy(sequence).cpu().long()

            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze() * MAX_WAV_VALUE

        samples = audio.cpu().numpy().astype('int16')
        output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
        sf.write(output_filename, samples, hparams.sampling_rate)
        print(f"音频已保存为 {output_filename}")
        print("")
        ipd.display(ipd.Audio(samples, rate=hparams.sampling_rate))
        # reshape(-1) guarantees a 1-D tensor so the segments can be joined.
        segments.append(audio.reshape(-1))

    if not segments:
        return None
    return torch.cat(segments)
222
+
223
+ # 文本到语音转换函数
224
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):
    """Synthesize *text* and return the path of the WAV file that was written.

    Args:
        text: text to synthesize (one utterance per line).
        max_decoder_steps: upper bound on decoder steps; converted to int
            because UI sliders may deliver floats.
        gate_threshold: stop threshold for the decoder, converted to float.

    Returns:
        The name of the generated WAV file, or None when no audio was produced.
    """
    max_decoder_steps = int(max_decoder_steps)
    gate_threshold = float(gate_threshold)

    hparams.max_decoder_steps = max_decoder_steps
    hparams.gate_threshold = gate_threshold
    # NOTE(review): in the upstream Tacotron2 code the decoder copies these
    # two values when it is constructed, so updating `hparams` alone would not
    # affect an already-built model — confirm against the local model.py.
    decoder = getattr(model, "decoder", None)
    if decoder is not None:
        decoder.max_decoder_steps = max_decoder_steps
        decoder.gate_threshold = gate_threshold

    output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
    audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs)
    if audio is None:
        return None
    sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
    return output_filename
236
+
237
+ # Gradio界面
238
# Gradio interface: a text box plus two sliders in, a downloadable audio file out.
# Newer Gradio releases expose the components at the top level and use
# `value=` for the initial slider position; older releases only offer the
# `gr.inputs` / `gr.outputs` namespaces with `default=`.  Support both.
if hasattr(gr, "Textbox"):
    inputs = [
        gr.Textbox(lines=3, label="输入文本"),
        gr.Slider(minimum=100, maximum=5000, value=2000, step=100, label="最大解码步数"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="门控阈值"),
    ]
    outputs = gr.File(label="下载生成的音频")
else:
    inputs = [
        gr.inputs.Textbox(lines=3, label="输入文本"),
        gr.inputs.Slider(minimum=100, maximum=5000, default=2000, step=100, label="最大解码步数"),
        gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.5, step=0.05, label="门控阈值"),
    ]
    outputs = gr.outputs.File(label="下载生成的音频")

gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True)