Upload Yue_gradio_cpu.py
Yue_gradio_cpu.py +245 -0
Yue_gradio_cpu.py
ADDED
@@ -0,0 +1,245 @@
# This version works well

import os
os.system('pip install -U tensorflow')
os.system('pip install -q unidecode tensorboardX')
os.system('pip install librosa==0.8.0')
os.system('pip install pysoundfile==0.9.0.post1')
os.system('pip install unidecode==1.3.4')
os.system('pip install pyopenjtalk --no-build-isolation')
os.system('pip install inflect==5.6.2')
os.system('pip install janome==0.4.2')
os.system('pip install tqdm -q')
os.system('pip install gdown')
os.system('pip install -q librosa unidecode')

os.system('pip install ipython')
os.system('pip install --upgrade jupyter ipywidgets')
os.system('jupyter nbextension enable --py widgetsnbextension')
os.system('pip uninstall -y tqdm')  # -y: without it pip blocks on a confirmation prompt
os.system('pip install tqdm')

import time
import pyopenjtalk
import soundfile as sf
import gradio as gr
import torch
import IPython.display as ipd
import numpy as np
import json
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from models import Generator

#@title Configure and run

# Universal HiFi-GAN model (slightly robotic): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
#@markdown Put the path of your trained Tacotron2 model in `Tacotron2_Model`
Tacotron2_Model = 'Yui_TrapGenesis'  #@param {type:"string"}
TACOTRON2_ID = Tacotron2_Model
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
#@markdown Choose the cleaner used to preprocess the text
text_cleaner = 'japanese_phrase_cleaners'  #@param {type:"string"}
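
# Note: the "#@param" / "#@markdown" markers above are Colab form annotations;
# outside Colab they are ordinary comments with no effect.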

# Global variables
model = None
hparams = None
hifigan = None
thisdict = None
pronounciation_dictionary = False
show_graphs = False  # default: do not plot spectrograms/alignments

# One-time setup
def initialize():
    # ARPA and plot_data are defined inside this function but called at module
    # level later, so they are declared global along with the model state.
    global model, hparams, hifigan, thisdict, pronounciation_dictionary
    global initialized, ARPA, plot_data

    # Skip setup if it has already run in this process
    try:
        initialized
    except NameError:
        print("Setting up, please wait.\n")

        from tqdm.notebook import tqdm
        with tqdm(total=5, leave=False) as pbar:
            import os
            from os.path import exists, join, basename, splitext
            git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git'
            project_name = splitext(basename(git_repo_url))[0]
            if not exists(project_name):
                # clone and install (this must be an f-string, otherwise the
                # literal text "{git_repo_url}" is passed to git)
                os.system(f'git clone -q --recursive {git_repo_url}')
                os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')

            pbar.update(1)  # downloaded TT2 and HiFi-GAN
            import sys
            sys.path.append('hifi-gan')
            sys.path.append(project_name)
            import time
            import matplotlib
            import matplotlib.pylab as plt
            import gdown
            d = 'https://drive.google.com/uc?id='

            # %matplotlib inline
            import IPython.display as ipd
            import numpy as np
            import torch
            import json
            from hparams import create_hparams
            from model import Tacotron2
            from layers import TacotronSTFT
            from audio_processing import griffin_lim
            from text import text_to_sequence
            from env import AttrDict
            from meldataset import MAX_WAV_VALUE
            from models import Generator

            pbar.update(1)  # initialized dependencies

            graph_width = 900
            graph_height = 360
            def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
                # %matplotlib inline
                fig, axes = plt.subplots(1, len(data), figsize=figsize)
                for i in range(len(data)):
                    axes[i].imshow(data[i], aspect='auto', origin='upper',
                                   interpolation='none', cmap='inferno')
                fig.canvas.draw()
                plt.show()
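
            # At the call site below, the two panels plot_data draws are the
            # post-net mel spectrogram and the transposed attention alignment.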

            # Set up the pronunciation dictionary
            os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')
            thisdict = {}
            for line in reversed(open('merged.dict.txt', 'r').read().splitlines()):
                word, pron = line.split(' ', 1)
                thisdict[word] = pron.strip()

            pbar.update(1)  # downloaded and set up the pronunciation dictionary
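
            # Assumed dictionary format: one "WORD PHONEMES" pair per line,
            # e.g. "HELLO HH AH0 L OW1"; the key is the text before the first
            # space and the value is the remainder.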

            def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
                out = ''
                for word_ in text.split(" "):
                    word = word_
                    end_chars = ''
                    # Peel trailing punctuation off the word before lookup
                    while any(elem in word for elem in punctuation) and len(word) > 1:
                        if word[-1] in punctuation:
                            end_chars = word[-1] + end_chars
                            word = word[:-1]
                        else:
                            break
                    try:
                        word_arpa = thisdict[word.upper()]
                        word = "{" + str(word_arpa) + "}"
                    except KeyError:
                        pass  # unknown words pass through unchanged
                    out = (out + " " + word + end_chars).strip()
                if EOS_Token and out[-1] != ";":
                    out += ";"
                return out
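
            # Example behaviour (hypothetical dictionary entry): if thisdict
            # maps "WORLD" to "W ER1 L D", then ARPA("hello world.") returns
            # 'hello {W ER1 L D}.;' (braces mark ARPAbet spans for the cleaner).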

            def get_hifigan(MODEL_ID):
                # Download HiFi-GAN
                hifigan_pretrained_model = 'hifimodel'
                gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)
                if not exists(hifigan_pretrained_model):
                    raise Exception("HiFi-GAN model failed to download!")

                # Load HiFi-GAN
                conf = os.path.join("hifi-gan", "config_v1.json")
                with open(conf) as f:
                    json_config = json.loads(f.read())
                h = AttrDict(json_config)
                torch.manual_seed(h.seed)
                hifigan = Generator(h).to(torch.device("cpu"))
                state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cpu"))
                hifigan.load_state_dict(state_dict_g["generator"])
                hifigan.eval()
                hifigan.remove_weight_norm()
                return hifigan, h

            hifigan, h = get_hifigan(HIFIGAN_ID)
            pbar.update(1)  # downloaded and set up HiFi-GAN
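
            # Shape sketch (assuming the usual config_v1.json values, 80 mel
            # bins and hop size 256): a mel tensor [1, 80, T] goes in and a
            # waveform tensor [1, 1, T * 256] comes out.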

            def has_MMI(STATE_DICT):
                return any("mi." in x for x in STATE_DICT.keys())

            def get_Tacotron2(MODEL_ID):
                # Locate the Tacotron2 checkpoint (a local path, not a download)
                tacotron2_pretrained_model = TACOTRON2_ID
                if not exists(tacotron2_pretrained_model):
                    raise Exception("Tacotron2 model not found!")
                # Load Tacotron2 and config
                hparams = create_hparams()
                hparams.sampling_rate = 22050
                hparams.max_decoder_steps = 2000  # max duration
                hparams.gate_threshold = 0.80  # model must be 80% sure the clip is over before ending generation
                model = Tacotron2(hparams)
                state_dict = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))['state_dict']

                if has_MMI(state_dict):
                    raise Exception("ERROR: This notebook does not currently support MMI models.")
                model.load_state_dict(state_dict)
                _ = model.cpu().eval().float()
                return model, hparams

            model, hparams = get_Tacotron2(TACOTRON2_ID)
            previous_tt2_id = TACOTRON2_ID

            pbar.update(1)  # downloaded and set up Tacotron2
        initialized = True  # mark setup as done so later calls are no-ops

# Initialize
initialize()

def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    audio = None  # holds the waveform of the most recent line
    for i in [x for x in text.split("\n") if len(x)]:
        if not pronounciation_dictionary:
            if i[-1] != ";":
                i = i + ";"
        else:
            i = ARPA(i)
        with torch.no_grad():
            sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :]
            # torch.autograd.Variable is deprecated; a plain LongTensor works
            sequence = torch.from_numpy(sequence).cpu().long()

            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
            sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
            print(f"Audio saved as {output_filename}")
            print("")
            ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
    return audio  # return the audio data
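
# A minimal sketch of driving the pipeline without the UI (the string is a
# placeholder; anything the selected cleaner accepts works):
# wave = end_to_end_infer("こんにちは。", pronounciation_dictionary, show_graphs)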

# Text-to-speech entry point for the Gradio UI
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):
    global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs

    hparams.max_decoder_steps = max_decoder_steps
    hparams.gate_threshold = gate_threshold
    output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
    audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs)
    if audio is not None:
        # end_to_end_infer already wrote a timestamped file; this writes the
        # copy that is handed back to Gradio for download
        sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
        return output_filename
    else:
        return None
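
# Example of a direct call (hypothetical input): text_to_speech("テスト", 1000, 0.5)
# returns the path of the written .wav file, or None if nothing was generated.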

# Gradio UI. The gr.inputs/gr.outputs namespaces are deprecated and removed in
# current Gradio; components are top-level, and Slider takes value= instead of default=.
inputs = [
    gr.Textbox(lines=3, label="Input text"),
    gr.Slider(minimum=100, maximum=5000, value=2000, step=100, label="Max decoder steps"),
    gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Gate threshold")
]
outputs = gr.File(label="Download generated audio")

gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True)
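
# If remote access is needed, launch(share=True) additionally creates a
# temporary public URL (a standard Gradio option).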