PhoenixStormJr committed on
Commit
8967934
·
verified ·
1 Parent(s): fe95d07

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +322 -0
app.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import torch
4
+
5
+ # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
6
+ import gradio as gr
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile
10
+ import logging
11
+ from fairseq import checkpoint_utils
12
+ from my_utils import load_audio
13
+ from vc_infer_pipeline import VC
14
+ import traceback
15
+ from config import Config
16
+ from infer_pack.models import (
17
+ SynthesizerTrnMs256NSFsid,
18
+ SynthesizerTrnMs256NSFsid_nono,
19
+ SynthesizerTrnMs768NSFsid,
20
+ SynthesizerTrnMs768NSFsid_nono,
21
+ )
22
+ from i18n import I18nAuto
23
# Silence chatty third-party loggers so app output stays readable.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

i18n = I18nAuto()
i18n.print()

config = Config()

# Directory layout expected next to this script.
weight_root = "weights"
weight_uvr5_root = "uvr5_weights"
index_root = "logs"
hubert_model = None  # lazily loaded by load_hubert()

# Every .pth file under weights/ is a selectable voice model.
names = [fname for fname in os.listdir(weight_root) if fname.endswith(".pth")]

# Collect feature-index files, skipping intermediate "trained" indexes
# (only the final "added" indexes are usable for retrieval).
index_paths = []
for root, _dirs, files in os.walk(index_root, topdown=False):
    index_paths.extend(
        "%s/%s" % (root, fname)
        for fname in files
        if fname.endswith(".index") and "trained" not in fname
    )
def get_vc(sid):
    """Load (or unload) the voice-conversion model selected in the UI.

    Args:
        sid: the ``.pth`` filename chosen in the dropdown, or ""/[] to
            unload the current model.

    Returns:
        A gradio update dict for the speaker-id slider: hidden when no
        model is selected, otherwise visible with ``maximum`` set to the
        model's speaker count.

    Side effects: (re)binds the module globals ``n_spk``, ``tgt_sr``,
    ``net_g``, ``vc``, ``cpt``, ``version`` and may clear ``hubert_model``.
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version
    if sid == "" or sid == []:
        global hubert_model
        # The UI may poll and switch from a loaded model to no model;
        # only clean up when something was actually loaded.
        if hubert_model is not None:
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            net_g = n_spk = vc = hubert_model = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Without re-instantiating the net and deleting it once more,
            # CUDA memory is not fully released (upstream workaround).
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cpt = None
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    # enc_q is only used during training; drop it before loading weights.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return {"visible": True, "maximum": n_spk, "__type__": "update"}
def load_hubert():
    """Load the HuBERT feature extractor from ``hubert_base.pt``.

    Binds the module global ``hubert_model``, moved to ``config.device``,
    cast to half or float per ``config.is_half``, and set to eval mode.
    """
    global hubert_model
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    model = models[0].to(config.device)
    # Precision must match the synthesizer's, controlled by config.is_half.
    hubert_model = model.half() if config.is_half else model.float()
    hubert_model.eval()
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
    """Convert one uploaded audio clip with the currently loaded model.

    Args:
        sid: speaker id within the loaded model.
        input_audio_path: gradio audio value ``(sample_rate, int16 ndarray)``
            or None when nothing was uploaded.
        f0_up_key: pitch shift in semitones.
        f0_file: optional F0 curve file overriding pitch extraction.
        f0_method: "pm" | "harvest" | "crepe".
        file_index / file_index2: manual / auto-detected feature-index path.
        index_rate, filter_radius, resample_sr, rms_mix_rate, protect:
            pipeline tuning knobs forwarded to ``vc.pipeline``.

    Returns:
        ``(info_message, (sample_rate, audio))`` on success, or
        ``(error_text, (None, None))`` on failure.
    """
    global tgt_sr, net_g, vc, hubert_model, version
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        # Gradio delivers int16 PCM; normalize to [-1, 1] floats.
        audio = input_audio_path[1] / 32768.0
        if len(audio.shape) == 2:
            audio = np.mean(audio, -1)  # downmix stereo to mono
        audio = librosa.resample(audio, orig_sr=input_audio_path[0], target_sr=16000)
        # Guard against clipping: keep peak at 0.95 max.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]  # npy / f0 / inference timing slots filled by pipeline
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Users often paste a "trained" index path by mistake; strip quotes
        # and swap in the usable "added" index automatically.
        file_index = (
            (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
            if file_index != ""
            else file_index2
        )
        # file_big_npy = (
        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        # )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            f0_file=f0_file,
        )
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            tgt_sr = resample_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (tgt_sr, audio_opt)
    except Exception:
        # Surface the traceback in the UI instead of crashing the app.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
# --- Gradio UI: single-tab online demo for RVC voice conversion -------------
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("在线demo"):
            gr.Markdown(
                value="""
                RVC 在线demo
                """
            )
            # Voice-model selector: one entry per .pth found in weights/.
            sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
            with gr.Column():
                # Speaker id inside the selected model; hidden until a
                # model is loaded (get_vc reveals it and sets its maximum).
                spk_item = gr.Slider(
                    minimum=0,
                    maximum=2333,
                    step=1,
                    label=i18n("请选择说话人id"),
                    value=0,
                    visible=False,
                    interactive=True,
                )
            # Choosing a model (or clearing the choice) loads/unloads it.
            sid.change(
                fn=get_vc,
                inputs=[sid],
                outputs=[spk_item],
            )
            gr.Markdown(
                value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
            )
            # Source audio upload (suggested length under 90 seconds).
            vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
            # Pitch shift in semitones (+12 = up one octave, -12 = down one).
            vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
            # Pitch-extraction algorithm selector.
            f0method0 = gr.Radio(
                label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
                choices=["pm", "harvest", "crepe"],
                value="pm",
                interactive=True,
            )
            # Median-filter radius for harvest F0 results (>=3 enables it).
            filter_radius0 = gr.Slider(
                minimum=0,
                maximum=7,
                label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
                value=3,
                step=1,
                interactive=True,
            )
            with gr.Column():
                # Manual index path; kept hidden — the dropdown below is
                # the user-facing way to pick an index.
                file_index1 = gr.Textbox(
                    label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                    value="",
                    interactive=False,
                    visible=False,
                )
                # Auto-detected .index files found under logs/ at startup.
                file_index2 = gr.Dropdown(
                    label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                    choices=sorted(index_paths),
                    interactive=True,
                )
                # Blend ratio between retrieved features and raw features.
                index_rate1 = gr.Slider(
                    minimum=0,
                    maximum=1,
                    label=i18n("检索特征占比"),
                    value=0.88,
                    interactive=True,
                )
            # Optional post-resample of the output (0 = keep model rate).
            resample_sr0 = gr.Slider(
                minimum=0,
                maximum=48000,
                label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
                value=0,
                step=1,
                interactive=True,
            )
            # Volume-envelope mix: 1 = use output envelope entirely.
            rms_mix_rate0 = gr.Slider(
                minimum=0,
                maximum=1,
                label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
                value=1,
                interactive=True,
            )
            # Consonant/breath protection; 0.5 disables the protection.
            protect0 = gr.Slider(
                minimum=0,
                maximum=0.5,
                label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
                value=0.33,
                step=0.01,
                interactive=True,
            )
            # Optional F0-curve file overriding pitch extraction/transpose.
            f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
            but0 = gr.Button(i18n("转换"), variant="primary")
            vc_output1 = gr.Textbox(label=i18n("输出信息"))
            vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
            # Wire the convert button to vc_single; argument order must
            # match vc_single's signature exactly.
            but0.click(
                vc_single,
                [
                    spk_item,
                    vc_input3,
                    vc_transform0,
                    f0_file,
                    f0method0,
                    file_index1,
                    file_index2,
                    # file_big_npy1,
                    index_rate1,
                    filter_radius0,
                    resample_sr0,
                    rms_mix_rate0,
                    protect0,
                ],
                [vc_output1, vc_output2],
            )


app.launch()