Spaces:
Build error
Build error
Ilzhabimantara
commited on
Commit
·
5cc9b3a
1
Parent(s):
98d555d
Update app.py
Browse files
app.py
CHANGED
@@ -34,14 +34,17 @@ f0method_mode = []
|
|
34 |
f0method_info = ""
|
35 |
if limitation is True:
|
36 |
audio_mode = ["Upload audio", "TTS Audio"]
|
37 |
-
f0method_mode = ["pm", "harvest"]
|
38 |
-
f0method_info = "PM is fast,
|
39 |
else:
|
40 |
-
audio_mode = ["
|
41 |
-
f0method_mode = ["pm", "
|
42 |
-
f0method_info = "PM is fast,
|
43 |
|
44 |
-
|
|
|
|
|
|
|
45 |
def vc_fn(
|
46 |
vc_audio_mode,
|
47 |
vc_input,
|
@@ -57,7 +60,6 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
57 |
protect,
|
58 |
):
|
59 |
try:
|
60 |
-
print(f"Converting using {model_name}...")
|
61 |
if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
|
62 |
audio, sr = librosa.load(vc_input, sr=16000, mono=True)
|
63 |
elif vc_audio_mode == "Upload audio":
|
@@ -65,15 +67,15 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
65 |
return "You need to upload an audio", None
|
66 |
sampling_rate, audio = vc_upload
|
67 |
duration = audio.shape[0] / sampling_rate
|
68 |
-
if duration >
|
69 |
-
return "Please upload an audio file that is less than
|
70 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
71 |
if len(audio.shape) > 1:
|
72 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
73 |
if sampling_rate != 16000:
|
74 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
75 |
elif vc_audio_mode == "TTS Audio":
|
76 |
-
if len(tts_text) >
|
77 |
return "Text is too long", None
|
78 |
if tts_text is None or tts_voice is None:
|
79 |
return "You need to enter text and select a voice", None
|
@@ -104,12 +106,12 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
104 |
f0_file=None,
|
105 |
)
|
106 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
107 |
-
print(f"{
|
108 |
return info, (tgt_sr, audio_opt)
|
109 |
except:
|
110 |
info = traceback.format_exc()
|
111 |
print(info)
|
112 |
-
return info, None
|
113 |
return vc_fn
|
114 |
|
115 |
def load_model():
|
@@ -121,7 +123,6 @@ def load_model():
|
|
121 |
continue
|
122 |
category_title = category_info['title']
|
123 |
category_folder = category_info['folder_path']
|
124 |
-
description = category_info['description']
|
125 |
models = []
|
126 |
with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
|
127 |
models_info = json.load(f)
|
@@ -159,8 +160,8 @@ def load_model():
|
|
159 |
net_g = net_g.float()
|
160 |
vc = VC(tgt_sr, config)
|
161 |
print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
|
162 |
-
models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(
|
163 |
-
categories.append([category_title, category_folder,
|
164 |
return categories
|
165 |
|
166 |
def cut_vocal_and_inst(url, audio_provider, split_model):
|
@@ -169,17 +170,25 @@ def cut_vocal_and_inst(url, audio_provider, split_model):
|
|
169 |
os.mkdir("dl_audio")
|
170 |
if audio_provider == "Youtube":
|
171 |
ydl_opts = {
|
172 |
-
|
173 |
-
|
174 |
-
'
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
"outtmpl": 'dl_audio/youtube_audio',
|
179 |
}
|
180 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
181 |
ydl.download([url])
|
182 |
audio_path = "dl_audio/youtube_audio.wav"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
if split_model == "htdemucs":
|
184 |
command = f"demucs --two-stems=vocals {audio_path} -o output"
|
185 |
result = subprocess.run(command.split(), stdout=subprocess.PIPE)
|
@@ -232,7 +241,6 @@ def change_audio_mode(vc_audio_mode):
|
|
232 |
return (
|
233 |
# Input & Upload
|
234 |
gr.Textbox.update(visible=True),
|
235 |
-
gr.Checkbox.update(visible=False),
|
236 |
gr.Audio.update(visible=False),
|
237 |
# Youtube
|
238 |
gr.Dropdown.update(visible=False),
|
@@ -253,7 +261,6 @@ def change_audio_mode(vc_audio_mode):
|
|
253 |
return (
|
254 |
# Input & Upload
|
255 |
gr.Textbox.update(visible=False),
|
256 |
-
gr.Checkbox.update(visible=True),
|
257 |
gr.Audio.update(visible=True),
|
258 |
# Youtube
|
259 |
gr.Dropdown.update(visible=False),
|
@@ -274,7 +281,6 @@ def change_audio_mode(vc_audio_mode):
|
|
274 |
return (
|
275 |
# Input & Upload
|
276 |
gr.Textbox.update(visible=False),
|
277 |
-
gr.Checkbox.update(visible=False),
|
278 |
gr.Audio.update(visible=False),
|
279 |
# Youtube
|
280 |
gr.Dropdown.update(visible=True),
|
@@ -295,7 +301,6 @@ def change_audio_mode(vc_audio_mode):
|
|
295 |
return (
|
296 |
# Input & Upload
|
297 |
gr.Textbox.update(visible=False),
|
298 |
-
gr.Checkbox.update(visible=False),
|
299 |
gr.Audio.update(visible=False),
|
300 |
# Youtube
|
301 |
gr.Dropdown.update(visible=False),
|
@@ -316,7 +321,6 @@ def change_audio_mode(vc_audio_mode):
|
|
316 |
return (
|
317 |
# Input & Upload
|
318 |
gr.Textbox.update(visible=False),
|
319 |
-
gr.Checkbox.update(visible=True),
|
320 |
gr.Audio.update(visible=True),
|
321 |
# Youtube
|
322 |
gr.Dropdown.update(visible=False),
|
@@ -334,28 +338,20 @@ def change_audio_mode(vc_audio_mode):
|
|
334 |
gr.Dropdown.update(visible=False)
|
335 |
)
|
336 |
|
337 |
-
def use_microphone(microphone):
|
338 |
-
if microphone == True:
|
339 |
-
return gr.Audio.update(source="microphone")
|
340 |
-
else:
|
341 |
-
return gr.Audio.update(source="upload")
|
342 |
-
|
343 |
if __name__ == '__main__':
|
344 |
load_hubert()
|
345 |
categories = load_model()
|
346 |
tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
|
347 |
voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
|
348 |
-
with gr.Blocks() as app:
|
349 |
gr.Markdown(
|
350 |
-
"<
|
351 |
-
"
|
352 |
-
"[![
|
353 |
-
"
|
354 |
)
|
355 |
-
for (folder_title, folder,
|
356 |
with gr.TabItem(folder_title):
|
357 |
-
if description:
|
358 |
-
gr.Markdown(f"### <center> {description}")
|
359 |
with gr.Tabs():
|
360 |
if not models:
|
361 |
gr.Markdown("# <center> No Model Loaded.")
|
@@ -375,11 +371,9 @@ if __name__ == '__main__':
|
|
375 |
with gr.Row():
|
376 |
with gr.Column():
|
377 |
vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
|
378 |
-
# Input
|
379 |
vc_input = gr.Textbox(label="Input audio path", visible=False)
|
380 |
-
|
381 |
-
vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
|
382 |
-
vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
|
383 |
# Youtube
|
384 |
vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
|
385 |
vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
|
@@ -404,8 +398,8 @@ if __name__ == '__main__':
|
|
404 |
minimum=0,
|
405 |
maximum=1,
|
406 |
label="Retrieval feature ratio",
|
407 |
-
info="(Default: 0.
|
408 |
-
value=0.
|
409 |
interactive=True,
|
410 |
)
|
411 |
filter_radius0 = gr.Slider(
|
@@ -413,7 +407,7 @@ if __name__ == '__main__':
|
|
413 |
maximum=7,
|
414 |
label="Apply Median Filtering",
|
415 |
info="The value represents the filter radius and can reduce breathiness.",
|
416 |
-
value=
|
417 |
step=1,
|
418 |
interactive=True,
|
419 |
)
|
@@ -439,7 +433,7 @@ if __name__ == '__main__':
|
|
439 |
maximum=0.5,
|
440 |
label="Voice Protection",
|
441 |
info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
|
442 |
-
value=0.
|
443 |
step=0.01,
|
444 |
interactive=True,
|
445 |
)
|
@@ -463,7 +457,7 @@ if __name__ == '__main__':
|
|
463 |
fn=vc_fn,
|
464 |
inputs=[
|
465 |
vc_audio_mode,
|
466 |
-
vc_input,
|
467 |
vc_upload,
|
468 |
tts_text,
|
469 |
tts_voice,
|
@@ -487,17 +481,11 @@ if __name__ == '__main__':
|
|
487 |
inputs=[vc_output, vc_volume, vc_split_model],
|
488 |
outputs=[vc_combined_output]
|
489 |
)
|
490 |
-
vc_microphone_mode.change(
|
491 |
-
fn=use_microphone,
|
492 |
-
inputs=vc_microphone_mode,
|
493 |
-
outputs=vc_upload
|
494 |
-
)
|
495 |
vc_audio_mode.change(
|
496 |
fn=change_audio_mode,
|
497 |
inputs=[vc_audio_mode],
|
498 |
outputs=[
|
499 |
-
vc_input,
|
500 |
-
vc_microphone_mode,
|
501 |
vc_upload,
|
502 |
vc_download_audio,
|
503 |
vc_link,
|
@@ -513,4 +501,7 @@ if __name__ == '__main__':
|
|
513 |
tts_voice
|
514 |
]
|
515 |
)
|
516 |
-
|
|
|
|
|
|
|
|
34 |
f0method_info = ""
|
35 |
if limitation is True:
|
36 |
audio_mode = ["Upload audio", "TTS Audio"]
|
37 |
+
f0method_mode = ["pm", "crepe", "harvest"]
|
38 |
+
f0method_info = "PM is fast, rmvpe is middle, Crepe or harvest is good but it was extremely slow (Default: PM)"
|
39 |
else:
|
40 |
+
audio_mode = ["Upload audio", "Youtube", "TTS Audio"]
|
41 |
+
f0method_mode = ["pm", "crepe", "harvest"]
|
42 |
+
f0method_info = "PM is fast, rmvpe is middle. Crepe or harvest is good but it was extremely slow (Default: PM))"
|
43 |
|
44 |
+
if os.path.isfile("rmvpe.pt"):
|
45 |
+
f0method_mode.insert(2, "rmvpe")
|
46 |
+
|
47 |
+
def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
|
48 |
def vc_fn(
|
49 |
vc_audio_mode,
|
50 |
vc_input,
|
|
|
60 |
protect,
|
61 |
):
|
62 |
try:
|
|
|
63 |
if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
|
64 |
audio, sr = librosa.load(vc_input, sr=16000, mono=True)
|
65 |
elif vc_audio_mode == "Upload audio":
|
|
|
67 |
return "You need to upload an audio", None
|
68 |
sampling_rate, audio = vc_upload
|
69 |
duration = audio.shape[0] / sampling_rate
|
70 |
+
if duration > 360 and limitation:
|
71 |
+
return "Please upload an audio file that is less than 1 minute.", None
|
72 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
73 |
if len(audio.shape) > 1:
|
74 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
75 |
if sampling_rate != 16000:
|
76 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
77 |
elif vc_audio_mode == "TTS Audio":
|
78 |
+
if len(tts_text) > 600 and limitation:
|
79 |
return "Text is too long", None
|
80 |
if tts_text is None or tts_voice is None:
|
81 |
return "You need to enter text and select a voice", None
|
|
|
106 |
f0_file=None,
|
107 |
)
|
108 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
109 |
+
print(f"{model_title} | {info}")
|
110 |
return info, (tgt_sr, audio_opt)
|
111 |
except:
|
112 |
info = traceback.format_exc()
|
113 |
print(info)
|
114 |
+
return info, (None, None)
|
115 |
return vc_fn
|
116 |
|
117 |
def load_model():
|
|
|
123 |
continue
|
124 |
category_title = category_info['title']
|
125 |
category_folder = category_info['folder_path']
|
|
|
126 |
models = []
|
127 |
with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
|
128 |
models_info = json.load(f)
|
|
|
160 |
net_g = net_g.float()
|
161 |
vc = VC(tgt_sr, config)
|
162 |
print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
|
163 |
+
models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
|
164 |
+
categories.append([category_title, category_folder, models])
|
165 |
return categories
|
166 |
|
167 |
def cut_vocal_and_inst(url, audio_provider, split_model):
|
|
|
170 |
os.mkdir("dl_audio")
|
171 |
if audio_provider == "Youtube":
|
172 |
ydl_opts = {
|
173 |
+
'format': 'bestaudio/best',
|
174 |
+
'postprocessors': [{
|
175 |
+
'key': 'FFmpegExtractAudio',
|
176 |
+
'preferredcodec': 'wav',
|
177 |
+
}],
|
178 |
+
"outtmpl": 'dl_audio/youtube_audio',
|
|
|
179 |
}
|
180 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
181 |
ydl.download([url])
|
182 |
audio_path = "dl_audio/youtube_audio.wav"
|
183 |
+
else:
|
184 |
+
# Spotify doesnt work.
|
185 |
+
# Need to find other solution soon.
|
186 |
+
'''
|
187 |
+
command = f"spotdl download {url} --output dl_audio/.wav"
|
188 |
+
result = subprocess.run(command.split(), stdout=subprocess.PIPE)
|
189 |
+
print(result.stdout.decode())
|
190 |
+
audio_path = "dl_audio/spotify_audio.wav"
|
191 |
+
'''
|
192 |
if split_model == "htdemucs":
|
193 |
command = f"demucs --two-stems=vocals {audio_path} -o output"
|
194 |
result = subprocess.run(command.split(), stdout=subprocess.PIPE)
|
|
|
241 |
return (
|
242 |
# Input & Upload
|
243 |
gr.Textbox.update(visible=True),
|
|
|
244 |
gr.Audio.update(visible=False),
|
245 |
# Youtube
|
246 |
gr.Dropdown.update(visible=False),
|
|
|
261 |
return (
|
262 |
# Input & Upload
|
263 |
gr.Textbox.update(visible=False),
|
|
|
264 |
gr.Audio.update(visible=True),
|
265 |
# Youtube
|
266 |
gr.Dropdown.update(visible=False),
|
|
|
281 |
return (
|
282 |
# Input & Upload
|
283 |
gr.Textbox.update(visible=False),
|
|
|
284 |
gr.Audio.update(visible=False),
|
285 |
# Youtube
|
286 |
gr.Dropdown.update(visible=True),
|
|
|
301 |
return (
|
302 |
# Input & Upload
|
303 |
gr.Textbox.update(visible=False),
|
|
|
304 |
gr.Audio.update(visible=False),
|
305 |
# Youtube
|
306 |
gr.Dropdown.update(visible=False),
|
|
|
321 |
return (
|
322 |
# Input & Upload
|
323 |
gr.Textbox.update(visible=False),
|
|
|
324 |
gr.Audio.update(visible=True),
|
325 |
# Youtube
|
326 |
gr.Dropdown.update(visible=False),
|
|
|
338 |
gr.Dropdown.update(visible=False)
|
339 |
)
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
if __name__ == '__main__':
|
342 |
load_hubert()
|
343 |
categories = load_model()
|
344 |
tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
|
345 |
voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
|
346 |
+
with gr.Blocks(theme=gr.themes.Base()) as app:
|
347 |
gr.Markdown(
|
348 |
+
"# <center> RVC Models\n"
|
349 |
+
"### <center> will update every hololive ai model that i can find or make.\n"
|
350 |
+
"[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aziib/hololive-rvc-models-v2/blob/main/hololive_rvc_models_v2.ipynb)\n\n"
|
351 |
+
"[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/megaaziib)\n\n"
|
352 |
)
|
353 |
+
for (folder_title, folder, models) in categories:
|
354 |
with gr.TabItem(folder_title):
|
|
|
|
|
355 |
with gr.Tabs():
|
356 |
if not models:
|
357 |
gr.Markdown("# <center> No Model Loaded.")
|
|
|
371 |
with gr.Row():
|
372 |
with gr.Column():
|
373 |
vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
|
374 |
+
# Input and Upload
|
375 |
vc_input = gr.Textbox(label="Input audio path", visible=False)
|
376 |
+
vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
|
|
|
|
|
377 |
# Youtube
|
378 |
vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
|
379 |
vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
|
|
|
398 |
minimum=0,
|
399 |
maximum=1,
|
400 |
label="Retrieval feature ratio",
|
401 |
+
info="Accents controling. Too high prob gonna sounds too robotic (Default: 0.4)",
|
402 |
+
value=0.4,
|
403 |
interactive=True,
|
404 |
)
|
405 |
filter_radius0 = gr.Slider(
|
|
|
407 |
maximum=7,
|
408 |
label="Apply Median Filtering",
|
409 |
info="The value represents the filter radius and can reduce breathiness.",
|
410 |
+
value=1,
|
411 |
step=1,
|
412 |
interactive=True,
|
413 |
)
|
|
|
433 |
maximum=0.5,
|
434 |
label="Voice Protection",
|
435 |
info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
|
436 |
+
value=0.23,
|
437 |
step=0.01,
|
438 |
interactive=True,
|
439 |
)
|
|
|
457 |
fn=vc_fn,
|
458 |
inputs=[
|
459 |
vc_audio_mode,
|
460 |
+
vc_input,
|
461 |
vc_upload,
|
462 |
tts_text,
|
463 |
tts_voice,
|
|
|
481 |
inputs=[vc_output, vc_volume, vc_split_model],
|
482 |
outputs=[vc_combined_output]
|
483 |
)
|
|
|
|
|
|
|
|
|
|
|
484 |
vc_audio_mode.change(
|
485 |
fn=change_audio_mode,
|
486 |
inputs=[vc_audio_mode],
|
487 |
outputs=[
|
488 |
+
vc_input,
|
|
|
489 |
vc_upload,
|
490 |
vc_download_audio,
|
491 |
vc_link,
|
|
|
501 |
tts_voice
|
502 |
]
|
503 |
)
|
504 |
+
if limitation is True:
|
505 |
+
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
|
506 |
+
else:
|
507 |
+
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=True)
|