Ilzhabimantara committed on
Commit 49ee920
1 Parent(s): b502c0f

Update app.py

Files changed (1):
  1. app.py +518 -781

app.py CHANGED
@@ -1,378 +1,195 @@
- import torch, os, traceback, sys, warnings, shutil, numpy as np
  import gradio as gr
  import librosa
  import asyncio
- import rarfile
  import edge_tts
  import yt_dlp
  import ffmpeg
- import gdown
  import subprocess
  import wave
- import soundfile as sf
- from scipy.io import wavfile
  from datetime import datetime
- from urllib.parse import urlparse
- from mega import Mega
-
- now_dir = os.getcwd()
- tmp = os.path.join(now_dir, "TEMP")
- shutil.rmtree(tmp, ignore_errors=True)
- os.makedirs(tmp, exist_ok=True)
- os.environ["TEMP"] = tmp
  from lib.infer_pack.models import (
      SynthesizerTrnMs256NSFsid,
      SynthesizerTrnMs256NSFsid_nono,
      SynthesizerTrnMs768NSFsid,
      SynthesizerTrnMs768NSFsid_nono,
  )
- from fairseq import checkpoint_utils
  from vc_infer_pipeline import VC
  from config import Config
  config = Config()

- tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
- voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]

- hubert_model = None
-
- f0method_mode = ["pm", "harvest", "crepe"]
- f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe is good but requires a GPU (Default: PM)"

  if os.path.isfile("rmvpe.pt"):
      f0method_mode.insert(2, "rmvpe")
-     f0method_info = "PM is fast, Harvest is good but extremely slow, RMVPE is an alternative to Harvest (might be better), and Crepe is good but requires a GPU (Default: PM)"

- def load_hubert():
-     global hubert_model
-     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-         ["hubert_base.pt"],
-         suffix="",
-     )
-     hubert_model = models[0]
-     hubert_model = hubert_model.to(config.device)
-     if config.is_half:
-         hubert_model = hubert_model.half()
-     else:
-         hubert_model = hubert_model.float()
-     hubert_model.eval()
-
- load_hubert()
-
- weight_root = "weights"
- index_root = "weights/index"
- weights_model = []
- weights_index = []
- for _, _, model_files in os.walk(weight_root):
-     for file in model_files:
-         if file.endswith(".pth"):
-             weights_model.append(file)
- for _, _, index_files in os.walk(index_root):
-     for file in index_files:
-         if file.endswith('.index') and "trained" not in file:
-             weights_index.append(os.path.join(index_root, file))
-
- def check_models():
-     weights_model = []
-     weights_index = []
-     for _, _, model_files in os.walk(weight_root):
-         for file in model_files:
-             if file.endswith(".pth"):
-                 weights_model.append(file)
-     for _, _, index_files in os.walk(index_root):
-         for file in index_files:
-             if file.endswith('.index') and "trained" not in file:
-                 weights_index.append(os.path.join(index_root, file))
-     return (
-         gr.Dropdown.update(choices=sorted(weights_model), value=weights_model[0]),
-         gr.Dropdown.update(choices=sorted(weights_index))
-     )
-
- def clean():
-     return (
-         gr.Dropdown.update(value=""),
-         gr.Slider.update(visible=False)
-     )
-
- def vc_single(
-     sid,
-     vc_audio_mode,
-     input_audio_path,
-     input_upload_audio,
-     vocal_audio,
-     tts_text,
-     tts_voice,
-     f0_up_key,
-     f0_file,
-     f0_method,
-     file_index,
-     index_rate,
-     filter_radius,
-     resample_sr,
-     rms_mix_rate,
-     protect
- ):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
-     global tgt_sr, net_g, vc, hubert_model, version, cpt
-     try:
-         logs = []
-         print("Converting...")
-         logs.append("Converting...")
-         yield "\n".join(logs), None
-         if vc_audio_mode in ("Input path", "Youtube") and input_audio_path != "":
-             audio, sr = librosa.load(input_audio_path, sr=16000, mono=True)
-         elif vc_audio_mode == "Upload audio":
-             selected_audio = input_upload_audio
-             if vocal_audio:
-                 selected_audio = vocal_audio
-             elif input_upload_audio:
-                 selected_audio = input_upload_audio
-             sampling_rate, audio = selected_audio
-             duration = audio.shape[0] / sampling_rate
-             audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-             if len(audio.shape) > 1:
-                 audio = librosa.to_mono(audio.transpose(1, 0))
-             if sampling_rate != 16000:
-                 audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-         elif vc_audio_mode == "TTS Audio":
-             if tts_text is None or tts_voice is None:
-                 return "You need to enter text and select a voice", None
-             asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-             audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
-             input_audio_path = "tts.mp3"
-         f0_up_key = int(f0_up_key)
-         times = [0, 0, 0]
-         if hubert_model is None:
-             load_hubert()
-         if_f0 = cpt.get("f0", 1)
-         audio_opt = vc.pipeline(
-             hubert_model,
-             net_g,
-             sid,
-             audio,
-             input_audio_path,
-             times,
-             f0_up_key,
-             f0_method,
-             file_index,
-             # file_big_npy,
-             index_rate,
-             if_f0,
-             filter_radius,
-             tgt_sr,
-             resample_sr,
-             rms_mix_rate,
-             version,
-             protect,
-             f0_file=f0_file
-         )
-         if resample_sr >= 16000 and tgt_sr != resample_sr:
-             tgt_sr = resample_sr
-         index_info = (
-             "Using index:%s." % file_index
-             if os.path.exists(file_index)
-             else "Index not used."
-         )
-         print("Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
-             index_info,
-             times[0],
-             times[1],
-             times[2],
-         ))
-         info = f"{index_info}\n[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-         logs.append(info)
-         yield "\n".join(logs), (tgt_sr, audio_opt)
-     except:
-         info = traceback.format_exc()
-         print(info)
-         logs.append(info)
-         yield "\n".join(logs), None

- def get_vc(sid, to_return_protect0):
-     global n_spk, tgt_sr, net_g, vc, cpt, version, weights_index
-     if sid == "" or sid == []:
-         global hubert_model
-         if hubert_model is not None:  # with polling in mind, check whether sid switched from a loaded model to no model
-             print("clean_empty_cache")
-             del net_g, n_spk, vc, hubert_model, tgt_sr  # , cpt
-             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
-             ### without the reload-and-delete below, the cache is not fully cleared
-             if_f0 = cpt.get("f0", 1)
-             version = cpt.get("version", "v1")
-             if version == "v1":
-                 if if_f0 == 1:
-                     net_g = SynthesizerTrnMs256NSFsid(
-                         *cpt["config"], is_half=config.is_half
-                     )
-                 else:
-                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-             elif version == "v2":
-                 if if_f0 == 1:
-                     net_g = SynthesizerTrnMs768NSFsid(
-                         *cpt["config"], is_half=config.is_half
-                     )
                  else:
-                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-             del net_g, cpt
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
-             cpt = None
-         return (
-             gr.Slider.update(maximum=2333, visible=False),
-             gr.Slider.update(visible=True),
-             gr.Dropdown.update(choices=sorted(weights_index), value=""),
-             gr.Markdown.update(value="# <center> No model selected")
-         )
-     print(f"Loading {sid} model...")
-     selected_model = sid[:-4]
-     cpt = torch.load(os.path.join(weight_root, sid), map_location="cpu")
-     tgt_sr = cpt["config"][-1]
-     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
-     if_f0 = cpt.get("f0", 1)
-     if if_f0 == 0:
-         to_return_protect0 = {
-             "visible": False,
-             "value": 0.5,
-             "__type__": "update",
-         }
      else:
-         to_return_protect0 = {
-             "visible": True,
-             "value": to_return_protect0,
-             "__type__": "update",
-         }
-     version = cpt.get("version", "v1")
-     if version == "v1":
-         if if_f0 == 1:
-             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-         else:
-             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-     elif version == "v2":
-         if if_f0 == 1:
-             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
-         else:
-             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-     del net_g.enc_q
-     print(net_g.load_state_dict(cpt["weight"], strict=False))
-     net_g.eval().to(config.device)
-     if config.is_half:
-         net_g = net_g.half()
-     else:
-         net_g = net_g.float()
-     vc = VC(tgt_sr, config)
-     n_spk = cpt["config"][-3]
-     weights_index = []
-     for _, _, index_files in os.walk(index_root):
-         for file in index_files:
-             if file.endswith('.index') and "trained" not in file:
-                 weights_index.append(os.path.join(index_root, file))
-     if weights_index == []:
-         selected_index = gr.Dropdown.update(value="")
-     else:
-         selected_index = gr.Dropdown.update(value=weights_index[0])
-     for index, model_index in enumerate(weights_index):
-         if selected_model in model_index:
-             selected_index = gr.Dropdown.update(value=weights_index[index])
-             break
-     return (
-         gr.Slider.update(maximum=n_spk, visible=True),
-         to_return_protect0,
-         selected_index,
-         gr.Markdown.update(
-             f'## <center> {selected_model}\n'+
-             f'### <center> RVC {version} Model'
-         )
-     )
-
- def find_audio_files(folder_path, extensions):
-     audio_files = []
-     for root, dirs, files in os.walk(folder_path):
-         for file in files:
-             if any(file.endswith(ext) for ext in extensions):
-                 audio_files.append(file)
-     return audio_files
-
- def vc_multi(
-     spk_item,
-     vc_input,
-     vc_output,
-     vc_transform0,
-     f0method0,
-     file_index,
-     index_rate,
-     filter_radius,
-     resample_sr,
-     rms_mix_rate,
-     protect,
- ):
-     global tgt_sr, net_g, vc, hubert_model, version, cpt
-     logs = []
-     logs.append("Converting...")
-     yield "\n".join(logs)
-     print()
-     try:
-         if os.path.exists(vc_input):
-             folder_path = vc_input
-             extensions = [".mp3", ".wav", ".flac", ".ogg"]
-             audio_files = find_audio_files(folder_path, extensions)
-             for index, file in enumerate(audio_files, start=1):
-                 audio, sr = librosa.load(os.path.join(folder_path, file), sr=16000, mono=True)
-                 input_audio_path = folder_path, file
-                 f0_up_key = int(vc_transform0)
-                 times = [0, 0, 0]
-                 if hubert_model is None:
-                     load_hubert()
-                 if_f0 = cpt.get("f0", 1)
-                 audio_opt = vc.pipeline(
-                     hubert_model,
-                     net_g,
-                     spk_item,
-                     audio,
-                     input_audio_path,
-                     times,
-                     f0_up_key,
-                     f0method0,
-                     file_index,
-                     index_rate,
-                     if_f0,
-                     filter_radius,
-                     tgt_sr,
-                     resample_sr,
-                     rms_mix_rate,
-                     version,
-                     protect,
-                     f0_file=None
-                 )
-                 if resample_sr >= 16000 and tgt_sr != resample_sr:
-                     tgt_sr = resample_sr
-                 output_path = f"{os.path.join(vc_output, file)}"
-                 os.makedirs(os.path.join(vc_output), exist_ok=True)
-                 sf.write(
-                     output_path,
-                     audio_opt,
-                     tgt_sr,
-                 )
-                 info = f"{index} / {len(audio_files)} | {file}"
-                 print(info)
-                 logs.append(info)
-                 yield "\n".join(logs)
-         else:
-             logs.append("Folder not found or path doesn't exist.")
-             yield "\n".join(logs)
-     except:
-         info = traceback.format_exc()
-         print(info)
-         logs.append(info)
-         yield "\n".join(logs)

  def download_audio(url, audio_provider):
      logs = []
-     os.makedirs("dl_audio", exist_ok=True)
      if url == "":
-         logs.append("URL required!")
-         yield None, "\n".join(logs)
-         return None, "\n".join(logs)
      if audio_provider == "Youtube":
          logs.append("Downloading the audio...")
          yield None, "\n".join(logs)
@@ -383,132 +200,58 @@ def download_audio(url, audio_provider):
              'key': 'FFmpegExtractAudio',
              'preferredcodec': 'wav',
          }],
-         "outtmpl": 'result/dl_audio/audio',
      }
-     audio_path = "result/dl_audio/audio.wav"
      with yt_dlp.YoutubeDL(ydl_opts) as ydl:
          ydl.download([url])
      logs.append("Download Complete.")
      yield audio_path, "\n".join(logs)

- def cut_vocal_and_inst_yt(split_model):
      logs = []
      logs.append("Starting the audio splitting process...")
-     yield "\n".join(logs), None, None, None
-     command = f"demucs --two-stems=vocals -n {split_model} result/dl_audio/audio.wav -o output"
      result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
      for line in result.stdout:
          logs.append(line)
-         yield "\n".join(logs), None, None, None
      print(result.stdout)
      vocal = f"output/{split_model}/audio/vocals.wav"
      inst = f"output/{split_model}/audio/no_vocals.wav"
      logs.append("Audio splitting complete.")
      yield "\n".join(logs), vocal, inst, vocal

- def cut_vocal_and_inst(split_model, audio_data):
-     logs = []
-     vocal_path = "output/result/audio.wav"
-     os.makedirs("output/result", exist_ok=True)
-     wavfile.write(vocal_path, audio_data[0], audio_data[1])
-     logs.append("Starting the audio splitting process...")
-     yield "\n".join(logs), None, None
-     command = f"demucs --two-stems=vocals -n {split_model} {vocal_path} -o output"
-     result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
-     for line in result.stdout:
-         logs.append(line)
-         yield "\n".join(logs), None, None
-     print(result.stdout)
-     vocal = f"output/{split_model}/audio/vocals.wav"
-     inst = f"output/{split_model}/audio/no_vocals.wav"
-     logs.append("Audio splitting complete.")
-     yield "\n".join(logs), vocal, inst
-
  def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
-     os.makedirs("output/result", exist_ok=True)
      vocal_path = "output/result/output.wav"
      output_path = "output/result/combine.mp3"
      inst_path = f"output/{split_model}/audio/no_vocals.wav"
-     wavfile.write(vocal_path, audio_data[0], audio_data[1])
      command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
      result = subprocess.run(command.split(), stdout=subprocess.PIPE)
      print(result.stdout.decode())
      return output_path

- def download_and_extract_models(urls):
-     logs = []
-     os.makedirs("zips", exist_ok=True)
-     os.makedirs(os.path.join("zips", "extract"), exist_ok=True)
-     os.makedirs(os.path.join(weight_root), exist_ok=True)
-     os.makedirs(os.path.join(index_root), exist_ok=True)
-     for link in urls.splitlines():
-         url = link.strip()
-         if not url:
-             raise gr.Error("URL Required!")
-             return "No URLs provided."
-         model_zip = urlparse(url).path.split('/')[-2] + '.zip'
-         model_zip_path = os.path.join('zips', model_zip)
-         logs.append("Downloading...")
-         yield "\n".join(logs)
-         if "drive.google.com" in url:
-             gdown.download(url, os.path.join("zips", "extract"), quiet=False)
-         elif "mega.nz" in url:
-             m = Mega()
-             m.download_url(url, 'zips')
-         else:
-             os.system(f"wget {url} -O {model_zip_path}")
-         logs.append("Extracting...")
-         yield "\n".join(logs)
-         for filename in os.listdir("zips"):
-             archived_file = os.path.join("zips", filename)
-             if filename.endswith(".zip"):
-                 shutil.unpack_archive(archived_file, os.path.join("zips", "extract"), 'zip')
-             elif filename.endswith(".rar"):
-                 with rarfile.RarFile(archived_file, 'r') as rar:
-                     rar.extractall(os.path.join("zips", "extract"))
-         for _, dirs, files in os.walk(os.path.join("zips", "extract")):
-             logs.append("Searching Model and Index...")
-             yield "\n".join(logs)
-             model = False
-             index = False
-             if files:
-                 for file in files:
-                     if file.endswith(".pth"):
-                         basename = file[:-4]
-                         shutil.move(os.path.join("zips", "extract", file), os.path.join(weight_root, file))
-                         model = True
-                     if file.endswith('.index') and "trained" not in file:
-                         shutil.move(os.path.join("zips", "extract", file), os.path.join(index_root, file))
-                         index = True
-             else:
-                 logs.append("No model in main folder.")
-                 yield "\n".join(logs)
-                 logs.append("Searching in subfolders...")
-                 yield "\n".join(logs)
-                 for sub_dir in dirs:
-                     for _, _, sub_files in os.walk(os.path.join("zips", "extract", sub_dir)):
-                         for file in sub_files:
-                             if file.endswith(".pth"):
-                                 basename = file[:-4]
-                                 shutil.move(os.path.join("zips", "extract", sub_dir, file), os.path.join(weight_root, file))
-                                 model = True
-                             if file.endswith('.index') and "trained" not in file:
-                                 shutil.move(os.path.join("zips", "extract", sub_dir, file), os.path.join(index_root, file))
-                                 index = True
-                         shutil.rmtree(os.path.join("zips", "extract", sub_dir))
-         if index is False:
-             logs.append("Model file only, no index file detected.")
-             yield "\n".join(logs)
-         logs.append("Download Completed!")
-         yield "\n".join(logs)
-     logs.append("Successfully downloaded all models! Refresh your model list to load them.")
-     yield "\n".join(logs)
-
- def use_microphone(microphone):
-     if microphone == True:
-         return gr.Audio.update(source="microphone")
      else:
-         return gr.Audio.update(source="upload")

  def change_audio_mode(vc_audio_mode):
      if vc_audio_mode == "Input path":
@@ -523,17 +266,16 @@ def change_audio_mode(vc_audio_mode):
          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          # Splitter
-         gr.Dropdown.update(visible=True),
-         gr.Textbox.update(visible=True),
-         gr.Button.update(visible=True),
          gr.Button.update(visible=False),
          gr.Audio.update(visible=False),
-         gr.Audio.update(visible=True),
-         gr.Audio.update(visible=True),
-         gr.Slider.update(visible=True),
-         gr.Slider.update(visible=True),
-         gr.Audio.update(visible=True),
-         gr.Button.update(visible=True),
          # TTS
          gr.Textbox.update(visible=False),
          gr.Dropdown.update(visible=False)
@@ -550,17 +292,16 @@ def change_audio_mode(vc_audio_mode):
          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          # Splitter
-         gr.Dropdown.update(visible=True),
-         gr.Textbox.update(visible=True),
          gr.Button.update(visible=False),
-         gr.Button.update(visible=True),
          gr.Audio.update(visible=False),
-         gr.Audio.update(visible=True),
-         gr.Audio.update(visible=True),
-         gr.Slider.update(visible=True),
-         gr.Slider.update(visible=True),
-         gr.Audio.update(visible=True),
-         gr.Button.update(visible=True),
          # TTS
          gr.Textbox.update(visible=False),
          gr.Dropdown.update(visible=False)
@@ -580,7 +321,6 @@ def change_audio_mode(vc_audio_mode):
          gr.Dropdown.update(visible=True),
          gr.Textbox.update(visible=True),
          gr.Button.update(visible=True),
-         gr.Button.update(visible=False),
          gr.Audio.update(visible=True),
          gr.Audio.update(visible=True),
          gr.Audio.update(visible=True),
@@ -607,7 +347,6 @@ def change_audio_mode(vc_audio_mode):
          gr.Dropdown.update(visible=False),
          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
-         gr.Button.update(visible=False),
          gr.Audio.update(visible=False),
          gr.Audio.update(visible=False),
          gr.Audio.update(visible=False),
@@ -619,324 +358,322 @@ def change_audio_mode(vc_audio_mode):
          gr.Textbox.update(visible=True),
          gr.Dropdown.update(visible=True)
      )
-
- with gr.Blocks() as app:
-     gr.Markdown(
-         "# <center> Advanced RVC Inference\n"
-     )
-     with gr.Row():
-         sid = gr.Dropdown(
-             label="Weight",
-             choices=sorted(weights_model),
-         )
-         file_index = gr.Dropdown(
-             label="List of index file",
-             choices=sorted(weights_index),
-             interactive=True,
-         )
-         spk_item = gr.Slider(
-             minimum=0,
-             maximum=2333,
-             step=1,
-             label="Speaker ID",
-             value=0,
-             visible=False,
-             interactive=True,
-         )
-         refresh_model = gr.Button("Refresh model list", variant="primary")
-         clean_button = gr.Button("Clear Model from memory", variant="primary")
-         refresh_model.click(
-             fn=check_models, inputs=[], outputs=[sid, file_index]
-         )
-         clean_button.click(fn=clean, inputs=[], outputs=[sid, spk_item])
-     with gr.TabItem("Inference"):
-         selected_model = gr.Markdown(value="# <center> No model selected")
-         with gr.Row():
-             with gr.Column():
-                 vc_audio_mode = gr.Dropdown(label="Input voice", choices=["Input path", "Upload audio", "Youtube", "TTS Audio"], allow_custom_value=False, value="Upload audio")
-                 # Input
-                 vc_input = gr.Textbox(label="Input audio path", visible=False)
-                 # Upload
-                 vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
-                 vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
-                 # Youtube
-                 vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
-                 vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
-                 vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False)
-                 vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
-                 vc_audio_preview = gr.Audio(label="Downloaded Audio Preview", visible=False)
-                 # TTS
-                 tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False)
-                 tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
-                 # Splitter
-                 vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=True, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
-                 vc_split_log = gr.Textbox(label="Output Information", visible=True, interactive=False)
-                 vc_split_yt = gr.Button("Split Audio", variant="primary", visible=False)
-                 vc_split = gr.Button("Split Audio", variant="primary", visible=True)
-                 vc_vocal_preview = gr.Audio(label="Vocal Preview", interactive=False, visible=True)
-                 vc_inst_preview = gr.Audio(label="Instrumental Preview", interactive=False, visible=True)
-             with gr.Column():
-                 vc_transform0 = gr.Number(
-                     label="Transpose",
-                     info='Type "12" for male-to-female conversion or "-12" for female-to-male conversion.',
-                     value=0
-                 )
-                 f0method0 = gr.Radio(
-                     label="Pitch extraction algorithm",
-                     info=f0method_info,
-                     choices=f0method_mode,
-                     value="pm",
-                     interactive=True,
-                 )
-                 index_rate0 = gr.Slider(
-                     minimum=0,
-                     maximum=1,
-                     label="Retrieval feature ratio",
-                     value=0.7,
-                     interactive=True,
-                 )
-                 filter_radius0 = gr.Slider(
-                     minimum=0,
-                     maximum=7,
-                     label="Apply Median Filtering",
-                     info="The value represents the filter radius and can reduce breathiness.",
-                     value=3,
-                     step=1,
-                     interactive=True,
-                 )
-                 resample_sr0 = gr.Slider(
-                     minimum=0,
-                     maximum=48000,
-                     label="Resample the output audio",
-                     info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
-                     value=0,
-                     step=1,
-                     interactive=True,
-                 )
-                 rms_mix_rate0 = gr.Slider(
-                     minimum=0,
-                     maximum=1,
-                     label="Volume Envelope",
-                     info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
-                     value=1,
-                     interactive=True,
-                 )
-                 protect0 = gr.Slider(
-                     minimum=0,
-                     maximum=0.5,
-                     label="Voice Protection",
-                     info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
-                     value=0.5,
-                     step=0.01,
-                     interactive=True,
-                 )
-                 f0_file0 = gr.File(
-                     label="F0 curve file (Optional)",
-                     info="One pitch per line. Replaces the default F0 and pitch modulation"
-                 )
-             with gr.Column():
-                 vc_log = gr.Textbox(label="Output Information", interactive=False)
-                 vc_output = gr.Audio(label="Output Audio", interactive=False)
-                 vc_convert = gr.Button("Convert", variant="primary")
-                 vc_vocal_volume = gr.Slider(
-                     minimum=0,
-                     maximum=10,
-                     label="Vocal volume",
-                     value=1,
-                     interactive=True,
-                     step=1,
-                     info="Adjust vocal volume (Default: 1)",
-                     visible=True
-                 )
-                 vc_inst_volume = gr.Slider(
-                     minimum=0,
-                     maximum=10,
-                     label="Instrument volume",
-                     value=1,
-                     interactive=True,
-                     step=1,
-                     info="Adjust instrument volume (Default: 1)",
-                     visible=True
-                 )
-                 vc_combined_output = gr.Audio(label="Output Combined Audio", visible=True)
-                 vc_combine = gr.Button("Combine", variant="primary", visible=True)
-         vc_convert.click(
-             vc_single,
-             [
-                 spk_item,
-                 vc_audio_mode,
-                 vc_input,
-                 vc_upload,
-                 vc_vocal_preview,
-                 tts_text,
-                 tts_voice,
-                 vc_transform0,
-                 f0_file0,
-                 f0method0,
-                 file_index,
-                 index_rate0,
-                 filter_radius0,
-                 resample_sr0,
-                 rms_mix_rate0,
-                 protect0,
-             ],
-             [vc_log, vc_output],
-         )
-         vc_download_button.click(
-             fn=download_audio,
-             inputs=[vc_link, vc_download_audio],
-             outputs=[vc_audio_preview, vc_log_yt]
-         )
-         vc_split_yt.click(
-             fn=cut_vocal_and_inst_yt,
-             inputs=[vc_split_model],
-             outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview, vc_input]
-         )
-         vc_split.click(
-             fn=cut_vocal_and_inst,
-             inputs=[vc_split_model, vc_upload],
-             outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview]
-         )
-         vc_combine.click(
-             fn=combine_vocal_and_inst,
-             inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model],
-             outputs=[vc_combined_output]
-         )
-         vc_microphone_mode.change(
-             fn=use_microphone,
-             inputs=vc_microphone_mode,
-             outputs=vc_upload
-         )
-         vc_audio_mode.change(
-             fn=change_audio_mode,
-             inputs=[vc_audio_mode],
-             outputs=[
-                 # Input & Upload
-                 vc_input,
-                 vc_microphone_mode,
-                 vc_upload,
-                 # Youtube
-                 vc_download_audio,
-                 vc_link,
-                 vc_log_yt,
-                 vc_download_button,
-                 # Splitter
-                 vc_split_model,
-                 vc_split_log,
-                 vc_split_yt,
-                 vc_split,
-                 vc_audio_preview,
-                 vc_vocal_preview,
-                 vc_inst_preview,
-                 vc_vocal_volume,
-                 vc_inst_volume,
-                 vc_combined_output,
-                 vc_combine,
-                 # TTS
-                 tts_text,
-                 tts_voice
-             ]
-         )
-         sid.change(fn=get_vc, inputs=[sid, protect0], outputs=[spk_item, protect0, file_index, selected_model])
-     with gr.TabItem("Batch Inference"):
-         with gr.Row():
-             with gr.Column():
-                 vc_input_bat = gr.Textbox(label="Input audio path (folder)", visible=True)
-                 vc_output_bat = gr.Textbox(label="Output audio path (folder)", value="result/batch", visible=True)
-             with gr.Column():
-                 vc_transform0_bat = gr.Number(
-                     label="Transpose",
-                     info='Type "12" for male-to-female conversion or "-12" for female-to-male conversion.',
-                     value=0
-                 )
-                 f0method0_bat = gr.Radio(
-                     label="Pitch extraction algorithm",
-                     info=f0method_info,
-                     choices=f0method_mode,
-                     value="pm",
-                     interactive=True,
-                 )
-                 index_rate0_bat = gr.Slider(
-                     minimum=0,
-                     maximum=1,
-                     label="Retrieval feature ratio",
-                     value=0.7,
-                     interactive=True,
-                 )
-                 filter_radius0_bat = gr.Slider(
-                     minimum=0,
-                     maximum=7,
-                     label="Apply Median Filtering",
-                     info="The value represents the filter radius and can reduce breathiness.",
-                     value=3,
-                     step=1,
-                     interactive=True,
-                 )
-                 resample_sr0_bat = gr.Slider(
-                     minimum=0,
-                     maximum=48000,
-                     label="Resample the output audio",
-                     info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
-                     value=0,
-                     step=1,
-                     interactive=True,
-                 )
-                 rms_mix_rate0_bat = gr.Slider(
-                     minimum=0,
-                     maximum=1,
-                     label="Volume Envelope",
-                     info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
-                     value=1,
-                     interactive=True,
-                 )
-                 protect0_bat = gr.Slider(
-                     minimum=0,
-                     maximum=0.5,
-                     label="Voice Protection",
-                     info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
-                     value=0.5,
-                     step=0.01,
-                     interactive=True,
-                 )
-             with gr.Column():
-                 vc_log_bat = gr.Textbox(label="Output Information", interactive=False)
-                 vc_convert_bat = gr.Button("Convert", variant="primary")
-         vc_convert_bat.click(
-             vc_multi,
-             [
-                 spk_item,
-                 vc_input_bat,
-                 vc_output_bat,
-                 vc_transform0_bat,
-                 f0method0_bat,
-                 file_index,
-                 index_rate0_bat,
-                 filter_radius0_bat,
-                 resample_sr0_bat,
-                 rms_mix_rate0_bat,
-                 protect0_bat,
-             ],
-             [vc_log_bat],
-         )
-     with gr.TabItem("Model Downloader"):
          gr.Markdown(
-             "# <center> Model Downloader (Beta)\n"+
-             "#### <center> To download multiple links, put each link in the textbox separated by spaces\n"+
-             "#### <center> Supports direct links, Mega, Google Drive, etc."
          )
-         with gr.Column():
-             md_text = gr.Textbox(label="URL")
-         with gr.Row():
-             md_download = gr.Button(label="Convert", variant="primary")
-             md_download_logs = gr.Textbox(label="Output information", interactive=False)
-             md_download.click(
-                 fn=download_and_extract_models,
-                 inputs=[md_text],
-                 outputs=[md_download_logs]
              )
-     with gr.TabItem("Settings"):
-         gr.Markdown(
-             "# <center> Settings\n"+
-             "#### <center> Work in progress"
-         )
- app.queue(concurrency_count=1, max_size=50, api_open=config.api).launch(share=config.colab)
+ import os
+ import glob
+ import json
+ import traceback
+ import logging
  import gradio as gr
+ import numpy as np
  import librosa
+ import torch
  import asyncio
  import edge_tts
  import yt_dlp
  import ffmpeg
  import subprocess
+ import sys
+ import io
  import wave
  from datetime import datetime
+ from fairseq import checkpoint_utils
  from lib.infer_pack.models import (
      SynthesizerTrnMs256NSFsid,
      SynthesizerTrnMs256NSFsid_nono,
      SynthesizerTrnMs768NSFsid,
      SynthesizerTrnMs768NSFsid_nono,
  )
  from vc_infer_pipeline import VC
  from config import Config
  config = Config()
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ spaces = os.getenv("SYSTEM") == "spaces"
+ force_support = None
+ if config.unsupported is False:
+     if config.device == "mps" or config.device == "cpu":
+         force_support = False
+     else:
+         force_support = True

+ audio_mode = []
+ f0method_mode = []
+ f0method_info = ""

+ if force_support is False or spaces is True:
+     if spaces is True:
+         audio_mode = ["Upload audio", "TTS Audio"]
+     else:
+         audio_mode = ["Input path", "Upload audio", "TTS Audio"]
+     f0method_mode = ["pm", "harvest"]
+     f0method_info = "PM is fast, Harvest is good but extremely slow, and RMVPE is an alternative to Harvest (might be better). (Default: PM)"
+ else:
+     audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
+     f0method_mode = ["pm", "harvest", "crepe"]
+     f0method_info = "PM is fast, Harvest is good but extremely slow, RMVPE is an alternative to Harvest (might be better), and Crepe is good but requires a GPU (Default: PM)"

  if os.path.isfile("rmvpe.pt"):
      f0method_mode.insert(2, "rmvpe")

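+ # Factory that binds one loaded model (its synthesizer, pipeline state, and
+ # .index file) into a dedicated conversion callback for the Gradio UI.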
+ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
+     def vc_fn(
+         vc_audio_mode,
+         vc_input,
+         vc_upload,
+         tts_text,
+         tts_voice,
+         f0_up_key,
+         f0_method,
+         index_rate,
+         filter_radius,
+         resample_sr,
+         rms_mix_rate,
+         protect,
+     ):
+         try:
+             logs = []
+             print(f"Converting using {model_name}...")
+             logs.append(f"Converting using {model_name}...")
+             yield "\n".join(logs), None
+             if vc_audio_mode in ("Input path", "Youtube") and vc_input != "":
+                 audio, sr = librosa.load(vc_input, sr=16000, mono=True)
+             elif vc_audio_mode == "Upload audio":
+                 if vc_upload is None:
+                     return "You need to upload an audio", None
+                 sampling_rate, audio = vc_upload
+                 duration = audio.shape[0] / sampling_rate
+                 if duration > 20 and spaces:
+                     return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
+                 audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+                 if len(audio.shape) > 1:
+                     audio = librosa.to_mono(audio.transpose(1, 0))
+                 if sampling_rate != 16000:
+                     audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+             elif vc_audio_mode == "TTS Audio":
+                 if len(tts_text) > 100 and spaces:
+                     return "Text is too long", None
+                 if tts_text is None or tts_voice is None:
+                     return "You need to enter text and select a voice", None
+                 asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+                 audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
+                 vc_input = "tts.mp3"
+             times = [0, 0, 0]
+             f0_up_key = int(f0_up_key)
+             audio_opt = vc.pipeline(
+                 hubert_model,
+                 net_g,
+                 0,
+                 audio,
+                 vc_input,
+                 times,
+                 f0_up_key,
+                 f0_method,
+                 file_index,
+                 # file_big_npy,
+                 index_rate,
+                 if_f0,
+                 filter_radius,
+                 tgt_sr,
+                 resample_sr,
+                 rms_mix_rate,
+                 version,
+                 protect,
+                 f0_file=None,
+             )
+             info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
+             print(f"{model_name} | {info}")
+             logs.append(f"Successfully converted {model_name}\n{info}")
+             yield "\n".join(logs), (tgt_sr, audio_opt)
+         except:
+             info = traceback.format_exc()
+             print(info)
+             yield info, None
+     return vc_fn

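+ # Model discovery: weights/folder_info.json lists the categories, and each
+ # category's model_info.json describes the per-character checkpoint, index
+ # file, and cover image to load.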
+ def load_model():
+     categories = []
+     if os.path.isfile("weights/folder_info.json"):
+         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
+             folder_info = json.load(f)
+         for category_name, category_info in folder_info.items():
+             if not category_info['enable']:
+                 continue
+             category_title = category_info['title']
+             category_folder = category_info['folder_path']
+             description = category_info['description']
+             models = []
+             with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
+                 models_info = json.load(f)
+             for character_name, info in models_info.items():
+                 if not info['enable']:
+                     continue
+                 model_title = info['title']
+                 model_name = info['model_path']
+                 model_author = info.get("author", None)
+                 model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+                 model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
+                 cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+                 tgt_sr = cpt["config"][-1]
+                 cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+                 if_f0 = cpt.get("f0", 1)
+                 version = cpt.get("version", "v1")
+                 if version == "v1":
+                     if if_f0 == 1:
+                         net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+                     else:
+                         net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+                     model_version = "V1"
+                 elif version == "v2":
+                     if if_f0 == 1:
+                         net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+                     else:
+                         net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+                     model_version = "V2"
+                 del net_g.enc_q
+                 print(net_g.load_state_dict(cpt["weight"], strict=False))
+                 net_g.eval().to(config.device)
+                 if config.is_half:
+                     net_g = net_g.half()
                  else:
+                     net_g = net_g.float()
+                 vc = VC(tgt_sr, config)
+                 print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+                 models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
+             categories.append([category_title, category_folder, description, models])
      else:
+         categories = []
+     return categories

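+ # Fetches audio from a YouTube URL with yt-dlp, extracting a WAV via FFmpeg.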
  def download_audio(url, audio_provider):
      logs = []
      if url == "":
+         raise gr.Error("URL Required!")
+     if not os.path.exists("dl_audio"):
+         os.mkdir("dl_audio")
      if audio_provider == "Youtube":
          logs.append("Downloading the audio...")
          yield None, "\n".join(logs)

              'key': 'FFmpegExtractAudio',
              'preferredcodec': 'wav',
          }],
+         "outtmpl": 'dl_audio/audio',
      }
+     audio_path = "dl_audio/audio.wav"
      with yt_dlp.YoutubeDL(ydl_opts) as ydl:
          ydl.download([url])
      logs.append("Download Complete.")
      yield audio_path, "\n".join(logs)

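+ # Splits the downloaded audio into vocal and instrumental stems with a
+ # two-stem Demucs model, streaming the subprocess log back to the UI.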
+ def cut_vocal_and_inst(split_model):
      logs = []
      logs.append("Starting the audio splitting process...")
+     yield "\n".join(logs), None, None, None
+     command = f"demucs --two-stems=vocals -n {split_model} dl_audio/audio.wav -o output"
      result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
      for line in result.stdout:
          logs.append(line)
+         yield "\n".join(logs), None, None, None
      print(result.stdout)
      vocal = f"output/{split_model}/audio/vocals.wav"
      inst = f"output/{split_model}/audio/no_vocals.wav"
      logs.append("Audio splitting complete.")
      yield "\n".join(logs), vocal, inst, vocal

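+ # Writes the converted vocal to a mono 16-bit WAV, then mixes it with the
+ # instrumental stem through ffmpeg's amix filter into a 320 kbps MP3.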
  def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
+     if not os.path.exists("output/result"):
+         os.mkdir("output/result")
      vocal_path = "output/result/output.wav"
      output_path = "output/result/combine.mp3"
      inst_path = f"output/{split_model}/audio/no_vocals.wav"
+     with wave.open(vocal_path, "w") as wave_file:
+         wave_file.setnchannels(1)
+         wave_file.setsampwidth(2)
+         wave_file.setframerate(audio_data[0])
+         wave_file.writeframes(audio_data[1].tobytes())
      command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
      result = subprocess.run(command.split(), stdout=subprocess.PIPE)
      print(result.stdout.decode())
      return output_path

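+ # Loads the HuBERT content encoder (hubert_base.pt) via fairseq; the global
+ # model is shared by every conversion callback.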
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
      else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()

  def change_audio_mode(vc_audio_mode):
      if vc_audio_mode == "Input path":

          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          # Splitter
+         gr.Dropdown.update(visible=False),
+         gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          gr.Audio.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Slider.update(visible=False),
+         gr.Slider.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Button.update(visible=False),
          # TTS
          gr.Textbox.update(visible=False),
          gr.Dropdown.update(visible=False)

          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          # Splitter
+         gr.Dropdown.update(visible=False),
+         gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          gr.Audio.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Slider.update(visible=False),
+         gr.Slider.update(visible=False),
+         gr.Audio.update(visible=False),
+         gr.Button.update(visible=False),
          # TTS
          gr.Textbox.update(visible=False),
          gr.Dropdown.update(visible=False)

          gr.Dropdown.update(visible=True),
          gr.Textbox.update(visible=True),
          gr.Button.update(visible=True),
          gr.Audio.update(visible=True),
          gr.Audio.update(visible=True),
          gr.Audio.update(visible=True),

          gr.Dropdown.update(visible=False),
          gr.Textbox.update(visible=False),
          gr.Button.update(visible=False),
          gr.Audio.update(visible=False),
          gr.Audio.update(visible=False),
          gr.Audio.update(visible=False),

          gr.Textbox.update(visible=True),
          gr.Dropdown.update(visible=True)
      )
+
+ def use_microphone(microphone):
+     if microphone == True:
+         return gr.Audio.update(source="microphone")
+     else:
+         return gr.Audio.update(source="upload")
+
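+ # Entry point: load HuBERT once, discover models from the weights folder,
+ # fetch the edge-tts voice list, then build and launch the Gradio UI.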
+ if __name__ == '__main__':
+     load_hubert()
+     categories = load_model()
+     tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
+     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+     with gr.Blocks() as app:
          gr.Markdown(
+             "<div align='center'>\n\n"+
+             "# RVC Wuthering Waves\n\n"+
+             "### Recommended to use Google Colab to use other characters and features.\n\n"+
+             "[![Colab](https://img.shields.io/badge/Colab-RVC%20Blue%20Archives-blue?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/drive/19Eo2xO7EKcMqvJDc_yXrWmixuNA4NtEU)\n\n"+
+             "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
+             "</div>"
          )
+         if categories == []:
+             gr.Markdown(
+                 "<div align='center'>\n\n"+
+                 "## No model found, please add the model into weights folder\n\n"+
+                 "</div>"
              )
+         for (folder_title, folder, description, models) in categories:
+             with gr.TabItem(folder_title):
+                 if description:
+                     gr.Markdown(f"### <center> {description}")
+                 with gr.Tabs():
+                     if not models:
+                         gr.Markdown("# <center> No Model Loaded.")
+                         gr.Markdown("## <center> Please add the model or fix your model path.")
+                         continue
+                     for (name, title, author, cover, model_version, vc_fn) in models:
+                         with gr.TabItem(name):
+                             with gr.Row():
+                                 gr.Markdown(
+                                     '<div align="center">'
+                                     f'<div>{title}</div>\n'+
+                                     f'<div>RVC {model_version} Model</div>\n'+
+                                     (f'<div>Model author: {author}</div>' if author else "")+
+                                     (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
+                                     '</div>'
+                                 )
+                             with gr.Row():
+                                 if spaces is False:
+                                     with gr.TabItem("Input"):
+                                         with gr.Row():
+                                             with gr.Column():
+                                                 vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
+                                                 # Input
+                                                 vc_input = gr.Textbox(label="Input audio path", visible=False)
+                                                 # Upload
+                                                 vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
+                                                 vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
+                                                 # Youtube
+                                                 vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
+                                                 vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
+                                                 vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False)
+                                                 vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
+                                                 vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
+                                                 # TTS
+                                                 tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False)
+                                                 tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
+                                             with gr.Column():
+                                                 vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                                                 vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False)
+                                                 vc_split = gr.Button("Split Audio", variant="primary", visible=False)
+                                                 vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
+                                                 vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
+                                     with gr.TabItem("Convert"):
+                                         with gr.Row():
+                                             with gr.Column():
+                                                 vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change from female to male voice')
+                                                 f0method0 = gr.Radio(
+                                                     label="Pitch extraction algorithm",
+                                                     info=f0method_info,
+                                                     choices=f0method_mode,
+                                                     value="pm",
+                                                     interactive=True
+                                                 )
+                                                 index_rate1 = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=1,
+                                                     label="Retrieval feature ratio",
+                                                     info="(Default: 0.7)",
+                                                     value=0.7,
+                                                     interactive=True,
+                                                 )
+                                                 filter_radius0 = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=7,
+                                                     label="Apply Median Filtering",
+                                                     info="The value represents the filter radius and can reduce breathiness.",
+                                                     value=3,
+                                                     step=1,
+                                                     interactive=True,
+                                                 )
+                                                 resample_sr0 = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=48000,
+                                                     label="Resample the output audio",
+                                                     info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
+                                                     value=0,
+                                                     step=1,
+                                                     interactive=True,
+                                                 )
+                                                 rms_mix_rate0 = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=1,
+                                                     label="Volume Envelope",
+                                                     info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
+                                                     value=1,
+                                                     interactive=True,
+                                                 )
+                                                 protect0 = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=0.5,
+                                                     label="Voice Protection",
+                                                     info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
+                                                     value=0.5,
+                                                     step=0.01,
+                                                     interactive=True,
+                                                 )
+                                             with gr.Column():
+                                                 vc_log = gr.Textbox(label="Output Information", interactive=False)
+                                                 vc_output = gr.Audio(label="Output Audio", interactive=False)
+                                                 vc_convert = gr.Button("Convert", variant="primary")
+                                                 vc_vocal_volume = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=10,
+                                                     label="Vocal volume",
+                                                     value=1,
+                                                     interactive=True,
+                                                     step=1,
+                                                     info="Adjust vocal volume (Default: 1)",
+                                                     visible=False
+                                                 )
+                                                 vc_inst_volume = gr.Slider(
+                                                     minimum=0,
+                                                     maximum=10,
+                                                     label="Instrument volume",
+                                                     value=1,
+                                                     interactive=True,
+                                                     step=1,
+                                                     info="Adjust instrument volume (Default: 1)",
+                                                     visible=False
+                                                 )
+                                                 vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
+                                                 vc_combine = gr.Button("Combine", variant="primary", visible=False)
+                                 else:
+                                     with gr.Column():
+                                         vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
+                                         # Input
+                                         vc_input = gr.Textbox(label="Input audio path", visible=False)
+                                         # Upload
+                                         vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
+                                         vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
+                                         # Youtube
+                                         vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
+                                         vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
+                                         vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False)
+                                         vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
+                                         vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
+                                         # Splitter
+                                         vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                                         vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False)
+                                         vc_split = gr.Button("Split Audio", variant="primary", visible=False)
+                                         vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
+                                         vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
+                                         # TTS
+                                         tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False)
+                                         tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
+                                     with gr.Column():
+                                         vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change from female to male voice')
+                                         f0method0 = gr.Radio(
+                                             label="Pitch extraction algorithm",
+                                             info=f0method_info,
+                                             choices=f0method_mode,
+                                             value="pm",
+                                             interactive=True
+                                         )
+                                         index_rate1 = gr.Slider(
+                                             minimum=0,
+                                             maximum=1,
+                                             label="Retrieval feature ratio",
+                                             info="(Default: 0.7)",
+                                             value=0.7,
+                                             interactive=True,
+                                         )
+                                         filter_radius0 = gr.Slider(
+                                             minimum=0,
+                                             maximum=7,
+                                             label="Apply Median Filtering",
+                                             info="The value represents the filter radius and can reduce breathiness.",
+                                             value=3,
+                                             step=1,
+                                             interactive=True,
+                                         )
+                                         resample_sr0 = gr.Slider(
+                                             minimum=0,
+                                             maximum=48000,
+                                             label="Resample the output audio",
+                                             info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
+                                             value=0,
+                                             step=1,
+                                             interactive=True,
+                                         )
+                                         rms_mix_rate0 = gr.Slider(
+                                             minimum=0,
+                                             maximum=1,
+                                             label="Volume Envelope",
+                                             info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
+                                             value=1,
+                                             interactive=True,
+                                         )
+                                         protect0 = gr.Slider(
+                                             minimum=0,
+                                             maximum=0.5,
+                                             label="Voice Protection",
+                                             info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
+                                             value=0.5,
+                                             step=0.01,
+                                             interactive=True,
+                                         )
+                                     with gr.Column():
+                                         vc_log = gr.Textbox(label="Output Information", interactive=False)
+                                         vc_output = gr.Audio(label="Output Audio", interactive=False)
+                                         vc_convert = gr.Button("Convert", variant="primary")
+                                         vc_vocal_volume = gr.Slider(
+                                             minimum=0,
+                                             maximum=10,
+                                             label="Vocal volume",
+                                             value=1,
+                                             interactive=True,
+                                             step=1,
+                                             info="Adjust vocal volume (Default: 1)",
+                                             visible=False
+                                         )
+                                         vc_inst_volume = gr.Slider(
+                                             minimum=0,
+                                             maximum=10,
+                                             label="Instrument volume",
+                                             value=1,
+                                             interactive=True,
+                                             step=1,
+                                             info="Adjust instrument volume (Default: 1)",
+                                             visible=False
+                                         )
+                                         vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
+                                         vc_combine = gr.Button("Combine", variant="primary", visible=False)
+                             vc_convert.click(
+                                 fn=vc_fn,
+                                 inputs=[
+                                     vc_audio_mode,
+                                     vc_input,
+                                     vc_upload,
+                                     tts_text,
+                                     tts_voice,
+                                     vc_transform0,
+                                     f0method0,
+                                     index_rate1,
+                                     filter_radius0,
+                                     resample_sr0,
+                                     rms_mix_rate0,
+                                     protect0,
+                                 ],
+                                 outputs=[vc_log, vc_output]
+                             )
+                             vc_download_button.click(
+                                 fn=download_audio,
+                                 inputs=[vc_link, vc_download_audio],
+                                 outputs=[vc_audio_preview, vc_log_yt]
+                             )
+                             vc_split.click(
+                                 fn=cut_vocal_and_inst,
+                                 inputs=[vc_split_model],
+                                 outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview, vc_input]
+                             )
+                             vc_combine.click(
+                                 fn=combine_vocal_and_inst,
+                                 inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model],
+                                 outputs=[vc_combined_output]
+                             )
+                             vc_microphone_mode.change(
+                                 fn=use_microphone,
+                                 inputs=vc_microphone_mode,
+                                 outputs=vc_upload
+                             )
+                             vc_audio_mode.change(
+                                 fn=change_audio_mode,
+                                 inputs=[vc_audio_mode],
+                                 outputs=[
+                                     vc_input,
+                                     vc_microphone_mode,
+                                     vc_upload,
+                                     vc_download_audio,
+                                     vc_link,
+                                     vc_log_yt,
+                                     vc_download_button,
+                                     vc_split_model,
+                                     vc_split_log,
+                                     vc_split,
+                                     vc_audio_preview,
+                                     vc_vocal_preview,
+                                     vc_inst_preview,
+                                     vc_vocal_volume,
+                                     vc_inst_volume,
+                                     vc_combined_output,
+                                     vc_combine,
+                                     tts_text,
+                                     tts_voice
+                                 ]
+                             )
+         app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)