ArkanDash committed
Commit 773691e
1 Parent(s): 367637d

feat: update gradio
README.md DELETED
@@ -1,13 +0,0 @@
- ---
- title: RVC Genshin Impact
- emoji: 🎤
- colorFrom: red
- colorTo: purple
- sdk: gradio
- sdk_version: 3.40.1
- app_file: app.py
- pinned: true
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,24 +9,30 @@ import librosa
 import torch
 import asyncio
 import edge_tts
- import yt_dlp
- import ffmpeg
- import subprocess
 import sys
 import io
- import wave
+
 from datetime import datetime
- from fairseq import checkpoint_utils
+ from lib.config.config import Config
+ from lib.vc.vc_infer_pipeline import VC
+ from lib.vc.settings import change_audio_mode
+ from lib.vc.audio import load_audio
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
     SynthesizerTrnMs768NSFsid,
     SynthesizerTrnMs768NSFsid_nono,
 )
- from vc_infer_pipeline import VC
- from config import Config
+ from lib.vc.utils import (
+     combine_vocal_and_inst,
+     cut_vocal_and_inst,
+     download_audio,
+     load_hubert
+ )
+
 config = Config()
 logging.getLogger("numba").setLevel(logging.WARNING)
+ logger = logging.getLogger(__name__)
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
 if config.unsupported is False:
@@ -38,6 +44,7 @@ else:
     audio_mode = []
     f0method_mode = []
     f0method_info = ""
+ hubert_model = load_hubert(config)

 if force_support is False or spaces is True:
     if spaces is True:
@@ -71,11 +78,15 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     ):
         try:
             logs = []
-             print(f"Converting using {model_name}...")
+             logger.info(f"Converting using {model_name}...")
             logs.append(f"Converting using {model_name}...")
             yield "\n".join(logs), None
+             logger.info(vc_audio_mode)
             if vc_audio_mode in ("Input path", "Youtube") and vc_input != "":
-                 audio, sr = librosa.load(vc_input, sr=16000, mono=True)
+                 audio = load_audio(vc_input, 16000)
+                 audio_max = np.abs(audio).max() / 0.95
+                 if audio_max > 1:
+                     audio /= audio_max
             elif vc_audio_mode == "Upload audio":
                 if vc_upload is None:
                     return "You need to upload an audio", None
@@ -93,9 +104,11 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
                     return "Text is too long", None
                 if tts_text is None or tts_voice is None:
                     return "You need to enter text and select a voice", None
-                 asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-                 audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
-                 vc_input = "tts.mp3"
+                 os.makedirs("output", exist_ok=True)
+                 os.makedirs(os.path.join("output", "tts"), exist_ok=True)
+                 asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(os.path.join("output", "tts", "tts.mp3")))
+                 audio, sr = librosa.load(os.path.join("output", "tts", "tts.mp3"), sr=16000, mono=True)
+                 vc_input = os.path.join("output", "tts", "tts.mp3")
             times = [0, 0, 0]
             f0_up_key = int(f0_up_key)
             audio_opt = vc.pipeline(
@@ -120,22 +133,20 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
                 f0_file=None,
             )
             info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-             print(f"{model_name} | {info}")
+             logger.info(f"{model_name} | {info}")
             logs.append(f"Successfully Convert {model_name}\n{info}")
             yield "\n".join(logs), (tgt_sr, audio_opt)
         except Exception as err:
             info = traceback.format_exc()
-             print(info)
-             print(f"Error when using {model_name}.\n{str(err)}")
+             logger.error(info)
+             logger.error(f"Error when using {model_name}.\n{str(err)}")
             yield info, None
     return vc_fn

 def load_model():
     categories = []
+     category_count = 0
     if os.path.isfile("weights/folder_info.json"):
-         for _, w_dirs, _ in os.walk(f"weights"):
-             category_count_total = len(w_dirs)
-             category_count = 1
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
             folder_info = json.load(f)
         for category_name, category_info in folder_info.items():
@@ -144,11 +155,7 @@ def load_model():
             category_title = category_info['title']
             category_folder = category_info['folder_path']
             description = category_info['description']
-             print(f"Load {category_title} [{category_count}/{category_count_total}]")
             models = []
-             for _, m_dirs, _ in os.walk(f"weights/{category_folder}"):
-                 model_count_total = len(m_dirs)
-                 model_count = 1
             with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
                 models_info = json.load(f)
             for character_name, info in models_info.items():
@@ -177,15 +184,14 @@ def load_model():
                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                     model_version = "V2"
                 del net_g.enc_q
-                 print(net_g.load_state_dict(cpt["weight"], strict=False))
+                 logger.info(net_g.load_state_dict(cpt["weight"], strict=False))
                 net_g.eval().to(config.device)
                 if config.is_half:
                     net_g = net_g.half()
                 else:
                     net_g = net_g.float()
                 vc = VC(tgt_sr, config)
-                 print(f"Model loaded [{model_count}/{model_count_total}]: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
-                 model_count += 1
+                 logger.info(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
                 models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
             category_count += 1
             categories.append([category_title, description, models])
@@ -197,7 +203,7 @@ def load_model():
         pth_files = glob.glob(f"weights/{sub_dir}/*.pth")
         index_files = glob.glob(f"weights/{sub_dir}/*.index")
         if pth_files == []:
-             print(f"Model [{model_count}/{len(w_dirs)}]: No Model file detected, skipping...")
+             logger.debug(f"Model [{model_count}/{len(w_dirs)}]: No Model file detected, skipping...")
             continue
         cpt = torch.load(pth_files[0])
         tgt_sr = cpt["config"][-1]
@@ -217,7 +223,7 @@ def load_model():
             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
             model_version = "V2"
         del net_g.enc_q
-         print(net_g.load_state_dict(cpt["weight"], strict=False))
+         logger.info(net_g.load_state_dict(cpt["weight"], strict=False))
         net_g.eval().to(config.device)
         if config.is_half:
             net_g = net_g.half()
@@ -225,13 +231,13 @@ def load_model():
             net_g = net_g.float()
         vc = VC(tgt_sr, config)
         if index_files == []:
-             print("Warning: No Index file detected!")
+             logger.warning("No Index file detected!")
             index_info = "None"
             model_index = ""
         else:
             index_info = index_files[0]
             model_index = index_files[0]
-         print(f"Model loaded [{model_count}/{len(w_dirs)}]: {index_files[0]} / {index_info} | ({model_version})")
+         logger.info(f"Model loaded [{model_count}/{len(w_dirs)}]: {index_files[0]} / {index_info} | ({model_version})")
         model_count += 1
         models.append((index_files[0][:-4], index_files[0][:-4], "", "", model_version, create_vc_fn(index_files[0], tgt_sr, net_g, vc, if_f0, version, model_index)))
     categories.append(["Models", "", models])
@@ -239,202 +245,16 @@ def load_model():
         categories = []
     return categories

- def download_audio(url, audio_provider):
-     logs = []
-     if url == "":
-         logs.append("URL required!")
-         yield None, "\n".join(logs)
-         return None, "\n".join(logs)
-     if not os.path.exists("dl_audio"):
-         os.mkdir("dl_audio")
-     if audio_provider == "Youtube":
-         logs.append("Downloading the audio...")
-         yield None, "\n".join(logs)
-         ydl_opts = {
-             'noplaylist': True,
-             'format': 'bestaudio/best',
-             'postprocessors': [{
-                 'key': 'FFmpegExtractAudio',
-                 'preferredcodec': 'wav',
-             }],
-             "outtmpl": 'dl_audio/audio',
-         }
-         audio_path = "dl_audio/audio.wav"
-         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-             ydl.download([url])
-         logs.append("Download Complete.")
-         yield audio_path, "\n".join(logs)
-
- def cut_vocal_and_inst(split_model):
-     logs = []
-     logs.append("Starting the audio splitting process...")
-     yield "\n".join(logs), None, None, None
-     command = f"demucs --two-stems=vocals -n {split_model} dl_audio/audio.wav -o output"
-     result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
-     for line in result.stdout:
-         logs.append(line)
-         yield "\n".join(logs), None, None, None
-     print(result.stdout)
-     vocal = f"output/{split_model}/audio/vocals.wav"
-     inst = f"output/{split_model}/audio/no_vocals.wav"
-     logs.append("Audio splitting complete.")
-     yield "\n".join(logs), vocal, inst, vocal
-
- def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
-     if not os.path.exists("output/result"):
-         os.mkdir("output/result")
-     vocal_path = "output/result/output.wav"
-     output_path = "output/result/combine.mp3"
-     inst_path = f"output/{split_model}/audio/no_vocals.wav"
-     with wave.open(vocal_path, "w") as wave_file:
-         wave_file.setnchannels(1)
-         wave_file.setsampwidth(2)
-         wave_file.setframerate(audio_data[0])
-         wave_file.writeframes(audio_data[1].tobytes())
-     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
-     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-     print(result.stdout.decode())
-     return output_path
-
- def load_hubert():
-     global hubert_model
-     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-         ["hubert_base.pt"],
-         suffix="",
-     )
-     hubert_model = models[0]
-     hubert_model = hubert_model.to(config.device)
-     if config.is_half:
-         hubert_model = hubert_model.half()
-     else:
-         hubert_model = hubert_model.float()
-     hubert_model.eval()
-
- def change_audio_mode(vc_audio_mode):
-     if vc_audio_mode == "Input path":
-         return (
-             # Input & Upload
-             gr.Textbox.update(visible=True),
-             gr.Checkbox.update(visible=False),
-             gr.Audio.update(visible=False),
-             # Youtube
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             # Splitter
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Button.update(visible=False),
-             # TTS
-             gr.Textbox.update(visible=False),
-             gr.Dropdown.update(visible=False)
-         )
-     elif vc_audio_mode == "Upload audio":
-         return (
-             # Input & Upload
-             gr.Textbox.update(visible=False),
-             gr.Checkbox.update(visible=True),
-             gr.Audio.update(visible=True),
-             # Youtube
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             # Splitter
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Button.update(visible=False),
-             # TTS
-             gr.Textbox.update(visible=False),
-             gr.Dropdown.update(visible=False)
-         )
-     elif vc_audio_mode == "Youtube":
-         return (
-             # Input & Upload
-             gr.Textbox.update(visible=False),
-             gr.Checkbox.update(visible=False),
-             gr.Audio.update(visible=False),
-             # Youtube
-             gr.Dropdown.update(visible=True),
-             gr.Textbox.update(visible=True),
-             gr.Textbox.update(visible=True),
-             gr.Button.update(visible=True),
-             # Splitter
-             gr.Dropdown.update(visible=True),
-             gr.Textbox.update(visible=True),
-             gr.Button.update(visible=True),
-             gr.Audio.update(visible=True),
-             gr.Audio.update(visible=True),
-             gr.Audio.update(visible=True),
-             gr.Slider.update(visible=True),
-             gr.Slider.update(visible=True),
-             gr.Audio.update(visible=True),
-             gr.Button.update(visible=True),
-             # TTS
-             gr.Textbox.update(visible=False),
-             gr.Dropdown.update(visible=False)
-         )
-     elif vc_audio_mode == "TTS Audio":
-         return (
-             # Input & Upload
-             gr.Textbox.update(visible=False),
-             gr.Checkbox.update(visible=False),
-             gr.Audio.update(visible=False),
-             # Youtube
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             # Splitter
-             gr.Dropdown.update(visible=False),
-             gr.Textbox.update(visible=False),
-             gr.Button.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Slider.update(visible=False),
-             gr.Audio.update(visible=False),
-             gr.Button.update(visible=False),
-             # TTS
-             gr.Textbox.update(visible=True),
-             gr.Dropdown.update(visible=True)
-         )
-
- def use_microphone(microphone):
-     if microphone == True:
-         return gr.Audio.update(source="microphone")
-     else:
-         return gr.Audio.update(source="upload")
-
 if __name__ == '__main__':
-     load_hubert()
     categories = load_model()
     tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
     with gr.Blocks() as app:
         gr.Markdown(
             "<div align='center'>\n\n"+
-             "# RVC Genshin Impact\n\n"+
-             "### Recommended to use Google Colab to use other character and feature.\n\n"+
-             "[![Colab](https://img.shields.io/badge/Colab-RVC%20Genshin%20Impact-blue?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"+
-             "</div>\n\n"+
-             "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
+             "# Multi Model RVC Inference\n\n"+
+             "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
+             "</div>"
         )
         if categories == []:
             gr.Markdown(
@@ -471,8 +291,7 @@ if __name__ == '__main__':
                 # Input
                 vc_input = gr.Textbox(label="Input audio path", visible=False)
                 # Upload
-                 vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
-                 vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
+                 vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True)
                 # Youtube
                 vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
                 vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
@@ -574,7 +393,6 @@ if __name__ == '__main__':
                 # Input
                 vc_input = gr.Textbox(label="Input audio path", visible=False)
                 # Upload
-                 vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
                 vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
                 # Youtube
                 vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
@@ -702,17 +520,11 @@ if __name__ == '__main__':
                 inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model],
                 outputs=[vc_combined_output]
             )
-             vc_microphone_mode.change(
-                 fn=use_microphone,
-                 inputs=vc_microphone_mode,
-                 outputs=vc_upload
-             )
             vc_audio_mode.change(
                 fn=change_audio_mode,
                 inputs=[vc_audio_mode],
                 outputs=[
                     vc_input,
-                     vc_microphone_mode,
                     vc_upload,
                     vc_download_audio,
                     vc_link,
@@ -732,4 +544,10 @@ if __name__ == '__main__':
                     tts_voice
                 ]
             )
-     app.queue(concurrency_count=5, max_size=50, api_open=config.api).launch(share=config.share)
+     app.queue(
+         max_size=20,
+         api_open=config.api,
+     ).launch(
+         share=config.share,
+         max_threads=1,
+     )
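For context on the queue change at the end of app.py: Gradio 4 removed the concurrency_count argument that the old app.queue(concurrency_count=5, max_size=50, ...) call relied on, so the commit drops it and caps parallelism with launch(max_threads=1) instead. A minimal sketch of the old vs. new pattern (the echo handler and component names here are illustrative, not from this repo):

import gradio as gr

def echo(text: str) -> str:
    # placeholder handler; stands in for the RVC conversion function
    return text

with gr.Blocks() as app:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    inp.change(fn=echo, inputs=inp, outputs=out)

# Gradio 3.x style (no longer valid in 4.x):
#   app.queue(concurrency_count=5, max_size=50, api_open=True).launch(share=False)
# Gradio 4.x style, as in this commit:
app.queue(max_size=20, api_open=False).launch(share=False, max_threads=1)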
hubert_base.pt → assets/hubert/hubert_base.pt RENAMED
File without changes
assets/hubert/req-hubert.txt ADDED
@@ -0,0 +1 @@
+ put hubert_base.pt here
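The placeholder file only marks where the checkpoint belongs; hubert_base.pt still has to be fetched separately. A hedged sketch of one way to do that with huggingface_hub — the repo id below is an assumption, not something this commit specifies:

import os
import shutil
from huggingface_hub import hf_hub_download

# Assumption: the checkpoint is mirrored in this commonly used repo;
# substitute whatever source you actually trust.
path = hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", filename="hubert_base.pt")
os.makedirs(os.path.join("assets", "hubert"), exist_ok=True)
shutil.copy(path, os.path.join("assets", "hubert", "hubert_base.pt"))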
assets/rvmpe/req-rvmpe.txt ADDED
@@ -0,0 +1,2 @@
+ this is optional, for the rmvpe pitch extraction algorithm
+ put rmvpe.pt here
rmvpe.pt → assets/rvmpe/rmvpe.pt RENAMED
File without changes
config.py → lib/config/config.py RENAMED
@@ -13,7 +13,7 @@ class Config:
        (
            self.share,
            self.api,
-             self.unsupported
+             self.unsupported,
        ) = self.arg_parse()
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

@@ -28,7 +28,7 @@ class Config:
        return (
            cmd_opts.share,
            cmd_opts.api,
-             cmd_opts.unsupported
+             cmd_opts.unsupported,
        )

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
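The trailing-comma fix above only touches the tuple unpacking; arg_parse itself is not shown in this diff. Assuming it is a plain argparse wrapper over the three values it returns, it would look roughly like this — a hypothetical reconstruction, with flag names inferred from the attribute names rather than confirmed by the diff:

import argparse

def arg_parse():
    # Hypothetical sketch: three boolean CLI flags matching
    # self.share, self.api, self.unsupported in Config.
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", help="create a public Gradio link")
    parser.add_argument("--api", action="store_true", help="expose the API endpoints")
    parser.add_argument("--unsupported", action="store_true", help="enable unsupported audio modes")
    cmd_opts = parser.parse_args()
    return (
        cmd_opts.share,
        cmd_opts.api,
        cmd_opts.unsupported,
    )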
lib/vc/audio.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import traceback
+
+ import librosa
+ import numpy as np
+ import av
+ from io import BytesIO
+
+
+ def wav2(i, o, format):
+     inp = av.open(i, "rb")
+     if format == "m4a":
+         format = "mp4"
+     out = av.open(o, "wb", format=format)
+     if format == "ogg":
+         format = "libvorbis"
+     if format == "mp4":
+         format = "aac"
+
+     ostream = out.add_stream(format)
+
+     for frame in inp.decode(audio=0):
+         for p in ostream.encode(frame):
+             out.mux(p)
+
+     for p in ostream.encode(None):
+         out.mux(p)
+
+     out.close()
+     inp.close()
+
+
+ def audio2(i, o, format, sr):
+     inp = av.open(i, "rb")
+     out = av.open(o, "wb", format=format)
+     if format == "ogg":
+         format = "libvorbis"
+     if format == "f32le":
+         format = "pcm_f32le"
+
+     ostream = out.add_stream(format, channels=1)
+     ostream.sample_rate = sr
+
+     for frame in inp.decode(audio=0):
+         for p in ostream.encode(frame):
+             out.mux(p)
+
+     out.close()
+     inp.close()
+
+
+ def load_audio(file, sr):
+     file = (
+         file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+     )  # guard against users pasting paths wrapped in spaces, quotes, or newlines
+     if os.path.exists(file) == False:
+         raise RuntimeError(
+             "You input a wrong audio path that does not exists, please fix it!"
+         )
+     try:
+         with open(file, "rb") as f:
+             with BytesIO() as out:
+                 audio2(f, out, "f32le", sr)
+                 return np.frombuffer(out.getvalue(), np.float32).flatten()
+
+     except AttributeError:
+         audio = file[1] / 32768.0
+         if len(audio.shape) == 2:
+             audio = np.mean(audio, -1)
+         return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
+
+     except:
+         raise RuntimeError(traceback.format_exc())
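load_audio decodes any ffmpeg-readable file to a mono float32 array at the requested rate via PyAV, with a fallback for an in-memory (sample_rate, ndarray) tuple in the AttributeError branch. A small usage sketch applying the same peak normalization app.py does after loading ("sample.wav" is a stand-in path, not a file this repo ships):

import numpy as np
from lib.vc.audio import load_audio

audio = load_audio("sample.wav", 16000)   # mono float32 at 16 kHz
# Same headroom normalization as in create_vc_fn: keep peaks below ~0.95
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
    audio /= audio_max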
rmvpe.py → lib/vc/rmvpe.py RENAMED
File without changes
lib/vc/settings.py ADDED
@@ -0,0 +1,103 @@
+ import gradio as gr
+
+ def change_audio_mode(vc_audio_mode):
+     if vc_audio_mode == "Input path":
+         return (
+             # Input & Upload
+             gr.Textbox(visible=True),
+             gr.Audio(visible=False),
+             # Youtube
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             # Splitter
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Slider(visible=False),
+             gr.Slider(visible=False),
+             gr.Audio(visible=False),
+             gr.Button(visible=False),
+             # TTS
+             gr.Textbox(visible=False),
+             gr.Dropdown(visible=False)
+         )
+     elif vc_audio_mode == "Upload audio":
+         return (
+             # Input & Upload
+             gr.Textbox(visible=False),
+             gr.Audio(visible=True),
+             # Youtube
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             # Splitter
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Slider(visible=False),
+             gr.Slider(visible=False),
+             gr.Audio(visible=False),
+             gr.Button(visible=False),
+             # TTS
+             gr.Textbox(visible=False),
+             gr.Dropdown(visible=False)
+         )
+     elif vc_audio_mode == "Youtube":
+         return (
+             # Input & Upload
+             gr.Textbox(visible=False),
+             gr.Audio(visible=False),
+             # Youtube
+             gr.Dropdown(visible=True),
+             gr.Textbox(visible=True),
+             gr.Textbox(visible=True),
+             gr.Button(visible=True),
+             # Splitter
+             gr.Dropdown(visible=True),
+             gr.Textbox(visible=True),
+             gr.Button(visible=True),
+             gr.Audio(visible=True),
+             gr.Audio(visible=True),
+             gr.Audio(visible=True),
+             gr.Slider(visible=True),
+             gr.Slider(visible=True),
+             gr.Audio(visible=True),
+             gr.Button(visible=True),
+             # TTS
+             gr.Textbox(visible=False),
+             gr.Dropdown(visible=False)
+         )
+     elif vc_audio_mode == "TTS Audio":
+         return (
+             # Input & Upload
+             gr.Textbox(visible=False),
+             gr.Audio(visible=False),
+             # Youtube
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             # Splitter
+             gr.Dropdown(visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Audio(visible=False),
+             gr.Slider(visible=False),
+             gr.Slider(visible=False),
+             gr.Audio(visible=False),
+             gr.Button(visible=False),
+             # TTS
+             gr.Textbox(visible=True),
+             gr.Dropdown(visible=True)
+         )
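This module is the old in-app change_audio_mode moved out of app.py and ported to Gradio 4, where returning a component constructor such as gr.Textbox(visible=True) from an event handler replaces the removed gr.Textbox.update(...) API. A minimal wiring sketch of the same pattern (component names here are illustrative):

import gradio as gr

def toggle(mode):
    # Gradio 4: returned constructors are interpreted as updates
    # to the corresponding output components, not as new widgets.
    return (
        gr.Textbox(visible=(mode == "Input path")),
        gr.Audio(visible=(mode == "Upload audio")),
    )

with gr.Blocks() as demo:
    mode = gr.Dropdown(choices=["Input path", "Upload audio"], value="Input path")
    path_box = gr.Textbox(label="Input audio path")
    upload = gr.Audio(label="Upload audio file", visible=False)
    mode.change(fn=toggle, inputs=mode, outputs=[path_box, upload])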
lib/vc/utils.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ import wave
+ import subprocess
+ import yt_dlp
+ import ffmpeg
+ import logging
+ from fairseq import checkpoint_utils
+ logger = logging.getLogger(__name__)
+
+ def load_hubert(config):
+     path_check = os.path.exists("assets/hubert/hubert_base.pt")
+     if path_check is False:
+         logger.warning("hubert_base.pt is missing. Please check the documentation to learn how to get it.")
+     else:
+         logger.info("hubert_base.pt found.")
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         [os.path.join("assets", "hubert", "hubert_base.pt")],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+     return hubert_model
+
+ def download_audio(url, audio_provider):
+     logs = []
+     if url == "":
+         logs.append("URL required!")
+         yield None, "\n".join(logs)
+         return None, "\n".join(logs)
+     if not os.path.exists("yt"):
+         os.mkdir("yt")
+     if audio_provider == "Youtube":
+         logs.append("Downloading the audio...")
+         yield None, "\n".join(logs)
+         ydl_opts = {
+             'noplaylist': True,
+             'format': 'bestaudio/best',
+             'postprocessors': [{
+                 'key': 'FFmpegExtractAudio',
+                 'preferredcodec': 'wav',
+             }],
+             "outtmpl": 'yt/audio',
+         }
+         audio_path = "yt/audio.wav"
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         logs.append("Download Complete.")
+         yield audio_path, "\n".join(logs)
+
+ def cut_vocal_and_inst(split_model):
+     logs = []
+     logs.append("Starting the audio splitting process...")
+     yield "\n".join(logs), None, None, None
+     command = f"demucs --two-stems=vocals -n {split_model} yt/audio.wav -o output"
+     result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
+     for line in result.stdout:
+         logs.append(line)
+         yield "\n".join(logs), None, None, None
+     logger.info(result.stdout)
+     vocal = f"output/{split_model}/audio/vocals.wav"
+     inst = f"output/{split_model}/audio/no_vocals.wav"
+     logs.append("Audio splitting complete.")
+     yield "\n".join(logs), vocal, inst, vocal
+
+ def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
+     if not os.path.exists("output/result"):
+         os.mkdir("output/result")
+     vocal_path = "output/result/output.wav"
+     output_path = "output/result/combine.mp3"
+     inst_path = f"output/{split_model}/audio/no_vocals.wav"
+     with wave.open(vocal_path, "w") as wave_file:
+         wave_file.setnchannels(1)
+         wave_file.setsampwidth(2)
+         wave_file.setframerate(audio_data[0])
+         wave_file.writeframes(audio_data[1].tobytes())
+     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
+     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+     logger.info(result.stdout.decode())
+     return output_path
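One caveat with combine_vocal_and_inst as written: building the ffmpeg command with an f-string and command.split() breaks if any path contains a space. A hedged alternative that passes an argument list directly — same flags and filter graph, otherwise no behavior change:

import subprocess

def mix(inst_path, vocal_path, output_path, inst_volume=1.0, vocal_volume=1.0):
    # Argument-list form survives spaces in paths; the filter graph is
    # identical to the one in combine_vocal_and_inst.
    cmd = [
        "ffmpeg", "-y", "-i", inst_path, "-i", vocal_path,
        "-filter_complex",
        f"[0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];"
        f"[i][v]amix=inputs=2:duration=longest[a]",
        "-map", "[a]", "-b:a", "320k", "-c:a", "libmp3lame", output_path,
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
    return output_path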
vc_infer_pipeline.py → lib/vc/vc_infer_pipeline.py RENAMED
@@ -133,7 +133,7 @@ class VC(object):

            print("loading rmvpe model")
            self.model_rmvpe = RMVPE(
-                 "rmvpe.pt", is_half=self.is_half, device=self.device
+                 os.path.join("assets", "rvmpe", "rmvpe.pt"), is_half=self.is_half, device=self.device
            )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
            f0 *= pow(2, f0_up_key / 12)
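The new relative path assumes the process is started from the repository root (note that the directory really is spelled assets/rvmpe in this commit, even though the checkpoint itself is rmvpe.pt). A defensive variant that resolves the checkpoint relative to the source file instead of the working directory — a sketch, not what the commit does:

import os

# lib/vc/vc_infer_pipeline.py -> the repo root is two directory levels up
_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
RMVPE_PATH = os.path.join(_REPO_ROOT, "assets", "rvmpe", "rmvpe.pt")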
requirements.txt CHANGED
@@ -7,7 +7,7 @@ scipy==1.9.3
 librosa==0.9.1
 fairseq==0.12.2
 faiss-cpu==1.7.3
- gradio==3.40.1
+ gradio>=4.19.2
 pyworld==0.3.2
 soundfile>=0.12.1
 praat-parselmouth>=0.4.2
@@ -19,3 +19,5 @@ onnxruntime
 demucs
 edge-tts
 yt_dlp
+ pytube
+ av
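Since the code now depends on Gradio 4 behavior (constructor-style updates, the sources= argument, the new queue signature), a quick runtime guard can catch an environment that still has 3.x installed. A small sketch:

from importlib.metadata import version

major = int(version("gradio").split(".")[0])
if major < 4:
    raise RuntimeError(f"Gradio >= 4 required, found {version('gradio')}")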