Ilzhabimantara committed on
Commit
5cc9b3a
·
1 Parent(s): 98d555d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -58
app.py CHANGED
@@ -34,14 +34,17 @@ f0method_mode = []
34
  f0method_info = ""
35
  if limitation is True:
36
  audio_mode = ["Upload audio", "TTS Audio"]
37
- f0method_mode = ["pm", "harvest"]
38
- f0method_info = "PM is fast, Harvest is good but extremely slow. (Default: PM)"
39
  else:
40
- audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
41
- f0method_mode = ["pm", "harvest", "crepe"]
42
- f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
43
 
44
- def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
 
 
 
45
  def vc_fn(
46
  vc_audio_mode,
47
  vc_input,
@@ -57,7 +60,6 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
57
  protect,
58
  ):
59
  try:
60
- print(f"Converting using {model_name}...")
61
  if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
62
  audio, sr = librosa.load(vc_input, sr=16000, mono=True)
63
  elif vc_audio_mode == "Upload audio":
@@ -65,15 +67,15 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
65
  return "You need to upload an audio", None
66
  sampling_rate, audio = vc_upload
67
  duration = audio.shape[0] / sampling_rate
68
- if duration > 20 and limitation:
69
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
70
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
71
  if len(audio.shape) > 1:
72
  audio = librosa.to_mono(audio.transpose(1, 0))
73
  if sampling_rate != 16000:
74
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
75
  elif vc_audio_mode == "TTS Audio":
76
- if len(tts_text) > 100 and limitation:
77
  return "Text is too long", None
78
  if tts_text is None or tts_voice is None:
79
  return "You need to enter text and select a voice", None
@@ -104,12 +106,12 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
104
  f0_file=None,
105
  )
106
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
107
- print(f"{model_name} | {info}")
108
  return info, (tgt_sr, audio_opt)
109
  except:
110
  info = traceback.format_exc()
111
  print(info)
112
- return info, None
113
  return vc_fn
114
 
115
  def load_model():
@@ -121,7 +123,6 @@ def load_model():
121
  continue
122
  category_title = category_info['title']
123
  category_folder = category_info['folder_path']
124
- description = category_info['description']
125
  models = []
126
  with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
127
  models_info = json.load(f)
@@ -159,8 +160,8 @@ def load_model():
159
  net_g = net_g.float()
160
  vc = VC(tgt_sr, config)
161
  print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
162
- models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
163
- categories.append([category_title, category_folder, description, models])
164
  return categories
165
 
166
  def cut_vocal_and_inst(url, audio_provider, split_model):
@@ -169,17 +170,25 @@ def cut_vocal_and_inst(url, audio_provider, split_model):
169
  os.mkdir("dl_audio")
170
  if audio_provider == "Youtube":
171
  ydl_opts = {
172
- 'noplaylist': True,
173
- 'format': 'bestaudio/best',
174
- 'postprocessors': [{
175
- 'key': 'FFmpegExtractAudio',
176
- 'preferredcodec': 'wav',
177
- }],
178
- "outtmpl": 'dl_audio/youtube_audio',
179
  }
180
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
181
  ydl.download([url])
182
  audio_path = "dl_audio/youtube_audio.wav"
 
 
 
 
 
 
 
 
 
183
  if split_model == "htdemucs":
184
  command = f"demucs --two-stems=vocals {audio_path} -o output"
185
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
@@ -232,7 +241,6 @@ def change_audio_mode(vc_audio_mode):
232
  return (
233
  # Input & Upload
234
  gr.Textbox.update(visible=True),
235
- gr.Checkbox.update(visible=False),
236
  gr.Audio.update(visible=False),
237
  # Youtube
238
  gr.Dropdown.update(visible=False),
@@ -253,7 +261,6 @@ def change_audio_mode(vc_audio_mode):
253
  return (
254
  # Input & Upload
255
  gr.Textbox.update(visible=False),
256
- gr.Checkbox.update(visible=True),
257
  gr.Audio.update(visible=True),
258
  # Youtube
259
  gr.Dropdown.update(visible=False),
@@ -274,7 +281,6 @@ def change_audio_mode(vc_audio_mode):
274
  return (
275
  # Input & Upload
276
  gr.Textbox.update(visible=False),
277
- gr.Checkbox.update(visible=False),
278
  gr.Audio.update(visible=False),
279
  # Youtube
280
  gr.Dropdown.update(visible=True),
@@ -295,7 +301,6 @@ def change_audio_mode(vc_audio_mode):
295
  return (
296
  # Input & Upload
297
  gr.Textbox.update(visible=False),
298
- gr.Checkbox.update(visible=False),
299
  gr.Audio.update(visible=False),
300
  # Youtube
301
  gr.Dropdown.update(visible=False),
@@ -316,7 +321,6 @@ def change_audio_mode(vc_audio_mode):
316
  return (
317
  # Input & Upload
318
  gr.Textbox.update(visible=False),
319
- gr.Checkbox.update(visible=True),
320
  gr.Audio.update(visible=True),
321
  # Youtube
322
  gr.Dropdown.update(visible=False),
@@ -334,28 +338,20 @@ def change_audio_mode(vc_audio_mode):
334
  gr.Dropdown.update(visible=False)
335
  )
336
 
337
- def use_microphone(microphone):
338
- if microphone == True:
339
- return gr.Audio.update(source="microphone")
340
- else:
341
- return gr.Audio.update(source="upload")
342
-
343
  if __name__ == '__main__':
344
  load_hubert()
345
  categories = load_model()
346
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
347
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
348
- with gr.Blocks() as app:
349
  gr.Markdown(
350
- "<div align='center'>\n\n"+
351
- "# Multi Model RVC Inference\n\n"+
352
- "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
353
- "</div>"
354
  )
355
- for (folder_title, folder, description, models) in categories:
356
  with gr.TabItem(folder_title):
357
- if description:
358
- gr.Markdown(f"### <center> {description}")
359
  with gr.Tabs():
360
  if not models:
361
  gr.Markdown("# <center> No Model Loaded.")
@@ -375,11 +371,9 @@ if __name__ == '__main__':
375
  with gr.Row():
376
  with gr.Column():
377
  vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
378
- # Input
379
  vc_input = gr.Textbox(label="Input audio path", visible=False)
380
- # Upload
381
- vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
382
- vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
383
  # Youtube
384
  vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
385
  vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
@@ -404,8 +398,8 @@ if __name__ == '__main__':
404
  minimum=0,
405
  maximum=1,
406
  label="Retrieval feature ratio",
407
- info="(Default: 0.7)",
408
- value=0.7,
409
  interactive=True,
410
  )
411
  filter_radius0 = gr.Slider(
@@ -413,7 +407,7 @@ if __name__ == '__main__':
413
  maximum=7,
414
  label="Apply Median Filtering",
415
  info="The value represents the filter radius and can reduce breathiness.",
416
- value=3,
417
  step=1,
418
  interactive=True,
419
  )
@@ -439,7 +433,7 @@ if __name__ == '__main__':
439
  maximum=0.5,
440
  label="Voice Protection",
441
  info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
442
- value=0.5,
443
  step=0.01,
444
  interactive=True,
445
  )
@@ -463,7 +457,7 @@ if __name__ == '__main__':
463
  fn=vc_fn,
464
  inputs=[
465
  vc_audio_mode,
466
- vc_input,
467
  vc_upload,
468
  tts_text,
469
  tts_voice,
@@ -487,17 +481,11 @@ if __name__ == '__main__':
487
  inputs=[vc_output, vc_volume, vc_split_model],
488
  outputs=[vc_combined_output]
489
  )
490
- vc_microphone_mode.change(
491
- fn=use_microphone,
492
- inputs=vc_microphone_mode,
493
- outputs=vc_upload
494
- )
495
  vc_audio_mode.change(
496
  fn=change_audio_mode,
497
  inputs=[vc_audio_mode],
498
  outputs=[
499
- vc_input,
500
- vc_microphone_mode,
501
  vc_upload,
502
  vc_download_audio,
503
  vc_link,
@@ -513,4 +501,7 @@ if __name__ == '__main__':
513
  tts_voice
514
  ]
515
  )
516
- app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
 
 
 
 
34
  f0method_info = ""
35
  if limitation is True:
36
  audio_mode = ["Upload audio", "TTS Audio"]
37
+ f0method_mode = ["pm", "crepe", "harvest"]
38
+ f0method_info = "PM is fast, rmvpe is middle, Crepe or harvest is good but it was extremely slow (Default: PM)"
39
  else:
40
+ audio_mode = ["Upload audio", "Youtube", "TTS Audio"]
41
+ f0method_mode = ["pm", "crepe", "harvest"]
42
+ f0method_info = "PM is fast, rmvpe is middle. Crepe or harvest is good but it was extremely slow (Default: PM))"
43
 
44
+ if os.path.isfile("rmvpe.pt"):
45
+ f0method_mode.insert(2, "rmvpe")
46
+
47
+ def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
48
  def vc_fn(
49
  vc_audio_mode,
50
  vc_input,
 
60
  protect,
61
  ):
62
  try:
 
63
  if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
64
  audio, sr = librosa.load(vc_input, sr=16000, mono=True)
65
  elif vc_audio_mode == "Upload audio":
 
67
  return "You need to upload an audio", None
68
  sampling_rate, audio = vc_upload
69
  duration = audio.shape[0] / sampling_rate
70
+ if duration > 360 and limitation:
71
+ return "Please upload an audio file that is less than 1 minute.", None
72
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
73
  if len(audio.shape) > 1:
74
  audio = librosa.to_mono(audio.transpose(1, 0))
75
  if sampling_rate != 16000:
76
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
77
  elif vc_audio_mode == "TTS Audio":
78
+ if len(tts_text) > 600 and limitation:
79
  return "Text is too long", None
80
  if tts_text is None or tts_voice is None:
81
  return "You need to enter text and select a voice", None
 
106
  f0_file=None,
107
  )
108
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
109
+ print(f"{model_title} | {info}")
110
  return info, (tgt_sr, audio_opt)
111
  except:
112
  info = traceback.format_exc()
113
  print(info)
114
+ return info, (None, None)
115
  return vc_fn
116
 
117
  def load_model():
 
123
  continue
124
  category_title = category_info['title']
125
  category_folder = category_info['folder_path']
 
126
  models = []
127
  with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
128
  models_info = json.load(f)
 
160
  net_g = net_g.float()
161
  vc = VC(tgt_sr, config)
162
  print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
163
+ models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
164
+ categories.append([category_title, category_folder, models])
165
  return categories
166
 
167
  def cut_vocal_and_inst(url, audio_provider, split_model):
 
170
  os.mkdir("dl_audio")
171
  if audio_provider == "Youtube":
172
  ydl_opts = {
173
+ 'format': 'bestaudio/best',
174
+ 'postprocessors': [{
175
+ 'key': 'FFmpegExtractAudio',
176
+ 'preferredcodec': 'wav',
177
+ }],
178
+ "outtmpl": 'dl_audio/youtube_audio',
 
179
  }
180
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
181
  ydl.download([url])
182
  audio_path = "dl_audio/youtube_audio.wav"
183
+ else:
184
+ # Spotify doesnt work.
185
+ # Need to find other solution soon.
186
+ '''
187
+ command = f"spotdl download {url} --output dl_audio/.wav"
188
+ result = subprocess.run(command.split(), stdout=subprocess.PIPE)
189
+ print(result.stdout.decode())
190
+ audio_path = "dl_audio/spotify_audio.wav"
191
+ '''
192
  if split_model == "htdemucs":
193
  command = f"demucs --two-stems=vocals {audio_path} -o output"
194
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
 
241
  return (
242
  # Input & Upload
243
  gr.Textbox.update(visible=True),
 
244
  gr.Audio.update(visible=False),
245
  # Youtube
246
  gr.Dropdown.update(visible=False),
 
261
  return (
262
  # Input & Upload
263
  gr.Textbox.update(visible=False),
 
264
  gr.Audio.update(visible=True),
265
  # Youtube
266
  gr.Dropdown.update(visible=False),
 
281
  return (
282
  # Input & Upload
283
  gr.Textbox.update(visible=False),
 
284
  gr.Audio.update(visible=False),
285
  # Youtube
286
  gr.Dropdown.update(visible=True),
 
301
  return (
302
  # Input & Upload
303
  gr.Textbox.update(visible=False),
 
304
  gr.Audio.update(visible=False),
305
  # Youtube
306
  gr.Dropdown.update(visible=False),
 
321
  return (
322
  # Input & Upload
323
  gr.Textbox.update(visible=False),
 
324
  gr.Audio.update(visible=True),
325
  # Youtube
326
  gr.Dropdown.update(visible=False),
 
338
  gr.Dropdown.update(visible=False)
339
  )
340
 
 
 
 
 
 
 
341
  if __name__ == '__main__':
342
  load_hubert()
343
  categories = load_model()
344
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
345
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
346
+ with gr.Blocks(theme=gr.themes.Base()) as app:
347
  gr.Markdown(
348
+ "# <center> RVC Models\n"
349
+ "### <center> will update every hololive ai model that i can find or make.\n"
350
+ "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aziib/hololive-rvc-models-v2/blob/main/hololive_rvc_models_v2.ipynb)\n\n"
351
+ "[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/megaaziib)\n\n"
352
  )
353
+ for (folder_title, folder, models) in categories:
354
  with gr.TabItem(folder_title):
 
 
355
  with gr.Tabs():
356
  if not models:
357
  gr.Markdown("# <center> No Model Loaded.")
 
371
  with gr.Row():
372
  with gr.Column():
373
  vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
374
+ # Input and Upload
375
  vc_input = gr.Textbox(label="Input audio path", visible=False)
376
+ vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
 
 
377
  # Youtube
378
  vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
379
  vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
 
398
  minimum=0,
399
  maximum=1,
400
  label="Retrieval feature ratio",
401
+ info="Accents controling. Too high prob gonna sounds too robotic (Default: 0.4)",
402
+ value=0.4,
403
  interactive=True,
404
  )
405
  filter_radius0 = gr.Slider(
 
407
  maximum=7,
408
  label="Apply Median Filtering",
409
  info="The value represents the filter radius and can reduce breathiness.",
410
+ value=1,
411
  step=1,
412
  interactive=True,
413
  )
 
433
  maximum=0.5,
434
  label="Voice Protection",
435
  info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
436
+ value=0.23,
437
  step=0.01,
438
  interactive=True,
439
  )
 
457
  fn=vc_fn,
458
  inputs=[
459
  vc_audio_mode,
460
+ vc_input,
461
  vc_upload,
462
  tts_text,
463
  tts_voice,
 
481
  inputs=[vc_output, vc_volume, vc_split_model],
482
  outputs=[vc_combined_output]
483
  )
 
 
 
 
 
484
  vc_audio_mode.change(
485
  fn=change_audio_mode,
486
  inputs=[vc_audio_mode],
487
  outputs=[
488
+ vc_input,
 
489
  vc_upload,
490
  vc_download_audio,
491
  vc_link,
 
501
  tts_voice
502
  ]
503
  )
504
+ if limitation is True:
505
+ app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
506
+ else:
507
+ app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=True)