ArkanDash committed
Commit cf10d9b
1 Parent: db4e781

feat(app): ui overhaul

app-full.py CHANGED
@@ -75,40 +75,61 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             print(
                 f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
             )
-            return "Success", (tgt_sr, audio_opt)
+            return (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
             print(info)
             return info, (None, None)
     return vc_fn
 
-def cut_vocal_and_inst(yt_url):
-    if yt_url != "":
-        if not os.path.exists("youtube_audio"):
-            os.mkdir("youtube_audio")
-        ydl_opts = {
+def cut_vocal_and_inst(url, audio_provider, split_model):
+    if url != "":
+        if not os.path.exists("dl_audio"):
+            os.mkdir("dl_audio")
+        if audio_provider == "Youtube":
+            ydl_opts = {
                 'format': 'bestaudio/best',
                 'postprocessors': [{
                     'key': 'FFmpegExtractAudio',
                     'preferredcodec': 'wav',
                 }],
-            "outtmpl": 'youtube_audio/audio',
-        }
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([yt_url])
-        yt_audio_path = "youtube_audio/audio.wav"
-        command = f"demucs --two-stems=vocals {yt_audio_path}"
-        result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-        print(result.stdout.decode())
-        return ("separated/htdemucs/audio/vocals.wav", "separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "separated/htdemucs/audio/vocals.wav")
+                "outtmpl": 'dl_audio/youtube_audio',
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            audio_path = "dl_audio/youtube_audio.wav"
+        else:
+            # Spotify doesn't work.
+            # Need to find another solution soon.
+            '''
+            command = f"spotdl download {url} --output dl_audio/.wav"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            audio_path = "dl_audio/spotify_audio.wav"
+            '''
+        if split_model == "htdemucs":
+            command = f"demucs --two-stems=vocals {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
+        else:
+            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
+    else:
+        raise gr.Error("URL Required!")
+        return None, None, None, None
 
-def combine_vocal_and_inst(audio_data, audio_volume):
-    print(audio_data)
-    if not os.path.exists("result"):
-        os.mkdir("result")
-    vocal_path = "result/output.wav"
-    inst_path = "separated/htdemucs/audio/no_vocals.wav"
-    output_path = "result/combine.mp3"
+def combine_vocal_and_inst(audio_data, audio_volume, split_model):
+    if not os.path.exists("output/result"):
+        os.mkdir("output/result")
+    vocal_path = "output/result/output.wav"
+    output_path = "output/result/combine.mp3"
+    if split_model == "htdemucs":
+        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
+    else:
+        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
     with wave.open(vocal_path, "w") as wave_file:
         wave_file.setnchannels(1)
         wave_file.setsampwidth(2)
@@ -116,6 +137,7 @@ def combine_vocal_and_inst(audio_data, audio_volume):
         wave_file.writeframes(audio_data[1].tobytes())
     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+    print(result.stdout.decode())
     return output_path
 
 def load_hubert():
@@ -191,7 +213,7 @@ if __name__ == '__main__':
         categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
-            "# <center> RVC Models [(Latest Update)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/releases/tag/20230428updated)\n"
+            "# <center> RVC Models\n"
             "## <center> The input audio should be clean and pure voice without background music.\n"
             "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
@@ -218,21 +240,24 @@ if __name__ == '__main__':
                     )
                     with gr.Row():
                         with gr.Column():
-                            vc_youtube = gr.Textbox(label="Youtube URL")
-                            vc_convert = gr.Button("Convert", variant="primary")
+                            vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, value="Youtube", info="Select provider [REQUIRED: UPLOAD MODE = OFF] (Default: Youtube)")
+                            vc_link = gr.Textbox(label="Youtube URL", info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A")
+                            vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                            vc_split = gr.Button("Split Audio", variant="primary")
                             vc_vocal_preview = gr.Audio(label="Vocal Preview")
                             vc_inst_preview = gr.Audio(label="Instrumental Preview")
                             vc_audio_preview = gr.Audio(label="Audio Preview")
                         with gr.Column():
+                            upload_mode = gr.Checkbox(label="Upload mode", value=False, info="Enable to upload audio instead of audio path")
                             vc_input = gr.Textbox(label="Input audio path")
                             vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
-                            upload_mode = gr.Checkbox(label="Upload mode", value=False)
-                            vc_transpose = gr.Number(label="Transpose", value=0)
+                            vc_transpose = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                             vc_f0method = gr.Radio(
-                                label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
+                                label="Pitch extraction algorithm",
                                 choices=["pm", "harvest"],
                                 value="pm",
                                 interactive=True,
+                                info="PM is fast but Harvest is better for low frequencies. (Default: PM)"
                             )
                             vc_index_ratio = gr.Slider(
                                 minimum=0,
@@ -240,13 +265,13 @@ if __name__ == '__main__':
                                 label="Retrieval feature ratio",
                                 value=0.6,
                                 interactive=True,
+                                info="(Default: 0.6)"
                             )
                             tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
                             tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                            vc_submit = gr.Button("Generate", variant="primary")
+                            vc_output = gr.Audio(label="Output Audio", interactive=False)
+                            vc_submit = gr.Button("Convert", variant="primary")
                         with gr.Column():
                             vc_volume = gr.Slider(
                                 minimum=0,
@@ -254,13 +279,14 @@ if __name__ == '__main__':
                                 label="Vocal volume",
                                 value=4,
                                 interactive=True,
-                                step=1
+                                step=1,
+                                info="Adjust vocal volume (Default: 4)"
                             )
-                            vc_outputCombine = gr.Audio(label="Output Combined Audio")
+                            vc_combined_output = gr.Audio(label="Output Combined Audio")
                             vc_combine = gr.Button("Combine",variant="primary")
-                    vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
-                    vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
-                    vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
+                    vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output])
+                    vc_split.click(cut_vocal_and_inst, [vc_link, vc_download_audio, vc_split_model], [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
+                    vc_combine.click(combine_vocal_and_inst, [vc_output, vc_volume, vc_split_model], vc_combined_output)
                     tts_mode.change(change_to_tts_mode, [tts_mode, upload_mode], [vc_input, vc_upload, upload_mode, tts_text, tts_voice])
                     upload_mode.change(change_to_upload_mode, [upload_mode], [vc_input, vc_upload])
     app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
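
Note: the new cut_vocal_and_inst chains a yt_dlp download with the Demucs CLI. Below is a minimal standalone sketch of that pipeline, assuming yt-dlp and demucs are installed with ffmpeg on PATH, and that Demucs writes stems to output/<model>/<track>/{vocals,no_vocals}.wav as the diff relies on; download_and_split and the example URL are illustrative names, not part of the commit.

    # Sketch only: mirrors the download-and-split flow from cut_vocal_and_inst above.
    import os
    import subprocess

    import yt_dlp

    def download_and_split(url: str, split_model: str = "htdemucs"):
        os.makedirs("dl_audio", exist_ok=True)
        ydl_opts = {
            "format": "bestaudio/best",
            "postprocessors": [{
                "key": "FFmpegExtractAudio",  # re-encode the download to WAV
                "preferredcodec": "wav",
            }],
            "outtmpl": "dl_audio/youtube_audio",  # the postprocessor appends .wav
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        audio_path = "dl_audio/youtube_audio.wav"
        # Demucs names the stem folder after the input file's stem ("youtube_audio")
        subprocess.run(
            ["demucs", "--two-stems=vocals", "-n", split_model, audio_path, "-o", "output"],
            check=True,
        )
        stem_dir = f"output/{split_model}/youtube_audio"
        return f"{stem_dir}/vocals.wav", f"{stem_dir}/no_vocals.wav", audio_path

    # Hypothetical usage:
    # vocals, inst, full = download_and_split("https://www.youtube.com/watch?v=...", "mdx_extra_q")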
app.py CHANGED
@@ -67,7 +67,7 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             print(
                 f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
             )
-            return "Success", (tgt_sr, audio_opt)
+            return (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
             print(info)
@@ -138,7 +138,7 @@ if __name__ == '__main__':
         categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
-            "# <center> RVC Models [(Latest Update)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/releases/tag/20230428updated)\n"
+            "# <center> RVC Models\n"
             "## <center> The input audio should be clean and pure voice without background music.\n"
             "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
@@ -167,12 +167,13 @@ if __name__ == '__main__':
                     with gr.Row():
                         with gr.Column():
                             vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
-                            vc_transpose = gr.Number(label="Transpose", value=0)
+                            vc_transpose = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                             vc_f0method = gr.Radio(
-                                label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
+                                label="Pitch extraction algorithm",
                                 choices=["pm", "harvest"],
                                 value="pm",
                                 interactive=True,
+                                info="PM is fast but Harvest is better for low frequencies. (Default: PM)"
                             )
                             vc_index_ratio = gr.Slider(
                                 minimum=0,
@@ -180,14 +181,14 @@ if __name__ == '__main__':
                                 label="Retrieval feature ratio",
                                 value=0.6,
                                 interactive=True,
+                                info="(Default: 0.6)"
                             )
                             tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
                             tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                             vc_submit = gr.Button("Generate", variant="primary")
                         with gr.Column():
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                            vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
+                            vc_output = gr.Audio(label="Output Audio")
+                            vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output])
                     tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
     app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
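
Note: both files change vc_fn to return only (tgt_sr, audio_opt) and drop the "Output Message" textbox, so the button wires to a single gr.Audio output. This works because gr.Audio accepts a (sample_rate, numpy_array) tuple as its value. A minimal sketch of that wiring, assuming Gradio 3.x as used by this Space; fake_vc_fn and the 440 Hz tone are placeholders:

    # Sketch only: single-audio-output wiring like the updated vc_submit.click call.
    import gradio as gr
    import numpy as np

    def fake_vc_fn():
        sr = 16000
        tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of A4
        return (sr, tone.astype(np.float32))  # same return shape as the new vc_fn

    with gr.Blocks() as demo:
        vc_output = gr.Audio(label="Output Audio")
        vc_submit = gr.Button("Convert", variant="primary")
        vc_submit.click(fake_vc_fn, [], [vc_output])

    # demo.launch()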
weights/anime/model_info.json ADDED
@@ -0,0 +1,10 @@
+{
+    "sistine-fibel": {
+        "enable": true,
+        "name": "sistine-fibel",
+        "title": "Rokudenashi Majutsu Koushi to Akashic Records - Sistine Fibel",
+        "cover": "cover.png",
+        "feature_retrieval_library": "added_IVF412_Flat_nprobe_1.index",
+        "author":"baguss"
+    }
+}
weights/anime/sistine-fibel/added_IVF412_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21fa6e62422d5ee20a6ce0ad0fb1f6017fb0725bae06ad6a564ba10597fcfc2a
+size 17032267
weights/anime/sistine-fibel/cover.png ADDED

Git LFS Details

  • SHA256: 53d76e0396eb17c976d822031a799ba4b52712b69011bacf7c3e32a82a298a9f
  • Pointer size: 131 Bytes
  • Size of remote file: 608 kB
weights/anime/sistine-fibel/sistine-fibel.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb5c1243c98d85a4c6da69aedbbe805cffc5447bf2f26d47b145114acee29397
+size 55026095
weights/folder_info.json CHANGED
@@ -3,6 +3,12 @@
     "enable": true,
     "title": "Genshin Impact",
     "folder_path": "genshin-impact",
-    "description": ""
+    "description": "Models from [RVC Genshin Impact](https://huggingface.co/ArkanDash/rvc-genshin-impact)"
+  },
+  "anime":{
+    "enable": true,
+    "title": "Anime",
+    "folder_path": "anime",
+    "description": "Models from [RVC Rokudenashi Akashic Records](https://huggingface.co/baguss/RVC_Rokudenashi_Akashic_Records)"
   }
 }
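
Note: folder_info.json now carries a markdown description per category and registers the new anime folder; each category folder pairs it with its own model_info.json. The loader in app.py is not part of this diff, so the following is only a sketch of how the two files could be walked to build the categories list seen in the context line categories.append([...]); load_categories is a hypothetical name.

    # Sketch only: walking folder_info.json plus per-folder model_info.json.
    import json
    import os

    def load_categories(weights_root="weights"):
        categories = []
        with open(os.path.join(weights_root, "folder_info.json")) as f:
            folder_info = json.load(f)
        for category in folder_info.values():
            if not category["enable"]:
                continue  # disabled categories are skipped entirely
            folder = category["folder_path"]
            with open(os.path.join(weights_root, folder, "model_info.json")) as f:
                model_info = json.load(f)
            models = [m for m in model_info.values() if m["enable"]]
            categories.append([category["title"], folder, category["description"], models])
        return categories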
weights/genshin-impact/klee-jp/added_IVF1036_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aea4838e463962216484dcbd42804eb0ef61f59b5a596afa20aae4e37df79b21
+size 42770347
weights/genshin-impact/klee-jp/cover.png ADDED

Git LFS Details

  • SHA256: 05945712a7515bd579b09e6b40ec50c4574e5fcb34a0d8814ff901ce624732dd
  • Pointer size: 132 Bytes
  • Size of remote file: 1 MB
weights/genshin-impact/klee-jp/klee-jp.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:207e17307ea789211b670d5db1fc5b1072716fa74323557dd70818aac2c878d4
+size 55026095
weights/genshin-impact/model_info.json CHANGED
@@ -87,6 +87,14 @@
     "feature_retrieval_library": "added_IVF2062_Flat_nprobe_1.index",
     "author":"ArkanDash"
   },
+  "klee-jp": {
+    "enable": true,
+    "name": "klee-jp",
+    "title": "Genshin Impact - Klee",
+    "cover": "cover.png",
+    "feature_retrieval_library": "added_IVF1036_Flat_nprobe_1.index",
+    "author":"ArkanDash"
+  },
   "fischl-jp": {
     "enable": true,
     "name": "fischl-jp",