ArkanDash committed
Commit cf10d9b
1 Parent: db4e781

feat(app): ui overhaul

app-full.py CHANGED
@@ -75,40 +75,61 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             print(
                 f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
             )
-            return "Success", (tgt_sr, audio_opt)
+            return (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
             print(info)
             return info, (None, None)
     return vc_fn
 
-def cut_vocal_and_inst(yt_url):
-    if yt_url != "":
-        if not os.path.exists("youtube_audio"):
-            os.mkdir("youtube_audio")
-        ydl_opts = {
+def cut_vocal_and_inst(url, audio_provider, split_model):
+    if url != "":
+        if not os.path.exists("dl_audio"):
+            os.mkdir("dl_audio")
+        if audio_provider == "Youtube":
+            ydl_opts = {
                 'format': 'bestaudio/best',
                 'postprocessors': [{
                     'key': 'FFmpegExtractAudio',
                     'preferredcodec': 'wav',
                 }],
-            "outtmpl": 'youtube_audio/audio',
-        }
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([yt_url])
-        yt_audio_path = "youtube_audio/audio.wav"
-        command = f"demucs --two-stems=vocals {yt_audio_path}"
-        result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-        print(result.stdout.decode())
-        return ("separated/htdemucs/audio/vocals.wav", "separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "separated/htdemucs/audio/vocals.wav")
+                "outtmpl": 'dl_audio/youtube_audio',
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            audio_path = "dl_audio/youtube_audio.wav"
+        else:
+            # Spotify doesn't work.
+            # Need to find another solution soon.
+            '''
+            command = f"spotdl download {url} --output dl_audio/.wav"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            audio_path = "dl_audio/spotify_audio.wav"
+            '''
+        if split_model == "htdemucs":
+            command = f"demucs --two-stems=vocals {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
+        else:
+            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
+    else:
+        raise gr.Error("URL Required!")
+        return None, None, None, None
 
-def combine_vocal_and_inst(audio_data, audio_volume):
-    print(audio_data)
-    if not os.path.exists("result"):
-        os.mkdir("result")
-    vocal_path = "result/output.wav"
-    inst_path = "separated/htdemucs/audio/no_vocals.wav"
-    output_path = "result/combine.mp3"
+def combine_vocal_and_inst(audio_data, audio_volume, split_model):
+    if not os.path.exists("output/result"):
+        os.mkdir("output/result")
+    vocal_path = "output/result/output.wav"
+    output_path = "output/result/combine.mp3"
+    if split_model == "htdemucs":
+        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
+    else:
+        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
     with wave.open(vocal_path, "w") as wave_file:
         wave_file.setnchannels(1)
         wave_file.setsampwidth(2)
@@ -116,6 +137,7 @@ def combine_vocal_and_inst(audio_data, audio_volume):
         wave_file.writeframes(audio_data[1].tobytes())
     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+    print(result.stdout.decode())
     return output_path
 
 def load_hubert():
@@ -191,7 +213,7 @@ if __name__ == '__main__':
         categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
-            "# <center> RVC Models [(Latest Update)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/releases/tag/20230428updated)\n"
+            "# <center> RVC Models\n"
             "## <center> The input audio should be clean and pure voice without background music.\n"
             "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
@@ -218,21 +240,24 @@ if __name__ == '__main__':
                     )
                     with gr.Row():
                         with gr.Column():
-                            vc_youtube = gr.Textbox(label="Youtube URL")
-                            vc_convert = gr.Button("Convert", variant="primary")
+                            vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, value="Youtube", info="Select provider [REQUIRED: UPLOAD MODE = OFF] (Default: Youtube)")
+                            vc_link = gr.Textbox(label="Youtube URL", info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A")
+                            vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                            vc_split = gr.Button("Split Audio", variant="primary")
                             vc_vocal_preview = gr.Audio(label="Vocal Preview")
                             vc_inst_preview = gr.Audio(label="Instrumental Preview")
                             vc_audio_preview = gr.Audio(label="Audio Preview")
                         with gr.Column():
+                            upload_mode = gr.Checkbox(label="Upload mode", value=False, info="Enable to upload audio instead of audio path")
                             vc_input = gr.Textbox(label="Input audio path")
                             vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
-                            upload_mode = gr.Checkbox(label="Upload mode", value=False)
-                            vc_transpose = gr.Number(label="Transpose", value=0)
+                            vc_transpose = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                             vc_f0method = gr.Radio(
-                                label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
+                                label="Pitch extraction algorithm",
                                 choices=["pm", "harvest"],
                                 value="pm",
                                 interactive=True,
+                                info="PM is fast but Harvest is better for low frequencies. (Default: PM)"
                             )
                             vc_index_ratio = gr.Slider(
                                 minimum=0,
@@ -240,13 +265,13 @@ if __name__ == '__main__':
                                 label="Retrieval feature ratio",
                                 value=0.6,
                                 interactive=True,
+                                info="(Default: 0.6)"
                             )
                             tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
                             tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                            vc_submit = gr.Button("Generate", variant="primary")
+                            vc_output = gr.Audio(label="Output Audio", interactive=False)
+                            vc_submit = gr.Button("Convert", variant="primary")
                         with gr.Column():
                             vc_volume = gr.Slider(
                                 minimum=0,
@@ -254,13 +279,14 @@ if __name__ == '__main__':
                                 label="Vocal volume",
                                 value=4,
                                 interactive=True,
-                                step=1
+                                step=1,
+                                info="Adjust vocal volume (Default: 4)"
                             )
-                            vc_outputCombine = gr.Audio(label="Output Combined Audio")
+                            vc_combined_output = gr.Audio(label="Output Combined Audio")
                             vc_combine = gr.Button("Combine",variant="primary")
-                    vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
-                    vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
-                    vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
+                    vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output])
+                    vc_split.click(cut_vocal_and_inst, [vc_link, vc_download_audio, vc_split_model], [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
+                    vc_combine.click(combine_vocal_and_inst, [vc_output, vc_volume, vc_split_model], vc_combined_output)
                     tts_mode.change(change_to_tts_mode, [tts_mode, upload_mode], [vc_input, vc_upload, upload_mode, tts_text, tts_voice])
                     upload_mode.change(change_to_upload_mode, [upload_mode], [vc_input, vc_upload])
     app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
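
Note: the new cut_vocal_and_inst chains a yt_dlp download with the Demucs CLI. Below is a minimal standalone sketch of that pipeline, assuming yt-dlp and demucs are installed with ffmpeg on PATH, and that Demucs writes stems to output/<model>/<track>/{vocals,no_vocals}.wav as the diff relies on; download_and_split and the example URL are illustrative names, not part of the commit.

    # Sketch only: mirrors the download-and-split flow from cut_vocal_and_inst above.
    import os
    import subprocess

    import yt_dlp

    def download_and_split(url: str, split_model: str = "htdemucs"):
        os.makedirs("dl_audio", exist_ok=True)
        ydl_opts = {
            "format": "bestaudio/best",
            "postprocessors": [{
                "key": "FFmpegExtractAudio",  # re-encode the download to WAV
                "preferredcodec": "wav",
            }],
            "outtmpl": "dl_audio/youtube_audio",  # the postprocessor appends .wav
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        audio_path = "dl_audio/youtube_audio.wav"
        # Demucs names the stem folder after the input file's stem ("youtube_audio")
        subprocess.run(
            ["demucs", "--two-stems=vocals", "-n", split_model, audio_path, "-o", "output"],
            check=True,
        )
        stem_dir = f"output/{split_model}/youtube_audio"
        return f"{stem_dir}/vocals.wav", f"{stem_dir}/no_vocals.wav", audio_path

    # Hypothetical usage:
    # vocals, inst, full = download_and_split("https://www.youtube.com/watch?v=...", "mdx_extra_q")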
app.py CHANGED
@@ -67,7 +67,7 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             print(
                 f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
             )
-            return "Success", (tgt_sr, audio_opt)
+            return (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
             print(info)
@@ -138,7 +138,7 @@ if __name__ == '__main__':
         categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
-            "# <center> RVC Models [(Latest Update)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/releases/tag/20230428updated)\n"
+            "# <center> RVC Models\n"
             "## <center> The input audio should be clean and pure voice without background music.\n"
             "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
@@ -167,12 +167,13 @@ if __name__ == '__main__':
                     with gr.Row():
                         with gr.Column():
                             vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
-                            vc_transpose = gr.Number(label="Transpose", value=0)
+                            vc_transpose = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                             vc_f0method = gr.Radio(
-                                label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
+                                label="Pitch extraction algorithm",
                                 choices=["pm", "harvest"],
                                 value="pm",
                                 interactive=True,
+                                info="PM is fast but Harvest is better for low frequencies. (Default: PM)"
                             )
                             vc_index_ratio = gr.Slider(
                                 minimum=0,
@@ -180,14 +181,14 @@ if __name__ == '__main__':
                                 label="Retrieval feature ratio",
                                 value=0.6,
                                 interactive=True,
+                                info="(Default: 0.6)"
                             )
                             tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
                             tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                             vc_submit = gr.Button("Generate", variant="primary")
                         with gr.Column():
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                            vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
+                            vc_output = gr.Audio(label="Output Audio")
+                            vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output])
                     tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
     app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
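
Note: both files change vc_fn to return only (tgt_sr, audio_opt) and drop the "Output Message" textbox, so the button wires to a single gr.Audio output. This works because gr.Audio accepts a (sample_rate, numpy_array) tuple as its value. A minimal sketch of that wiring, assuming Gradio 3.x as used by this Space; fake_vc_fn and the 440 Hz tone are placeholders:

    # Sketch only: single-audio-output wiring like the updated vc_submit.click call.
    import gradio as gr
    import numpy as np

    def fake_vc_fn():
        sr = 16000
        tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of A4
        return (sr, tone.astype(np.float32))  # same return shape as the new vc_fn

    with gr.Blocks() as demo:
        vc_output = gr.Audio(label="Output Audio")
        vc_submit = gr.Button("Convert", variant="primary")
        vc_submit.click(fake_vc_fn, [], [vc_output])

    # demo.launch()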
weights/anime/model_info.json ADDED
@@ -0,0 +1,10 @@
+{
+    "sistine-fibel": {
+        "enable": true,
+        "name": "sistine-fibel",
+        "title": "Rokudenashi Majutsu Koushi to Akashic Records - Sistine Fibel",
+        "cover": "cover.png",
+        "feature_retrieval_library": "added_IVF412_Flat_nprobe_1.index",
+        "author":"baguss"
+    }
+}
weights/anime/sistine-fibel/added_IVF412_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21fa6e62422d5ee20a6ce0ad0fb1f6017fb0725bae06ad6a564ba10597fcfc2a
+size 17032267
weights/anime/sistine-fibel/cover.png ADDED

Git LFS Details

  • SHA256: 53d76e0396eb17c976d822031a799ba4b52712b69011bacf7c3e32a82a298a9f
  • Pointer size: 131 Bytes
  • Size of remote file: 608 kB
weights/anime/sistine-fibel/sistine-fibel.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb5c1243c98d85a4c6da69aedbbe805cffc5447bf2f26d47b145114acee29397
+size 55026095
weights/folder_info.json CHANGED
@@ -3,6 +3,12 @@
     "enable": true,
     "title": "Genshin Impact",
     "folder_path": "genshin-impact",
-    "description": ""
+    "description": "Models from [RVC Genshin Impact](https://huggingface.co/ArkanDash/rvc-genshin-impact)"
+  },
+  "anime":{
+    "enable": true,
+    "title": "Anime",
+    "folder_path": "anime",
+    "description": "Models from [RVC Rokudenashi Akashic Records](https://huggingface.co/baguss/RVC_Rokudenashi_Akashic_Records)"
   }
 }
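
Note: folder_info.json now carries a markdown description per category and registers the new anime folder; each category folder pairs it with its own model_info.json. The loader in app.py is not part of this diff, so the following is only a sketch of how the two files could be walked to build the categories list seen in the context line categories.append([...]); load_categories is a hypothetical name.

    # Sketch only: walking folder_info.json plus per-folder model_info.json.
    import json
    import os

    def load_categories(weights_root="weights"):
        categories = []
        with open(os.path.join(weights_root, "folder_info.json")) as f:
            folder_info = json.load(f)
        for category in folder_info.values():
            if not category["enable"]:
                continue  # disabled categories are skipped entirely
            folder = category["folder_path"]
            with open(os.path.join(weights_root, folder, "model_info.json")) as f:
                model_info = json.load(f)
            models = [m for m in model_info.values() if m["enable"]]
            categories.append([category["title"], folder, category["description"], models])
        return categories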
weights/genshin-impact/klee-jp/added_IVF1036_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aea4838e463962216484dcbd42804eb0ef61f59b5a596afa20aae4e37df79b21
+size 42770347
weights/genshin-impact/klee-jp/cover.png ADDED

Git LFS Details

  • SHA256: 05945712a7515bd579b09e6b40ec50c4574e5fcb34a0d8814ff901ce624732dd
  • Pointer size: 132 Bytes
  • Size of remote file: 1 MB
weights/genshin-impact/klee-jp/klee-jp.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:207e17307ea789211b670d5db1fc5b1072716fa74323557dd70818aac2c878d4
+size 55026095
weights/genshin-impact/model_info.json CHANGED
@@ -87,6 +87,14 @@
     "feature_retrieval_library": "added_IVF2062_Flat_nprobe_1.index",
     "author":"ArkanDash"
   },
+  "klee-jp": {
+    "enable": true,
+    "name": "klee-jp",
+    "title": "Genshin Impact - Klee",
+    "cover": "cover.png",
+    "feature_retrieval_library": "added_IVF1036_Flat_nprobe_1.index",
+    "author":"ArkanDash"
+  },
   "fischl-jp": {
     "enable": true,
     "name": "fischl-jp",