Spaces:

thejenja
/

Applio-V3

Running

Ilaria committed on Dec 4, 2023

Commit

7cfb6ba

1 Parent(s): 2afc955

New Version - 3.0

- New support for Ilaria TTS (currently the best TTS for HF in terms of resource usage)

- Fixed support for ElevenLabs and Google TTS

- Faster inference

- Better looking UI

- Various bug fixes

- Removed Herobrine

Files changed (4) hide show

app.py +58 -33
ilariatts.py +230 -0
requirements.txt +2 -1
vc_infer_pipeline.py +9 -8

app.py CHANGED Viewed

@@ -21,6 +21,11 @@ warnings.filterwarnings("ignore")
 torch.manual_seed(114514)
 from i18n import I18nAuto
 import signal
 import math
@@ -1445,6 +1450,12 @@ def elevenTTS(xiapi, text, id, lang):
         aud_path = save_to_wav('./temp_gTTS.mp3')
         return aud_path, aud_path
 def upload_to_dataset(files, dir):
     if dir == '':
         dir = './dataset'
@@ -1470,7 +1481,7 @@ def zip_downloader(model):
     else:
         return f'./weights/{model}.pth', "Could not find Index file."
-with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
     with gr.Tabs():
         with gr.TabItem("Inference"):
             gr.HTML("<h1>  Ilaria RVC 💖   </h1>")
@@ -1525,11 +1536,11 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                         dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
                         refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
                         record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
-                        record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
                     with gr.Row():
-                        with gr.Accordion('Text To Speech', open=False):
                             with gr.Column():
-                                lang = gr.Radio(label='Chinese & Japanese do not work with ElevenLabs currently.',choices=['en','es','fr','pt','zh-CN','de','hi','ja'], value='en')
                                 api_box = gr.Textbox(label="Enter your API Key for ElevenLabs, or leave empty to use GoogleTTS", value='')
                                 elevenid=gr.Dropdown(label="Voice:", choices=eleven_voices)
                             with gr.Column():
@@ -1537,7 +1548,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                                 tts_button = gr.Button(value="Speak")
                                 tts_button.click(fn=elevenTTS, inputs=[api_box,tfs, elevenid, lang], outputs=[record_button, input_audio0])
                     with gr.Row():
-                        with gr.Accordion('Wav2Lip', open=False):
                             with gr.Row():
                                 size = gr.Radio(label='Resolution:',choices=['Half','Full'])
                                 face = gr.UploadButton("Upload A Character",type='file')
@@ -1550,37 +1561,50 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                                 refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
                             with gr.Row():
                                 animate_button = gr.Button('Animate')
                 with gr.Column():
-                    with gr.Accordion("Index Settings", open=False):
-                        file_index1 = gr.Dropdown(
-                            label="3. Choose the index file (in case it wasn't automatically found.)",
-                            choices=get_indexes(),
-                            value=get_index(),
-                            interactive=True,
-                            )
-                        sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
-                        refresh_button.click(
-                            fn=change_choices, inputs=[], outputs=[sid0, file_index1]
-                            )
-                        # file_big_npy1 = gr.Textbox(
-                        #     label=i18n("特征文件路径"),
-                        #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
-                        #     interactive=True,
-                        # )
-                        index_rate1 = gr.Slider(
-                            minimum=0,
-                            maximum=1,
-                            label=i18n("检索特征占比"),
-                            value=0.66,
-                            interactive=True,
-                            )
                     vc_output2 = gr.Audio(
                         label="Final Result! (Click on the three dots to download the audio)",
                         type='filepath',
                         interactive=False,
                     )
                     animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
                     with gr.Accordion("Advanced Options", open=False):
                         f0method0 = gr.Radio(
                             label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
@@ -1679,6 +1703,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                         formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
                         frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
                         formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
             with gr.Row():
                 vc_output1 = gr.Textbox("")
                 f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), visible=False)
@@ -1704,7 +1729,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                     [vc_output1, vc_output2],
                 )
-            with gr.Accordion("Batch Conversion",open=False):
                 with gr.Row():
                     with gr.Column():
                         vc_transform1 = gr.Number(
@@ -1828,7 +1853,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
                 model = gr.Textbox(label="Name of the model (without spaces):")
                 download_button=gr.Button("Download")
             with gr.Row():
-                status_bar=gr.Textbox(label="")
                 download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
             with gr.Row():
                 gr.Markdown(
@@ -2080,9 +2105,9 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
         else:
             print(
                 "Pretrained weights not downloaded. Disabling training tab.\n"
-                "Wondering how to train a voice? Visit here for the RVC model training guide: https://t.ly/RVC_Training_Guide\n"
                 "-------------------------------\n"
             )
-    app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=True)
 #endregion

 torch.manual_seed(114514)
 from i18n import I18nAuto
+import edge_tts, asyncio
+from ilariatts import tts_order_voice
+# Voice label -> edge-tts voice name; looked up by ilariaTTS().
+language_dict = tts_order_voice
+# Materialize the labels as a real list (dict insertion order is preserved):
+# gr.Dropdown documents `choices` as a list, not a live dict view.
+ilariavoices = list(language_dict)
 import signal
 import math
         aud_path = save_to_wav('./temp_gTTS.mp3')
         return aud_path, aud_path
+def ilariaTTS(text, ttsvoice):
+    """Synthesize `text` with the selected IlariaTTS (edge-tts) voice.
+
+    Args:
+        text: The text to speak.
+        ttsvoice: Voice label; must be a key of `language_dict`.
+
+    Returns:
+        The path returned by `save_to_wav`, twice (one value per Gradio
+        output component wired to this callback).
+
+    Raises:
+        ValueError: If `text` is empty/blank or `ttsvoice` is not a known label.
+    """
+    # Fail early with a readable message instead of an opaque KeyError or an
+    # error raised from inside edge-tts.
+    if not text or not text.strip():
+        raise ValueError("IlariaTTS: please enter some text to speak.")
+    vo = language_dict.get(ttsvoice)
+    if vo is None:
+        raise ValueError(f"IlariaTTS: unknown voice {ttsvoice!r}.")
+    # NOTE(review): this fixed temp file name is shared by all requests, so
+    # concurrent calls may overwrite each other's audio - confirm how
+    # save_to_wav consumes the file before switching to a unique name.
+    asyncio.run(edge_tts.Communicate(text, vo).save("./temp_ilaria.mp3"))
+    aud_path = save_to_wav('./temp_ilaria.mp3')
+    return aud_path, aud_path
 def upload_to_dataset(files, dir):
     if dir == '':
         dir = './dataset'
     else:
         return f'./weights/{model}.pth', "Could not find Index file."
+with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose"), title="Ilaria RVC 💖") as app:
     with gr.Tabs():
         with gr.TabItem("Inference"):
             gr.HTML("<h1>  Ilaria RVC 💖   </h1>")
                         dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
                         refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
                         record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
+                        record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
                     with gr.Row():
+                        with gr.Accordion('ElevenLabs / Google TTS', open=False):
                             with gr.Column():
+                                lang = gr.Radio(label='Chinese & Japanese do not work with ElevenLabs currently.',choices=['en','it','es','fr','pt','zh-CN','de','hi','ja'], value='en')
                                 api_box = gr.Textbox(label="Enter your API Key for ElevenLabs, or leave empty to use GoogleTTS", value='')
                                 elevenid=gr.Dropdown(label="Voice:", choices=eleven_voices)
                             with gr.Column():
                                 tts_button = gr.Button(value="Speak")
                                 tts_button.click(fn=elevenTTS, inputs=[api_box,tfs, elevenid, lang], outputs=[record_button, input_audio0])
                     with gr.Row():
+                        with gr.Accordion('Wav2Lip', open=False, visible=False):
                             with gr.Row():
                                 size = gr.Radio(label='Resolution:',choices=['Half','Full'])
                                 face = gr.UploadButton("Upload A Character",type='file')
                                 refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
                             with gr.Row():
                                 animate_button = gr.Button('Animate')
                 with gr.Column():
                     vc_output2 = gr.Audio(
                         label="Final Result! (Click on the three dots to download the audio)",
                         type='filepath',
                         interactive=False,
                     )
+                    with gr.Accordion('IlariaTTS', open=True):
+                        with gr.Column():
+                            ilariaid=gr.Dropdown(label="Voice:", choices=ilariavoices, value="English-Jenny (Female)")
+                            ilariatext = gr.Textbox(label="Input your Text", interactive=True, value="This is a test.")
+                            ilariatts_button = gr.Button(value="Speak")
+                            ilariatts_button.click(fn=ilariaTTS, inputs=[ilariatext, ilariaid], outputs=[record_button, input_audio0])
+                #with gr.Column():
+                    with gr.Accordion("Index Settings", open=False):
+                        #with gr.Row():
+                            file_index1 = gr.Dropdown(
+                                label="3. Choose the index file (in case it wasn't automatically found.)",
+                                choices=get_indexes(),
+                                value=get_index(),
+                                interactive=True,
+                                )
+                            sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
+                            refresh_button.click(
+                                fn=change_choices, inputs=[], outputs=[sid0, file_index1]
+                                )
+                            # file_big_npy1 = gr.Textbox(
+                            #     label=i18n("特征文件路径"),
+                            #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+                            #     interactive=True,
+                            # )
+                            index_rate1 = gr.Slider(
+                                minimum=0,
+                                maximum=1,
+                                label=i18n("检索特征占比"),
+                                value=0.66,
+                                interactive=True,
+                                )
                     animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
                     with gr.Accordion("Advanced Options", open=False):
                         f0method0 = gr.Radio(
                             label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
                         formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
                         frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
                         formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
             with gr.Row():
                 vc_output1 = gr.Textbox("")
                 f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), visible=False)
                     [vc_output1, vc_output2],
                 )
+            with gr.Accordion("Batch Conversion",open=False, visible=False):
                 with gr.Row():
                     with gr.Column():
                         vc_transform1 = gr.Number(
                 model = gr.Textbox(label="Name of the model (without spaces):")
                 download_button=gr.Button("Download")
             with gr.Row():
+                status_bar=gr.Textbox(label="Download Status")
                 download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
             with gr.Row():
                 gr.Markdown(
         else:
             print(
                 "Pretrained weights not downloaded. Disabling training tab.\n"
+                "Wondering how to train a voice? Join AI HUB Discord Server! https://discord.gg/aihub\n"
                 "-------------------------------\n"
             )
+    app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False)
 #endregion

ilariatts.py ADDED Viewed

	@@ -0,0 +1,230 @@

+# Maps the human-readable voice label (used as the dropdown choices in app.py)
+# to the edge-tts voice name that app.py passes to edge_tts.Communicate.
+tts_order_voice = {'English-Jenny (Female)': 'en-US-JennyNeural',
+ 'English-Guy (Male)': 'en-US-GuyNeural',
+ 'English-Ana (Female)': 'en-US-AnaNeural',
+ 'English-Aria (Female)': 'en-US-AriaNeural',
+ 'English-Christopher (Male)': 'en-US-ChristopherNeural',
+ 'English-Eric (Male)': 'en-US-EricNeural',
+ 'English-Michelle (Female)': 'en-US-MichelleNeural',
+ 'English-Roger (Male)': 'en-US-RogerNeural',
+ 'Spanish (Mexican)-Dalia (Female)': 'es-MX-DaliaNeural',
+ 'Spanish (Mexican)-Jorge- (Male)': 'es-MX-JorgeNeural',
+ 'Korean-Sun-Hi- (Female)': 'ko-KR-SunHiNeural',
+ 'Korean-InJoon- (Male)': 'ko-KR-InJoonNeural',
+'Thai-Premwadee- (Female)': 'th-TH-PremwadeeNeural',
+ 'Thai-Niwat- (Male)': 'th-TH-NiwatNeural',
+ 'Vietnamese-HoaiMy- (Female)': 'vi-VN-HoaiMyNeural',
+'Vietnamese-NamMinh- (Male)': 'vi-VN-NamMinhNeural',
+ 'Japanese-Nanami- (Female)': 'ja-JP-NanamiNeural',
+ 'Japanese-Keita- (Male)': 'ja-JP-KeitaNeural',
+ 'French-Denise- (Female)': 'fr-FR-DeniseNeural',
+ 'French-Eloise- (Female)': 'fr-FR-EloiseNeural',
+ 'French-Henri- (Male)': 'fr-FR-HenriNeural',
+ 'Brazilian-Francisca- (Female)': 'pt-BR-FranciscaNeural',
+ 'Brazilian-Antonio- (Male)': 'pt-BR-AntonioNeural',
+ 'Indonesian-Ardi- (Male)': 'id-ID-ArdiNeural',
+ 'Indonesian-Gadis- (Female)': 'id-ID-GadisNeural',
+ 'Hebrew-Avri- (Male)': 'he-IL-AvriNeural',
+ 'Hebrew-Hila- (Female)': 'he-IL-HilaNeural',
+'Italian-Isabella- (Female)': 'it-IT-IsabellaNeural',
+ 'Italian-Diego- (Male)': 'it-IT-DiegoNeural',
+ 'Italian-Elsa- (Female)': 'it-IT-ElsaNeural',
+ 'Dutch-Colette- (Female)': 'nl-NL-ColetteNeural',
+ 'Dutch-Fenna- (Female)': 'nl-NL-FennaNeural',
+ 'Dutch-Maarten- (Male)': 'nl-NL-MaartenNeural',
+'Malese-Osman- (Male)': 'ms-MY-OsmanNeural',
+ 'Malese-Yasmin- (Female)': 'ms-MY-YasminNeural',
+ 'Norwegian-Pernille- (Female)': 'nb-NO-PernilleNeural',
+ 'Norwegian-Finn- (Male)': 'nb-NO-FinnNeural',
+ 'Swedish-Sofie- (Female)': 'sv-SE-SofieNeural',
+ 'Swedish-Mattias- (Male)': 'sv-SE-MattiasNeural',
+ 'Arabic-Hamed- (Male)': 'ar-SA-HamedNeural',
+ 'Arabic-Zariyah- (Female)': 'ar-SA-ZariyahNeural',
+ 'Greek-Athina- (Female)': 'el-GR-AthinaNeural',
+ 'Greek-Nestoras- (Male)': 'el-GR-NestorasNeural',
+'German-Katja- (Female)': 'de-DE-KatjaNeural',
+ 'German-Amala- (Female)': 'de-DE-AmalaNeural',
+ 'German-Conrad- (Male)': 'de-DE-ConradNeural',
+ 'German-Killian- (Male)': 'de-DE-KillianNeural',
+ 'Afrikaans-Adri- (Female)': 'af-ZA-AdriNeural',
+ 'Afrikaans-Willem- (Male)': 'af-ZA-WillemNeural',
+ 'Ethiopian-Ameha- (Male)': 'am-ET-AmehaNeural',
+ 'Ethiopian-Mekdes- (Female)': 'am-ET-MekdesNeural',
+ 'Arabic (UAE)-Fatima- (Female)': 'ar-AE-FatimaNeural',
+ 'Arabic (UAE)-Hamdan- (Male)': 'ar-AE-HamdanNeural',
+ 'Arabic (Bahrain)-Ali- (Male)': 'ar-BH-AliNeural',
+ 'Arabic (Bahrain)-Laila- (Female)': 'ar-BH-LailaNeural',
+ 'Arabic (Algeria)-Ismael- (Male)': 'ar-DZ-IsmaelNeural',
+ 'Arabic (Egypt)-Salma- (Female)': 'ar-EG-SalmaNeural',
+ 'Arabic (Egypt)-Shakir- (Male)': 'ar-EG-ShakirNeural',
+ 'Arabic (Iraq)-Bassel- (Male)': 'ar-IQ-BasselNeural',
+ 'Arabic (Iraq)-Rana- (Female)': 'ar-IQ-RanaNeural',
+ 'Arabic (Jordan)-Sana- (Female)': 'ar-JO-SanaNeural',
+ 'Arabic (Jordan)-Taim- (Male)': 'ar-JO-TaimNeural',
+ 'Arabic (Kuwait)-Fahed- (Male)': 'ar-KW-FahedNeural',
+ 'Arabic (Kuwait)-Noura- (Female)': 'ar-KW-NouraNeural',
+ 'Arabic (Lebanon)-Layla- (Female)': 'ar-LB-LaylaNeural',
+ 'Arabic (Lebanon)-Rami- (Male)': 'ar-LB-RamiNeural',
+ 'Arabic (Libya)-Iman- (Female)': 'ar-LY-ImanNeural',
+ 'Arabic (Libya)-Omar- (Male)': 'ar-LY-OmarNeural',
+ 'Arabic (Morocco)-Jamal- (Male)': 'ar-MA-JamalNeural',
+ 'Arabic (Morocco)-Mouna- (Female)': 'ar-MA-MounaNeural',
+ 'Arabic (Oman)-Abdullah- (Male)': 'ar-OM-AbdullahNeural',
+ 'Arabic (Oman)-Aysha- (Female)': 'ar-OM-AyshaNeural',
+ 'Arabic (Qatar)-Amal- (Female)': 'ar-QA-AmalNeural',
+ 'Arabic (Qatar)-Moaz- (Male)': 'ar-QA-MoazNeural',
+ 'Arabic (Syrian Arab Republic)-Amany- (Female)': 'ar-SY-AmanyNeural',
+ 'Arabic (Syrian Arab Republic)-Laith- (Male)': 'ar-SY-LaithNeural',
+ 'Arabic (Tunisia)-Hedi- (Male)': 'ar-TN-HediNeural',
+ 'Arabic (Tunisia)-Reem- (Female)': 'ar-TN-ReemNeural',
+ 'Arabic (Yemen)-Maryam- (Female)': 'ar-YE-MaryamNeural',
+ 'Arabic (Yemen)-Saleh- (Male)': 'ar-YE-SalehNeural',
+ 'Azerbaijani-Babek- (Male)': 'az-AZ-BabekNeural',
+ 'Azerbaijani-Banu- (Female)': 'az-AZ-BanuNeural',
+ 'Bulgarian-Borislav- (Male)': 'bg-BG-BorislavNeural',
+ 'Bulgarian-Kalina- (Female)': 'bg-BG-KalinaNeural',
+ 'Bengali (Bangladesh)-Nabanita- (Female)': 'bn-BD-NabanitaNeural',
+ 'Bengali (Bangladesh)-Pradeep- (Male)': 'bn-BD-PradeepNeural',
+ 'Bengali (India)-Bashkar- (Male)': 'bn-IN-BashkarNeural',
+ 'Bengali (India)-Tanishaa- (Female)': 'bn-IN-TanishaaNeural',
+ 'Bosniak (Bosnia and Herzegovina)-Goran- (Male)': 'bs-BA-GoranNeural',
+ 'Bosniak (Bosnia and Herzegovina)-Vesna- (Female)': 'bs-BA-VesnaNeural',
+ 'Catalan (Spain)-Joana- (Female)': 'ca-ES-JoanaNeural',
+ 'Catalan (Spain)-Enric- (Male)': 'ca-ES-EnricNeural',
+ 'Czech (Czech Republic)-Antonin- (Male)': 'cs-CZ-AntoninNeural',
+ 'Czech (Czech Republic)-Vlasta- (Female)': 'cs-CZ-VlastaNeural',
+ 'Welsh (UK)-Aled- (Male)': 'cy-GB-AledNeural',
+ 'Welsh (UK)-Nia- (Female)': 'cy-GB-NiaNeural',
+ 'Danish (Denmark)-Christel- (Female)': 'da-DK-ChristelNeural',
+ 'Danish (Denmark)-Jeppe- (Male)': 'da-DK-JeppeNeural',
+ 'German (Austria)-Ingrid- (Female)': 'de-AT-IngridNeural',
+ 'German (Austria)-Jonas- (Male)': 'de-AT-JonasNeural',
+ 'German (Switzerland)-Jan- (Male)': 'de-CH-JanNeural',
+ 'German (Switzerland)-Leni- (Female)': 'de-CH-LeniNeural',
+ 'English (Australia)-Natasha- (Female)': 'en-AU-NatashaNeural',
+ 'English (Australia)-William- (Male)': 'en-AU-WilliamNeural',
+ 'English (Canada)-Clara- (Female)': 'en-CA-ClaraNeural',
+ 'English (Canada)-Liam- (Male)': 'en-CA-LiamNeural',
+ 'English (UK)-Libby- (Female)': 'en-GB-LibbyNeural',
+ 'English (UK)-Maisie- (Female)': 'en-GB-MaisieNeural',
+ 'English (UK)-Ryan- (Male)': 'en-GB-RyanNeural',
+ 'English (UK)-Sonia- (Female)': 'en-GB-SoniaNeural',
+ 'English (UK)-Thomas- (Male)': 'en-GB-ThomasNeural',
+ 'English (Hong Kong)-Sam- (Male)': 'en-HK-SamNeural',
+ 'English (Hong Kong)-Yan- (Female)': 'en-HK-YanNeural',
+ 'English (Ireland)-Connor- (Male)': 'en-IE-ConnorNeural',
+ 'English (Ireland)-Emily- (Female)': 'en-IE-EmilyNeural',
+ 'English (India)-Neerja- (Female)': 'en-IN-NeerjaNeural',
+ 'English (India)-Prabhat- (Male)': 'en-IN-PrabhatNeural',
+ 'English (Kenya)-Asilia- (Female)': 'en-KE-AsiliaNeural',
+ 'English (Kenya)-Chilemba- (Male)': 'en-KE-ChilembaNeural',
+ 'English (Nigeria)-Abeo- (Male)': 'en-NG-AbeoNeural',
+'English (Nigeria)-Ezinne- (Female)': 'en-NG-EzinneNeural',
+ 'English (New Zealand)-Mitchell- (Male)': 'en-NZ-MitchellNeural',
+ 'English (Philippines)-James- (Male)': 'en-PH-JamesNeural',
+ 'English (Philippines)-Rosa- (Female)': 'en-PH-RosaNeural',
+ 'English (Singapore)-Luna- (Female)': 'en-SG-LunaNeural',
+ 'English (Singapore)-Wayne- (Male)': 'en-SG-WayneNeural',
+ 'English (Tanzania)-Elimu- (Male)': 'en-TZ-ElimuNeural',
+ 'English (Tanzania)-Imani- (Female)': 'en-TZ-ImaniNeural',
+ 'English (South Africa)-Leah- (Female)': 'en-ZA-LeahNeural',
+ 'English (South Africa)-Luke- (Male)': 'en-ZA-LukeNeural',
+'Spanish (Argentina)-Elena- (Female)': 'es-AR-ElenaNeural',
+ 'Spanish (Argentina)-Tomas- (Male)': 'es-AR-TomasNeural',
+ 'Spanish (Bolivia)-Marcelo- (Male)': 'es-BO-MarceloNeural',
+ 'Spanish (Bolivia)-Sofia- (Female)': 'es-BO-SofiaNeural',
+ 'Spanish (Colombia)-Gonzalo- (Male)': 'es-CO-GonzaloNeural',
+ 'Spanish (Colombia)-Salome- (Female)': 'es-CO-SalomeNeural',
+ 'Spanish (Costa Rica)-Juan- (Male)': 'es-CR-JuanNeural',
+ 'Spanish (Costa Rica)-Maria- (Female)': 'es-CR-MariaNeural',
+ 'Spanish (Cuba)-Belkys- (Female)': 'es-CU-BelkysNeural',
+ 'Spanish (Dominican Republic)-Emilio- (Male)': 'es-DO-EmilioNeural',
+ 'Spanish (Dominican Republic)-Ramona- (Female)': 'es-DO-RamonaNeural',
+ 'Spanish (Ecuador)-Andrea- (Female)': 'es-EC-AndreaNeural',
+ 'Spanish (Ecuador)-Luis- (Male)': 'es-EC-LuisNeural',
+ 'Spanish (Spain)-Alvaro- (Male)': 'es-ES-AlvaroNeural',
+ 'Spanish (Spain)-Elvira- (Female)': 'es-ES-ElviraNeural',
+ 'Spanish (Equatorial Guinea)-Teresa- (Female)': 'es-GQ-TeresaNeural',
+ 'Spanish (Guatemala)-Andres- (Male)': 'es-GT-AndresNeural',
+ 'Spanish (Guatemala)-Marta- (Female)': 'es-GT-MartaNeural',
+ 'Spanish (Honduras)-Carlos- (Male)': 'es-HN-CarlosNeural',
+ 'Spanish (Honduras)-Karla- (Female)': 'es-HN-KarlaNeural',
+ 'Spanish (Nicaragua)-Federico- (Male)': 'es-NI-FedericoNeural',
+ 'Spanish (Nicaragua)-Yolanda- (Female)': 'es-NI-YolandaNeural',
+ 'Spanish (Panama)-Margarita- (Female)': 'es-PA-MargaritaNeural',
+ 'Spanish (Panama)-Roberto- (Male)': 'es-PA-RobertoNeural',
+ 'Spanish (Peru)-Alex- (Male)': 'es-PE-AlexNeural',
+ 'Spanish (Peru)-Camila- (Female)': 'es-PE-CamilaNeural',
+ 'Spanish (Puerto Rico)-Karina- (Female)': 'es-PR-KarinaNeural',
+ 'Spanish (Puerto Rico)-Victor- (Male)': 'es-PR-VictorNeural',
+ 'Spanish (Paraguay)-Mario- (Male)': 'es-PY-MarioNeural',
+ 'Spanish (Paraguay)-Tania- (Female)': 'es-PY-TaniaNeural',
+ 'Spanish (El Salvador)-Lorena- (Female)': 'es-SV-LorenaNeural',
+ 'Spanish (El Salvador)-Rodrigo- (Male)': 'es-SV-RodrigoNeural',
+ 'Spanish (United States)-Alonso- (Male)': 'es-US-AlonsoNeural',
+ 'Spanish (United States)-Paloma- (Female)': 'es-US-PalomaNeural',
+ 'Spanish (Uruguay)-Mateo- (Male)': 'es-UY-MateoNeural',
+ 'Spanish (Uruguay)-Valentina- (Female)': 'es-UY-ValentinaNeural',
+ 'Spanish (Venezuela)-Paola- (Female)': 'es-VE-PaolaNeural',
+ 'Spanish (Venezuela)-Sebastian- (Male)': 'es-VE-SebastianNeural',
+'Estonian (Estonia)-Anu- (Female)': 'et-EE-AnuNeural',
+'Estonian (Estonia)-Kert- (Male)': 'et-EE-KertNeural',
+'Persian (Iran)-Dilara- (Female)': 'fa-IR-DilaraNeural',
+'Persian (Iran)-Farid- (Male)': 'fa-IR-FaridNeural',
+'Finnish (Finland)-Harri- (Male)': 'fi-FI-HarriNeural',
+'Finnish (Finland)-Noora- (Female)': 'fi-FI-NooraNeural',
+'French (Belgium)-Charline- (Female)': 'fr-BE-CharlineNeural',
+'French (Belgium)-Gerard- (Male)': 'fr-BE-GerardNeural',
+'French (Canada)-Sylvie- (Female)': 'fr-CA-SylvieNeural',
+'French (Canada)-Antoine- (Male)': 'fr-CA-AntoineNeural',
+'French (Canada)-Jean- (Male)': 'fr-CA-JeanNeural',
+'French (Switzerland)-Ariane- (Female)': 'fr-CH-ArianeNeural',
+'French (Switzerland)-Fabrice- (Male)': 'fr-CH-FabriceNeural',
+'Irish (Ireland)-Colm- (Male)': 'ga-IE-ColmNeural',
+'Irish (Ireland)-Orla- (Female)': 'ga-IE-OrlaNeural',
+'Galician (Spain)-Roi- (Male)': 'gl-ES-RoiNeural',
+'Galician (Spain)-Sabela- (Female)': 'gl-ES-SabelaNeural',
+'Gujarati (India)-Dhwani- (Female)': 'gu-IN-DhwaniNeural',
+'Gujarati (India)-Niranjan- (Male)': 'gu-IN-NiranjanNeural',
+'Hindi (India)-Madhur- (Male)': 'hi-IN-MadhurNeural',
+'Hindi (India)-Swara- (Female)': 'hi-IN-SwaraNeural',
+'Croatian (Croatia)-Gabrijela- (Female)': 'hr-HR-GabrijelaNeural',
+'Croatian (Croatia)-Srecko- (Male)': 'hr-HR-SreckoNeural',
+'Hungarian (Hungary)-Noemi- (Female)': 'hu-HU-NoemiNeural',
+'Hungarian (Hungary)-Tamas- (Male)': 'hu-HU-TamasNeural',
+'Icelandic (Iceland)-Gudrun- (Female)': 'is-IS-GudrunNeural',
+'Icelandic (Iceland)-Gunnar- (Male)': 'is-IS-GunnarNeural',
+'Javanese (Indonesia)-Dimas- (Male)': 'jv-ID-DimasNeural',
+'Javanese (Indonesia)-Siti- (Female)': 'jv-ID-SitiNeural',
+'Georgian (Georgia)-Eka- (Female)': 'ka-GE-EkaNeural',
+'Georgian (Georgia)-Giorgi- (Male)': 'ka-GE-GiorgiNeural',
+'Kazakh (Kazakhstan)-Aigul- (Female)': 'kk-KZ-AigulNeural',
+'Kazakh (Kazakhstan)-Daulet- (Male)': 'kk-KZ-DauletNeural',
+'Khmer (Cambodia)-Piseth- (Male)': 'km-KH-PisethNeural',
+'Khmer (Cambodia)-Sreymom- (Female)': 'km-KH-SreymomNeural',
+'Kannada (India)-Gagan- (Male)': 'kn-IN-GaganNeural',
+'Kannada (India)-Sapna- (Female)': 'kn-IN-SapnaNeural',
+'Lao (Laos)-Chanthavong- (Male)': 'lo-LA-ChanthavongNeural',
+'Lao (Laos)-Keomany- (Female)': 'lo-LA-KeomanyNeural',
+'Lithuanian (Lithuania)-Leonas- (Male)': 'lt-LT-LeonasNeural',
+'Lithuanian (Lithuania)-Ona- (Female)': 'lt-LT-OnaNeural',
+'Latvian (Latvia)-Everita- (Female)': 'lv-LV-EveritaNeural',
+'Latvian (Latvia)-Nils- (Male)': 'lv-LV-NilsNeural',
+'Macedonian (North Macedonia)-Aleksandar- (Male)': 'mk-MK-AleksandarNeural',
+'Macedonian (North Macedonia)-Marija- (Female)': 'mk-MK-MarijaNeural',
+'Malayalam (India)-Midhun- (Male)': 'ml-IN-MidhunNeural',
+'Malayalam (India)-Sobhana- (Female)': 'ml-IN-SobhanaNeural',
+'Mongolian (Mongolia)-Bataa- (Male)': 'mn-MN-BataaNeural',
+'Mongolian (Mongolia)-Yesui- (Female)': 'mn-MN-YesuiNeural',
+'Marathi (India)-Aarohi- (Female)': 'mr-IN-AarohiNeural',
+'Marathi (India)-Manohar- (Male)': 'mr-IN-ManoharNeural',
+'Maltese (Malta)-Grace- (Female)': 'mt-MT-GraceNeural',
+'Maltese (Malta)-Joseph- (Male)': 'mt-MT-JosephNeural',
+'Burmese (Myanmar)-Nilar- (Female)': 'my-MM-NilarNeural',
+'Burmese (Myanmar)-Thiha- (Male)': 'my-MM-ThihaNeural',
+'Nepali (Nepal)-Hemkala- (Female)': 'ne-NP-HemkalaNeural',
+'Nepali (Nepal)-Sagar- (Male)': 'ne-NP-SagarNeural',
+'Dutch (Belgium)-Arnaud- (Male)': 'nl-BE-ArnaudNeural',
+'Dutch (Belgium)-Dena- (Female)': 'nl-BE-DenaNeural',
+'Polish (Poland)-Marek- (Male)': 'pl-PL-MarekNeural',
+'Polish (Poland)-Zofia- (Female)': 'pl-PL-ZofiaNeural',
+'Pashto (Afghanistan)-Gul Nawaz- (Male)': 'ps-AF-GulNawazNeural',}

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 gTTS
 elevenlabs
 stftpitchshift==1.5.1
 torchcrepe
 setuptools
@@ -19,4 +20,4 @@ mega.py
 gdown
 onnxruntime
 pyngrok==4.1.12
-torch

 gTTS
 elevenlabs
+edge-tts
 stftpitchshift==1.5.1
 torchcrepe
 setuptools
 gdown
 onnxruntime
 pyngrok==4.1.12
+torch

vc_infer_pipeline.py CHANGED Viewed

@@ -15,6 +15,14 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
 input_audio_path2wav = {}
 @lru_cache
 def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
@@ -312,14 +320,7 @@ class VC(object):
                 x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
             )
         elif f0_method == "rmvpe":
-            if hasattr(self, "model_rmvpe") == False:
-                from rmvpe import RMVPE
-                print("loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "rmvpe.pt", is_half=self.is_half, device=self.device
-                )
-            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         elif "hybrid" in f0_method:
             # Perform hybrid median pitch estimation

 input_audio_path2wav = {}
+# Preload the RMVPE pitch-extraction model once, when this module is imported,
+# so the "rmvpe" f0 method can use the shared module-level `model_rmvpe`.
+# (A small addition from the author's personal RVC branch; adopting it is
+# optional.)
+from config import Config
+config=Config()
+from rmvpe import RMVPE
+print("Preloading RMVPE model...")
+model_rmvpe = RMVPE("rmvpe.pt", is_half=config.is_half, device=config.device)
+# Config was only needed for the two settings read above; drop the name.
+del config
 @lru_cache
 def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
                 x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
             )
         elif f0_method == "rmvpe":
+            f0 = model_rmvpe.infer_from_audio(x, thred=0.03)
         elif "hybrid" in f0_method:
             # Perform hybrid median pitch estimation