Ilaria committed on
Commit
7cfb6ba
1 Parent(s): 2afc955

New Version - 3.0

Browse files

- New support for Ilaria TTS (currently the best TTS option for HF in terms of resource usage)

- Fixed support for ElevenLabs and Google TTS

- Faster inference

- Better looking UI

- Various bug fixes

- Removed Herobrine

Files changed (4) hide show
  1. app.py +58 -33
  2. ilariatts.py +230 -0
  3. requirements.txt +2 -1
  4. vc_infer_pipeline.py +9 -8
app.py CHANGED
@@ -21,6 +21,11 @@ warnings.filterwarnings("ignore")
21
  torch.manual_seed(114514)
22
  from i18n import I18nAuto
23
 
 
 
 
 
 
24
  import signal
25
 
26
  import math
@@ -1445,6 +1450,12 @@ def elevenTTS(xiapi, text, id, lang):
1445
  aud_path = save_to_wav('./temp_gTTS.mp3')
1446
  return aud_path, aud_path
1447
 
 
 
 
 
 
 
1448
  def upload_to_dataset(files, dir):
1449
  if dir == '':
1450
  dir = './dataset'
@@ -1470,7 +1481,7 @@ def zip_downloader(model):
1470
  else:
1471
  return f'./weights/{model}.pth', "Could not find Index file."
1472
 
1473
- with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1474
  with gr.Tabs():
1475
  with gr.TabItem("Inference"):
1476
  gr.HTML("<h1> Ilaria RVC 💖 </h1>")
@@ -1525,11 +1536,11 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1525
  dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
1526
  refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
1527
  record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
1528
- record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
1529
  with gr.Row():
1530
- with gr.Accordion('Text To Speech', open=False):
1531
  with gr.Column():
1532
- lang = gr.Radio(label='Chinese & Japanese do not work with ElevenLabs currently.',choices=['en','es','fr','pt','zh-CN','de','hi','ja'], value='en')
1533
  api_box = gr.Textbox(label="Enter your API Key for ElevenLabs, or leave empty to use GoogleTTS", value='')
1534
  elevenid=gr.Dropdown(label="Voice:", choices=eleven_voices)
1535
  with gr.Column():
@@ -1537,7 +1548,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1537
  tts_button = gr.Button(value="Speak")
1538
  tts_button.click(fn=elevenTTS, inputs=[api_box,tfs, elevenid, lang], outputs=[record_button, input_audio0])
1539
  with gr.Row():
1540
- with gr.Accordion('Wav2Lip', open=False):
1541
  with gr.Row():
1542
  size = gr.Radio(label='Resolution:',choices=['Half','Full'])
1543
  face = gr.UploadButton("Upload A Character",type='file')
@@ -1550,37 +1561,50 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1550
  refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
1551
  with gr.Row():
1552
  animate_button = gr.Button('Animate')
1553
-
1554
  with gr.Column():
1555
- with gr.Accordion("Index Settings", open=False):
1556
- file_index1 = gr.Dropdown(
1557
- label="3. Choose the index file (in case it wasn't automatically found.)",
1558
- choices=get_indexes(),
1559
- value=get_index(),
1560
- interactive=True,
1561
- )
1562
- sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
1563
- refresh_button.click(
1564
- fn=change_choices, inputs=[], outputs=[sid0, file_index1]
1565
- )
1566
- # file_big_npy1 = gr.Textbox(
1567
- # label=i18n("特征文件路径"),
1568
- # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
1569
- # interactive=True,
1570
- # )
1571
- index_rate1 = gr.Slider(
1572
- minimum=0,
1573
- maximum=1,
1574
- label=i18n("检索特征占比"),
1575
- value=0.66,
1576
- interactive=True,
1577
- )
1578
  vc_output2 = gr.Audio(
1579
  label="Final Result! (Click on the three dots to download the audio)",
1580
  type='filepath',
1581
  interactive=False,
1582
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1583
  animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
 
1584
  with gr.Accordion("Advanced Options", open=False):
1585
  f0method0 = gr.Radio(
1586
  label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
@@ -1679,6 +1703,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1679
  formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
1680
  frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
1681
  formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
 
1682
  with gr.Row():
1683
  vc_output1 = gr.Textbox("")
1684
  f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), visible=False)
@@ -1704,7 +1729,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1704
  [vc_output1, vc_output2],
1705
  )
1706
 
1707
- with gr.Accordion("Batch Conversion",open=False):
1708
  with gr.Row():
1709
  with gr.Column():
1710
  vc_transform1 = gr.Number(
@@ -1828,7 +1853,7 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
1828
  model = gr.Textbox(label="Name of the model (without spaces):")
1829
  download_button=gr.Button("Download")
1830
  with gr.Row():
1831
- status_bar=gr.Textbox(label="")
1832
  download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
1833
  with gr.Row():
1834
  gr.Markdown(
@@ -2080,9 +2105,9 @@ with gr.Blocks(theme=gr.themes.Base (), title='Mangio-RVC-Web 💻') as app:
2080
  else:
2081
  print(
2082
  "Pretrained weights not downloaded. Disabling training tab.\n"
2083
- "Wondering how to train a voice? Visit here for the RVC model training guide: https://t.ly/RVC_Training_Guide\n"
2084
  "-------------------------------\n"
2085
  )
2086
 
2087
- app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=True)
2088
  #endregion
 
21
  torch.manual_seed(114514)
22
  from i18n import I18nAuto
23
 
24
+ import edge_tts, asyncio
25
+ from ilariatts import tts_order_voice
26
+ language_dict = tts_order_voice
27
+ ilariavoices = language_dict.keys()
28
+
29
  import signal
30
 
31
  import math
 
1450
  aud_path = save_to_wav('./temp_gTTS.mp3')
1451
  return aud_path, aud_path
1452
 
1453
+ def ilariaTTS(text, ttsvoice):
1454
+ vo=language_dict[ttsvoice]
1455
+ asyncio.run(edge_tts.Communicate(text, vo).save("./temp_ilaria.mp3"))
1456
+ aud_path = save_to_wav('./temp_ilaria.mp3')
1457
+ return aud_path, aud_path
1458
+
1459
  def upload_to_dataset(files, dir):
1460
  if dir == '':
1461
  dir = './dataset'
 
1481
  else:
1482
  return f'./weights/{model}.pth', "Could not find Index file."
1483
 
1484
+ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose"), title="Ilaria RVC 💖") as app:
1485
  with gr.Tabs():
1486
  with gr.TabItem("Inference"):
1487
  gr.HTML("<h1> Ilaria RVC 💖 </h1>")
 
1536
  dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
1537
  refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
1538
  record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
1539
+ record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
1540
  with gr.Row():
1541
+ with gr.Accordion('ElevenLabs / Google TTS', open=False):
1542
  with gr.Column():
1543
+ lang = gr.Radio(label='Chinese & Japanese do not work with ElevenLabs currently.',choices=['en','it','es','fr','pt','zh-CN','de','hi','ja'], value='en')
1544
  api_box = gr.Textbox(label="Enter your API Key for ElevenLabs, or leave empty to use GoogleTTS", value='')
1545
  elevenid=gr.Dropdown(label="Voice:", choices=eleven_voices)
1546
  with gr.Column():
 
1548
  tts_button = gr.Button(value="Speak")
1549
  tts_button.click(fn=elevenTTS, inputs=[api_box,tfs, elevenid, lang], outputs=[record_button, input_audio0])
1550
  with gr.Row():
1551
+ with gr.Accordion('Wav2Lip', open=False, visible=False):
1552
  with gr.Row():
1553
  size = gr.Radio(label='Resolution:',choices=['Half','Full'])
1554
  face = gr.UploadButton("Upload A Character",type='file')
 
1561
  refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
1562
  with gr.Row():
1563
  animate_button = gr.Button('Animate')
1564
+
1565
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1566
  vc_output2 = gr.Audio(
1567
  label="Final Result! (Click on the three dots to download the audio)",
1568
  type='filepath',
1569
  interactive=False,
1570
  )
1571
+
1572
+ with gr.Accordion('IlariaTTS', open=True):
1573
+ with gr.Column():
1574
+ ilariaid=gr.Dropdown(label="Voice:", choices=ilariavoices, value="English-Jenny (Female)")
1575
+ ilariatext = gr.Textbox(label="Input your Text", interactive=True, value="This is a test.")
1576
+ ilariatts_button = gr.Button(value="Speak")
1577
+ ilariatts_button.click(fn=ilariaTTS, inputs=[ilariatext, ilariaid], outputs=[record_button, input_audio0])
1578
+
1579
+ #with gr.Column():
1580
+ with gr.Accordion("Index Settings", open=False):
1581
+ #with gr.Row():
1582
+
1583
+ file_index1 = gr.Dropdown(
1584
+ label="3. Choose the index file (in case it wasn't automatically found.)",
1585
+ choices=get_indexes(),
1586
+ value=get_index(),
1587
+ interactive=True,
1588
+ )
1589
+ sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
1590
+ refresh_button.click(
1591
+ fn=change_choices, inputs=[], outputs=[sid0, file_index1]
1592
+ )
1593
+ # file_big_npy1 = gr.Textbox(
1594
+ # label=i18n("特征文件路径"),
1595
+ # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
1596
+ # interactive=True,
1597
+ # )
1598
+ index_rate1 = gr.Slider(
1599
+ minimum=0,
1600
+ maximum=1,
1601
+ label=i18n("检索特征占比"),
1602
+ value=0.66,
1603
+ interactive=True,
1604
+ )
1605
+
1606
  animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
1607
+
1608
  with gr.Accordion("Advanced Options", open=False):
1609
  f0method0 = gr.Radio(
1610
  label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
 
1703
  formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
1704
  frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
1705
  formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
1706
+
1707
  with gr.Row():
1708
  vc_output1 = gr.Textbox("")
1709
  f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), visible=False)
 
1729
  [vc_output1, vc_output2],
1730
  )
1731
 
1732
+ with gr.Accordion("Batch Conversion",open=False, visible=False):
1733
  with gr.Row():
1734
  with gr.Column():
1735
  vc_transform1 = gr.Number(
 
1853
  model = gr.Textbox(label="Name of the model (without spaces):")
1854
  download_button=gr.Button("Download")
1855
  with gr.Row():
1856
+ status_bar=gr.Textbox(label="Download Status")
1857
  download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
1858
  with gr.Row():
1859
  gr.Markdown(
 
2105
  else:
2106
  print(
2107
  "Pretrained weights not downloaded. Disabling training tab.\n"
2108
+ "Wondering how to train a voice? Join AI HUB Discord Server! https://discord.gg/aihub\n"
2109
  "-------------------------------\n"
2110
  )
2111
 
2112
+ app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False)
2113
  #endregion
ilariatts.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tts_order_voice = {'English-Jenny (Female)': 'en-US-JennyNeural',
2
+ 'English-Guy (Male)': 'en-US-GuyNeural',
3
+ 'English-Ana (Female)': 'en-US-AnaNeural',
4
+ 'English-Aria (Female)': 'en-US-AriaNeural',
5
+ 'English-Christopher (Male)': 'en-US-ChristopherNeural',
6
+ 'English-Eric (Male)': 'en-US-EricNeural',
7
+ 'English-Michelle (Female)': 'en-US-MichelleNeural',
8
+ 'English-Roger (Male)': 'en-US-RogerNeural',
9
+ 'Spanish (Mexican)-Dalia (Female)': 'es-MX-DaliaNeural',
10
+ 'Spanish (Mexican)-Jorge- (Male)': 'es-MX-JorgeNeural',
11
+ 'Korean-Sun-Hi- (Female)': 'ko-KR-SunHiNeural',
12
+ 'Korean-InJoon- (Male)': 'ko-KR-InJoonNeural',
13
+ 'Thai-Premwadee- (Female)': 'th-TH-PremwadeeNeural',
14
+ 'Thai-Niwat- (Male)': 'th-TH-NiwatNeural',
15
+ 'Vietnamese-HoaiMy- (Female)': 'vi-VN-HoaiMyNeural',
16
+ 'Vietnamese-NamMinh- (Male)': 'vi-VN-NamMinhNeural',
17
+ 'Japanese-Nanami- (Female)': 'ja-JP-NanamiNeural',
18
+ 'Japanese-Keita- (Male)': 'ja-JP-KeitaNeural',
19
+ 'French-Denise- (Female)': 'fr-FR-DeniseNeural',
20
+ 'French-Eloise- (Female)': 'fr-FR-EloiseNeural',
21
+ 'French-Henri- (Male)': 'fr-FR-HenriNeural',
22
+ 'Brazilian-Francisca- (Female)': 'pt-BR-FranciscaNeural',
23
+ 'Brazilian-Antonio- (Male)': 'pt-BR-AntonioNeural',
24
+ 'Indonesian-Ardi- (Male)': 'id-ID-ArdiNeural',
25
+ 'Indonesian-Gadis- (Female)': 'id-ID-GadisNeural',
26
+ 'Hebrew-Avri- (Male)': 'he-IL-AvriNeural',
27
+ 'Hebrew-Hila- (Female)': 'he-IL-HilaNeural',
28
+ 'Italian-Isabella- (Female)': 'it-IT-IsabellaNeural',
29
+ 'Italian-Diego- (Male)': 'it-IT-DiegoNeural',
30
+ 'Italian-Elsa- (Female)': 'it-IT-ElsaNeural',
31
+ 'Dutch-Colette- (Female)': 'nl-NL-ColetteNeural',
32
+ 'Dutch-Fenna- (Female)': 'nl-NL-FennaNeural',
33
+ 'Dutch-Maarten- (Male)': 'nl-NL-MaartenNeural',
34
+ 'Malese-Osman- (Male)': 'ms-MY-OsmanNeural',
35
+ 'Malese-Yasmin- (Female)': 'ms-MY-YasminNeural',
36
+ 'Norwegian-Pernille- (Female)': 'nb-NO-PernilleNeural',
37
+ 'Norwegian-Finn- (Male)': 'nb-NO-FinnNeural',
38
+ 'Swedish-Sofie- (Female)': 'sv-SE-SofieNeural',
39
+ 'ArabicSwedish-Mattias- (Male)': 'sv-SE-MattiasNeural',
40
+ 'Arabic-Hamed- (Male)': 'ar-SA-HamedNeural',
41
+ 'Arabic-Zariyah- (Female)': 'ar-SA-ZariyahNeural',
42
+ 'Greek-Athina- (Female)': 'el-GR-AthinaNeural',
43
+ 'Greek-Nestoras- (Male)': 'el-GR-NestorasNeural',
44
+ 'German-Katja- (Female)': 'de-DE-KatjaNeural',
45
+ 'German-Amala- (Female)': 'de-DE-AmalaNeural',
46
+ 'German-Conrad- (Male)': 'de-DE-ConradNeural',
47
+ 'German-Killian- (Male)': 'de-DE-KillianNeural',
48
+ 'Afrikaans-Adri- (Female)': 'af-ZA-AdriNeural',
49
+ 'Afrikaans-Willem- (Male)': 'af-ZA-WillemNeural',
50
+ 'Ethiopian-Ameha- (Male)': 'am-ET-AmehaNeural',
51
+ 'Ethiopian-Mekdes- (Female)': 'am-ET-MekdesNeural',
52
+ 'Arabic (UAD)-Fatima- (Female)': 'ar-AE-FatimaNeural',
53
+ 'Arabic (UAD)-Hamdan- (Male)': 'ar-AE-HamdanNeural',
54
+ 'Arabic (Bahrain)-Ali- (Male)': 'ar-BH-AliNeural',
55
+ 'Arabic (Bahrain)-Laila- (Female)': 'ar-BH-LailaNeural',
56
+ 'Arabic (Algeria)-Ismael- (Male)': 'ar-DZ-IsmaelNeural',
57
+ 'Arabic (Egypt)-Salma- (Female)': 'ar-EG-SalmaNeural',
58
+ 'Arabic (Egypt)-Shakir- (Male)': 'ar-EG-ShakirNeural',
59
+ 'Arabic (Iraq)-Bassel- (Male)': 'ar-IQ-BasselNeural',
60
+ 'Arabic (Iraq)-Rana- (Female)': 'ar-IQ-RanaNeural',
61
+ 'Arabic (Jordan)-Sana- (Female)': 'ar-JO-SanaNeural',
62
+ 'Arabic (Jordan)-Taim- (Male)': 'ar-JO-TaimNeural',
63
+ 'Arabic (Kuwait)-Fahed- (Male)': 'ar-KW-FahedNeural',
64
+ 'Arabic (Kuwait)-Noura- (Female)': 'ar-KW-NouraNeural',
65
+ 'Arabic (Lebanon)-Layla- (Female)': 'ar-LB-LaylaNeural',
66
+ 'Arabic (Lebanon)-Rami- (Male)': 'ar-LB-RamiNeural',
67
+ 'Arabic (Libya)-Iman- (Female)': 'ar-LY-ImanNeural',
68
+ 'Arabic (Libya)-Omar- (Male)': 'ar-LY-OmarNeural',
69
+ 'Arabic (Morocco)-Jamal- (Male)': 'ar-MA-JamalNeural',
70
+ 'Arabic (Morocco)-Mouna- (Female)': 'ar-MA-MounaNeural',
71
+ 'Arabic (Oman)-Abdullah- (Male)': 'ar-OM-AbdullahNeural',
72
+ 'Arabic (Oman)-Aysha- (Female)': 'ar-OM-AyshaNeural',
73
+ 'Arabic (Qatar)-Amal- (Female)': 'ar-QA-AmalNeural',
74
+ 'Arabic (Qatar)-Moaz- (Male)': 'ar-QA-MoazNeural',
75
+ 'Arabic (Syrian Arab Republic)-Amany- (Female)': 'ar-SY-AmanyNeural',
76
+ 'Arabic (Syrian Arab Republic)-Laith- (Male)': 'ar-SY-LaithNeural',
77
+ 'Arabic (Tunisia)-Hedi- (Male)': 'ar-TN-HediNeural',
78
+ 'Arabic (Tunisia)-Reem- (Female)': 'ar-TN-ReemNeural',
79
+ 'Arabic (Yemen )-Maryam- (Female)': 'ar-YE-MaryamNeural',
80
+ 'Arabic (Yemen )-Saleh- (Male)': 'ar-YE-SalehNeural',
81
+ 'Azerbaijani-Babek- (Male)': 'az-AZ-BabekNeural',
82
+ 'Azerbaijani-Banu- (Female)': 'az-AZ-BanuNeural',
83
+ 'Bulgarian-Borislav- (Male)': 'bg-BG-BorislavNeural',
84
+ 'Bulgarian-Kalina- (Female)': 'bg-BG-KalinaNeural',
85
+ 'Bengali (Bangladesh)-Nabanita- (Female)': 'bn-BD-NabanitaNeural',
86
+ 'Bengali (Bangladesh)-Pradeep- (Male)': 'bn-BD-PradeepNeural',
87
+ 'Bengali (India)-Bashkar- (Male)': 'bn-IN-BashkarNeural',
88
+ 'Bengali (India)-Tanishaa- (Female)': 'bn-IN-TanishaaNeural',
89
+ 'Bosniak (Bosnia and Herzegovina)-Goran- (Male)': 'bs-BA-GoranNeural',
90
+ 'Bosniak (Bosnia and Herzegovina)-Vesna- (Female)': 'bs-BA-VesnaNeural',
91
+ 'Catalan (Spain)-Joana- (Female)': 'ca-ES-JoanaNeural',
92
+ 'Catalan (Spain)-Enric- (Male)': 'ca-ES-EnricNeural',
93
+ 'Czech (Czech Republic)-Antonin- (Male)': 'cs-CZ-AntoninNeural',
94
+ 'Czech (Czech Republic)-Vlasta- (Female)': 'cs-CZ-VlastaNeural',
95
+ 'Welsh (UK)-Aled- (Male)': 'cy-GB-AledNeural',
96
+ 'Welsh (UK)-Nia- (Female)': 'cy-GB-NiaNeural',
97
+ 'Danish (Denmark)-Christel- (Female)': 'da-DK-ChristelNeural',
98
+ 'Danish (Denmark)-Jeppe- (Male)': 'da-DK-JeppeNeural',
99
+ 'German (Austria)-Ingrid- (Female)': 'de-AT-IngridNeural',
100
+ 'German (Austria)-Jonas- (Male)': 'de-AT-JonasNeural',
101
+ 'German (Switzerland)-Jan- (Male)': 'de-CH-JanNeural',
102
+ 'German (Switzerland)-Leni- (Female)': 'de-CH-LeniNeural',
103
+ 'English (Australia)-Natasha- (Female)': 'en-AU-NatashaNeural',
104
+ 'English (Australia)-William- (Male)': 'en-AU-WilliamNeural',
105
+ 'English (Canada)-Clara- (Female)': 'en-CA-ClaraNeural',
106
+ 'English (Canada)-Liam- (Male)': 'en-CA-LiamNeural',
107
+ 'English (UK)-Libby- (Female)': 'en-GB-LibbyNeural',
108
+ 'English (UK)-Maisie- (Female)': 'en-GB-MaisieNeural',
109
+ 'English (UK)-Ryan- (Male)': 'en-GB-RyanNeural',
110
+ 'English (UK)-Sonia- (Female)': 'en-GB-SoniaNeural',
111
+ 'English (UK)-Thomas- (Male)': 'en-GB-ThomasNeural',
112
+ 'English (Hong Kong)-Sam- (Male)': 'en-HK-SamNeural',
113
+ 'English (Hong Kong)-Yan- (Female)': 'en-HK-YanNeural',
114
+ 'English (Ireland)-Connor- (Male)': 'en-IE-ConnorNeural',
115
+ 'English (Ireland)-Emily- (Female)': 'en-IE-EmilyNeural',
116
+ 'English (India)-Neerja- (Female)': 'en-IN-NeerjaNeural',
117
+ 'English (India)-Prabhat- (Male)': 'en-IN-PrabhatNeural',
118
+ 'English (Kenya)-Asilia- (Female)': 'en-KE-AsiliaNeural',
119
+ 'English (Kenya)-Chilemba- (Male)': 'en-KE-ChilembaNeural',
120
+ 'English (Nigeria)-Abeo- (Male)': 'en-NG-AbeoNeural',
121
+ 'English (Nigeria)-Ezinne- (Female)': 'en-NG-EzinneNeural',
122
+ 'English (New Zealand)-Mitchell- (Male)': 'en-NZ-MitchellNeural',
123
+ 'English (Philippines)-James- (Male)': 'en-PH-JamesNeural',
124
+ 'English (Philippines)-Rosa- (Female)': 'en-PH-RosaNeural',
125
+ 'English (Singapore)-Luna- (Female)': 'en-SG-LunaNeural',
126
+ 'English (Singapore)-Wayne- (Male)': 'en-SG-WayneNeural',
127
+ 'English (Tanzania)-Elimu- (Male)': 'en-TZ-ElimuNeural',
128
+ 'English (Tanzania)-Imani- (Female)': 'en-TZ-ImaniNeural',
129
+ 'English (South Africa)-Leah- (Female)': 'en-ZA-LeahNeural',
130
+ 'English (South Africa)-Luke- (Male)': 'en-ZA-LukeNeural',
131
+ 'Spanish (Argentina)-Elena- (Female)': 'es-AR-ElenaNeural',
132
+ 'Spanish (Argentina)-Tomas- (Male)': 'es-AR-TomasNeural',
133
+ 'Spanish (Bolivia)-Marcelo- (Male)': 'es-BO-MarceloNeural',
134
+ 'Spanish (Bolivia)-Sofia- (Female)': 'es-BO-SofiaNeural',
135
+ 'Spanish (Colombia)-Gonzalo- (Male)': 'es-CO-GonzaloNeural',
136
+ 'Spanish (Colombia)-Salome- (Female)': 'es-CO-SalomeNeural',
137
+ 'Spanish (Costa Rica)-Juan- (Male)': 'es-CR-JuanNeural',
138
+ 'Spanish (Costa Rica)-Maria- (Female)': 'es-CR-MariaNeural',
139
+ 'Spanish (Cuba)-Belkys- (Female)': 'es-CU-BelkysNeural',
140
+ 'Spanish (Dominican Republic)-Emilio- (Male)': 'es-DO-EmilioNeural',
141
+ 'Spanish (Dominican Republic)-Ramona- (Female)': 'es-DO-RamonaNeural',
142
+ 'Spanish (Ecuador)-Andrea- (Female)': 'es-EC-AndreaNeural',
143
+ 'Spanish (Ecuador)-Luis- (Male)': 'es-EC-LuisNeural',
144
+ 'Spanish (Spain)-Alvaro- (Male)': 'es-ES-AlvaroNeural',
145
+ 'Spanish (Spain)-Elvira- (Female)': 'es-ES-ElviraNeural',
146
+ 'Spanish (Equatorial Guinea)-Teresa- (Female)': 'es-GQ-TeresaNeural',
147
+ 'Spanish (Guatemala)-Andres- (Male)': 'es-GT-AndresNeural',
148
+ 'Spanish (Guatemala)-Marta- (Female)': 'es-GT-MartaNeural',
149
+ 'Spanish (Honduras)-Carlos- (Male)': 'es-HN-CarlosNeural',
150
+ 'Spanish (Honduras)-Karla- (Female)': 'es-HN-KarlaNeural',
151
+ 'Spanish (Nicaragua)-Federico- (Male)': 'es-NI-FedericoNeural',
152
+ 'Spanish (Nicaragua)-Yolanda- (Female)': 'es-NI-YolandaNeural',
153
+ 'Spanish (Panama)-Margarita- (Female)': 'es-PA-MargaritaNeural',
154
+ 'Spanish (Panama)-Roberto- (Male)': 'es-PA-RobertoNeural',
155
+ 'Spanish (Peru)-Alex- (Male)': 'es-PE-AlexNeural',
156
+ 'Spanish (Peru)-Camila- (Female)': 'es-PE-CamilaNeural',
157
+ 'Spanish (Puerto Rico)-Karina- (Female)': 'es-PR-KarinaNeural',
158
+ 'Spanish (Puerto Rico)-Victor- (Male)': 'es-PR-VictorNeural',
159
+ 'Spanish (Paraguay)-Mario- (Male)': 'es-PY-MarioNeural',
160
+ 'Spanish (Paraguay)-Tania- (Female)': 'es-PY-TaniaNeural',
161
+ 'Spanish (El Salvador)-Lorena- (Female)': 'es-SV-LorenaNeural',
162
+ 'Spanish (El Salvador)-Rodrigo- (Male)': 'es-SV-RodrigoNeural',
163
+ 'Spanish (United States)-Alonso- (Male)': 'es-US-AlonsoNeural',
164
+ 'Spanish (United States)-Paloma- (Female)': 'es-US-PalomaNeural',
165
+ 'Spanish (Uruguay)-Mateo- (Male)': 'es-UY-MateoNeural',
166
+ 'Spanish (Uruguay)-Valentina- (Female)': 'es-UY-ValentinaNeural',
167
+ 'Spanish (Venezuela)-Paola- (Female)': 'es-VE-PaolaNeural',
168
+ 'Spanish (Venezuela)-Sebastian- (Male)': 'es-VE-SebastianNeural',
169
+ 'Estonian (Estonia)-Anu- (Female)': 'et-EE-AnuNeural',
170
+ 'Estonian (Estonia)-Kert- (Male)': 'et-EE-KertNeural',
171
+ 'Persian (Iran)-Dilara- (Female)': 'fa-IR-DilaraNeural',
172
+ 'Persian (Iran)-Farid- (Male)': 'fa-IR-FaridNeural',
173
+ 'Finnish (Finland)-Harri- (Male)': 'fi-FI-HarriNeural',
174
+ 'Finnish (Finland)-Noora- (Female)': 'fi-FI-NooraNeural',
175
+ 'French (Belgium)-Charline- (Female)': 'fr-BE-CharlineNeural',
176
+ 'French (Belgium)-Gerard- (Male)': 'fr-BE-GerardNeural',
177
+ 'French (Canada)-Sylvie- (Female)': 'fr-CA-SylvieNeural',
178
+ 'French (Canada)-Antoine- (Male)': 'fr-CA-AntoineNeural',
179
+ 'French (Canada)-Jean- (Male)': 'fr-CA-JeanNeural',
180
+ 'French (Switzerland)-Ariane- (Female)': 'fr-CH-ArianeNeural',
181
+ 'French (Switzerland)-Fabrice- (Male)': 'fr-CH-FabriceNeural',
182
+ 'Irish (Ireland)-Colm- (Male)': 'ga-IE-ColmNeural',
183
+ 'Irish (Ireland)-Orla- (Female)': 'ga-IE-OrlaNeural',
184
+ 'Galician (Spain)-Roi- (Male)': 'gl-ES-RoiNeural',
185
+ 'Galician (Spain)-Sabela- (Female)': 'gl-ES-SabelaNeural',
186
+ 'Gujarati (India)-Dhwani- (Female)': 'gu-IN-DhwaniNeural',
187
+ 'Gujarati (India)-Niranjan- (Male)': 'gu-IN-NiranjanNeural',
188
+ 'Hindi (India)-Madhur- (Male)': 'hi-IN-MadhurNeural',
189
+ 'Hindi (India)-Swara- (Female)': 'hi-IN-SwaraNeural',
190
+ 'Croatian (Croatia)-Gabrijela- (Female)': 'hr-HR-GabrijelaNeural',
191
+ 'Croatian (Croatia)-Srecko- (Male)': 'hr-HR-SreckoNeural',
192
+ 'Hungarian (Hungary)-Noemi- (Female)': 'hu-HU-NoemiNeural',
193
+ 'Hungarian (Hungary)-Tamas- (Male)': 'hu-HU-TamasNeural',
194
+ 'Icelandic (Iceland)-Gudrun- (Female)': 'is-IS-GudrunNeural',
195
+ 'Icelandic (Iceland)-Gunnar- (Male)': 'is-IS-GunnarNeural',
196
+ 'Javanese (Indonesia)-Dimas- (Male)': 'jv-ID-DimasNeural',
197
+ 'Javanese (Indonesia)-Siti- (Female)': 'jv-ID-SitiNeural',
198
+ 'Georgian (Georgia)-Eka- (Female)': 'ka-GE-EkaNeural',
199
+ 'Georgian (Georgia)-Giorgi- (Male)': 'ka-GE-GiorgiNeural',
200
+ 'Kazakh (Kazakhstan)-Aigul- (Female)': 'kk-KZ-AigulNeural',
201
+ 'Kazakh (Kazakhstan)-Daulet- (Male)': 'kk-KZ-DauletNeural',
202
+ 'Khmer (Cambodia)-Piseth- (Male)': 'km-KH-PisethNeural',
203
+ 'Khmer (Cambodia)-Sreymom- (Female)': 'km-KH-SreymomNeural',
204
+ 'Kannada (India)-Gagan- (Male)': 'kn-IN-GaganNeural',
205
+ 'Kannada (India)-Sapna- (Female)': 'kn-IN-SapnaNeural',
206
+ 'Lao (Laos)-Chanthavong- (Male)': 'lo-LA-ChanthavongNeural',
207
+ 'Lao (Laos)-Keomany- (Female)': 'lo-LA-KeomanyNeural',
208
+ 'Lithuanian (Lithuania)-Leonas- (Male)': 'lt-LT-LeonasNeural',
209
+ 'Lithuanian (Lithuania)-Ona- (Female)': 'lt-LT-OnaNeural',
210
+ 'Latvian (Latvia)-Everita- (Female)': 'lv-LV-EveritaNeural',
211
+ 'Latvian (Latvia)-Nils- (Male)': 'lv-LV-NilsNeural',
212
+ 'Macedonian (North Macedonia)-Aleksandar- (Male)': 'mk-MK-AleksandarNeural',
213
+ 'Macedonian (North Macedonia)-Marija- (Female)': 'mk-MK-MarijaNeural',
214
+ 'Malayalam (India)-Midhun- (Male)': 'ml-IN-MidhunNeural',
215
+ 'Malayalam (India)-Sobhana- (Female)': 'ml-IN-SobhanaNeural',
216
+ 'Mongolian (Mongolia)-Bataa- (Male)': 'mn-MN-BataaNeural',
217
+ 'Mongolian (Mongolia)-Yesui- (Female)': 'mn-MN-YesuiNeural',
218
+ 'Marathi (India)-Aarohi- (Female)': 'mr-IN-AarohiNeural',
219
+ 'Marathi (India)-Manohar- (Male)': 'mr-IN-ManoharNeural',
220
+ 'Maltese (Malta)-Grace- (Female)': 'mt-MT-GraceNeural',
221
+ 'Maltese (Malta)-Joseph- (Male)': 'mt-MT-JosephNeural',
222
+ 'Burmese (Myanmar)-Nilar- (Female)': 'my-MM-NilarNeural',
223
+ 'Burmese (Myanmar)-Thiha- (Male)': 'my-MM-ThihaNeural',
224
+ 'Nepali (Nepal)-Hemkala- (Female)': 'ne-NP-HemkalaNeural',
225
+ 'Nepali (Nepal)-Sagar- (Male)': 'ne-NP-SagarNeural',
226
+ 'Dutch (Belgium)-Arnaud- (Male)': 'nl-BE-ArnaudNeural',
227
+ 'Dutch (Belgium)-Dena- (Female)': 'nl-BE-DenaNeural',
228
+ 'Polish (Poland)-Marek- (Male)': 'pl-PL-MarekNeural',
229
+ 'Polish (Poland)-Zofia- (Female)': 'pl-PL-ZofiaNeural',
230
+ 'Pashto (Afghanistan)-Gul Nawaz- (Male)': 'ps-AF-Gul',}
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  gTTS
2
  elevenlabs
 
3
  stftpitchshift==1.5.1
4
  torchcrepe
5
  setuptools
@@ -19,4 +20,4 @@ mega.py
19
  gdown
20
  onnxruntime
21
  pyngrok==4.1.12
22
- torch
 
1
  gTTS
2
  elevenlabs
3
+ edge-tts
4
  stftpitchshift==1.5.1
5
  torchcrepe
6
  setuptools
 
20
  gdown
21
  onnxruntime
22
  pyngrok==4.1.12
23
+ torch
vc_infer_pipeline.py CHANGED
@@ -15,6 +15,14 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
15
 
16
  input_audio_path2wav = {}
17
 
 
 
 
 
 
 
 
 
18
 
19
  @lru_cache
20
  def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
@@ -312,14 +320,7 @@ class VC(object):
312
  x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
313
  )
314
  elif f0_method == "rmvpe":
315
- if hasattr(self, "model_rmvpe") == False:
316
- from rmvpe import RMVPE
317
-
318
- print("loading rmvpe model")
319
- self.model_rmvpe = RMVPE(
320
- "rmvpe.pt", is_half=self.is_half, device=self.device
321
- )
322
- f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
323
 
324
  elif "hybrid" in f0_method:
325
  # Perform hybrid median pitch estimation
 
15
 
16
  input_audio_path2wav = {}
17
 
18
+ #A fun little addition from my personal RVC branch.
19
+ #You don't have to implement it if you don't want to
20
+ from config import Config
21
+ config=Config()
22
+ from rmvpe import RMVPE
23
+ print("Preloading RMVPE model...")
24
+ model_rmvpe = RMVPE("rmvpe.pt", is_half=config.is_half, device=config.device)
25
+ del config
26
 
27
  @lru_cache
28
  def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
 
320
  x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
321
  )
322
  elif f0_method == "rmvpe":
323
+ f0 = model_rmvpe.infer_from_audio(x, thred=0.03)
 
 
 
 
 
 
 
324
 
325
  elif "hybrid" in f0_method:
326
  # Perform hybrid median pitch estimation