|
|
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
import torch.nn as nn |
|
import audiofile |
|
from tts import StyleTTS2 |
|
from textual import only_greek_or_only_latin, transliterate_number, fix_vocals |
|
import textwrap |
|
|
|
from audionar import VitsModel, VitsTokenizer |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Display names of the languages served by the Facebook MMS (VITS) backend.
# Any dropdown choice NOT in this list is treated as a StyleTTS2 reference
# voice name instead (see audionar_tts).
language_names = [
    'Ancient greek',
    'English',
    'Deutsch',
    'French',
    'Hungarian',
    'Romanian',
    'Serbian (Approx.)',
]
|
|
|
|
|
def audionar_tts(text=None,
                 lang='Romanian'):
    """Synthesize ``text`` to a wav file and return the file path.

    Routing
    -------
    * ``lang`` in ``language_names`` -> Facebook MMS VITS model for that
      language (written at 16 kHz).
    * otherwise ``lang`` is treated as a StyleTTS2 reference-voice name and
      ``'wav/<lang>.wav'`` is used as the style reference (written at
      24 kHz — presumably StyleTTS2's native rate; TODO confirm).

    Parameters
    ----------
    text : str or None
        Text to speak; a placeholder sentence is substituted when empty/None.
    lang : str
        Either an entry of ``language_names`` or a voice name from ``VOICES``.

    Returns
    -------
    str
        Path of the written wav file (always ``'_vits_.wav'``, overwritten
        on every call).
    """
    # UI language name (lowercased) -> MMS model-id suffix (ISO-639-3 style).
    lang_map = {
        'ancient greek': 'grc',
        'english': 'eng',
        'deutsch': 'deu',
        'french': 'fra',
        'hungarian': 'hun',
        'romanian': 'ron',
        'serbian (approx.)': 'rmc-script_latin',
    }

    # Never feed the models an empty string.
    if text is None or not text.strip():
        text = 'No Txt Has been typed'

    fs = 16000  # MMS VITS output sampling rate
    if lang not in language_names:
        # ``lang`` is a StyleTTS2 reference-voice name, not a language.
        fs = 24000
        text = only_greek_or_only_latin(text, lang='eng')
        x = _tts.inference(text,
                           ref_s='wav/' + lang + '.wav')[0, 0, :].numpy()
    else:
        # Fall back to the first word of the lowercased name if it is not
        # in the map (e.g. a future language added only to the dropdown).
        lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())

        # Cache the (large) MMS model & tokenizer across calls at module
        # level; reload only when the requested language changes.
        global cached_lang_code, cached_net_g, cached_tokenizer
        if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
            cached_lang_code = lang_code
            cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
            cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

        # Language-specific text normalisation (script filtering, number
        # transliteration, vowel fixes).
        text = only_greek_or_only_latin(text, lang=lang_code)
        text = transliterate_number(text, lang=lang_code)
        text = fix_vocals(text, lang=lang_code)

        # Chunk long inputs so each model call sees a bounded-length string,
        # then concatenate the generated waveform pieces.
        total_audio_parts = []
        for sentence in textwrap.wrap(text, width=439):
            inputs = cached_tokenizer(sentence, return_tensors="pt")
            with torch.no_grad():
                audio_part = cached_net_g(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    lang_code=lang_code,
                )[0, :]
            total_audio_parts.append(audio_part)
        x = torch.cat(total_audio_parts).cpu().numpy()

    wavfile = '_vits_.wav'
    audiofile.write(wavfile, x, fs)
    return wavfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# StyleTTS2 reference-voice recordings (filenames under wav/); the '.wav'
# suffix is stripped below so the dropdown shows bare voice names.
VOICES = [
    'jv_ID_google-gmu_04982.wav',
    'en_US_vctk_p303.wav',
    'en_US_vctk_p306.wav',
    'en_US_vctk_p318.wav',
    'en_US_vctk_p269.wav',
    'en_US_vctk_p316.wav',
    'en_US_vctk_p362.wav',
    'fr_FR_tom.wav',
    'bn_multi_5958.wav',
    'en_US_vctk_p287.wav',
    'en_US_vctk_p260.wav',
    'en_US_cmu_arctic_fem.wav',
    'en_US_cmu_arctic_rms.wav',
    'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
    'en_US_vctk_p237.wav',
    'en_US_vctk_p317.wav',
    'tn_ZA_google-nwu_0378.wav',
    'nl_pmk.wav',
    'tn_ZA_google-nwu_3342.wav',
    'ne_NP_ne-google_3997.wav',
    'tn_ZA_google-nwu_8914.wav',
    'en_US_vctk_p238.wav',
    'en_US_vctk_p275.wav',
    'af_ZA_google-nwu_0184.wav',
    'af_ZA_google-nwu_8148.wav',
    'en_US_vctk_p326.wav',
    'en_US_vctk_p264.wav',
    'en_US_vctk_p295.wav',
    'en_US_vctk_p294.wav',
    'en_US_vctk_p330.wav',
    'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
    'jv_ID_google-gmu_05219.wav',
    'en_US_vctk_p284.wav',
    'en_US_m-ailabs_mary_ann.wav',
    'bn_multi_01701.wav',
    'en_US_vctk_p262.wav',
    'en_US_vctk_p243.wav',
    'en_US_vctk_p278.wav',
    'en_US_vctk_p250.wav',
    'nl_femal.wav',
    'en_US_vctk_p228.wav',
    'ne_NP_ne-google_0649.wav',
    'en_US_cmu_arctic_gka.wav',
    'en_US_vctk_p361.wav',
    'jv_ID_google-gmu_02326.wav',
    'tn_ZA_google-nwu_1932.wav',
    'de_DE_thorsten-emotion_amused.wav',
    'jv_ID_google-gmu_08002.wav',
    'tn_ZA_google-nwu_3629.wav',
    'en_US_vctk_p230.wav',
    'af_ZA_google-nwu_7214.wav',
    'nl_nathalie.wav',
    'en_US_cmu_arctic_lnh.wav',
    'tn_ZA_google-nwu_6459.wav',
    'tn_ZA_google-nwu_6206.wav',
    'en_US_vctk_p323.wav',
    'en_US_m-ailabs_judy_bieber.wav',
    'en_US_vctk_p261.wav',
    'fa_haaniye.wav',
    'tn_ZA_google-nwu_7896.wav',
    'en_US_vctk_p258.wav',
    'tn_ZA_google-nwu_7674.wav',
    'en_US_hifi-tts_6097.wav',
    'en_US_vctk_p304.wav',
    'en_US_vctk_p307.wav',
    'fr_FR_m-ailabs_bernard.wav',
    'en_US_cmu_arctic_jmk.wav',
    'ne_NP_ne-google_0283.wav',
    'en_US_vctk_p246.wav',
    'en_US_vctk_p276.wav',
    'style_o22050.wav',
    'en_US_vctk_s5.wav',
    'en_US_vctk_p268.wav',
    'af_ZA_google-nwu_8924.wav',
    'en_US_vctk_p363.wav',
    'ne_NP_ne-google_3614.wav',
    'ne_NP_ne-google_3154.wav',
    'en_US_cmu_arctic_eey.wav',
    'tn_ZA_google-nwu_2839.wav',
    'af_ZA_google-nwu_7130.wav',
    'ne_NP_ne-google_2139.wav',
    'jv_ID_google-gmu_04715.wav',
    'en_US_vctk_p273.wav',
]
# Drop the 4-character '.wav' extension from every entry.
VOICES = [voice_file[:-len('.wav')] for voice_file in VOICES]
|
|
|
# Instantiate the StyleTTS2 model once at import time, pinned to CPU;
# audionar_tts reads this module-level `_tts` for the reference-voice branch.
_tts = StyleTTS2().to('cpu')

# ---- Gradio UI: one TTS tab wired to audionar_tts, one links tab. ----
with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        with gr.Tab("TTS"):
            with gr.Column():
                # Free-text input; default value is a Greek sample sentence.
                text_input = gr.Textbox(
                    label="Type text for TTS:",
                    placeholder="Accepts Latin / Cyrillic / Greek",
                    lines=4,
                    value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.')
                # Single dropdown mixing two kinds of choices: MMS language
                # names and StyleTTS2 voice names — audionar_tts routes on
                # membership in language_names.
                choice_dropdown = gr.Dropdown(
                    choices=language_names + VOICES,
                    label="Vox",
                    value=language_names[0])
                generate_button = gr.Button("Produce Audio", variant="primary")
                output_audio = gr.Audio(label=".wav File")
                # Clicking the button synthesizes and returns a wav file path.
                generate_button.click(
                    fn=audionar_tts,
                    inputs=[text_input, choice_dropdown],
                    outputs=[output_audio])

        with gr.Tab(label="Videos"):
            gr.Markdown('''<a href="https://huggingface.co/dkounadis/artificial-styletts2">Full Code and Videos</a>''')

# debug=True surfaces server-side errors in the UI/console.
demo.launch(debug=True)
|
|