"""Gradio front-end for xVASynth TTS.

Builds the Blocks UI locally and forwards synthesis requests to the
``Pendrokar/xVASynth`` Space through ``gradio_client``.
"""
import html
import json
import os

import gradio as gr
from gradio_client import Client

# ---------------------------------------------------------------------------
# Voice and language tables: (label shown in the UI, value sent to the server)
# ---------------------------------------------------------------------------
voice_models = [
    ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
    ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
]

voice_models_more = [
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("👨‍🦲 #9017", "ccby_nvidia_hifi_9017_M"),
    ("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("👩‍🦰 #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("👩‍🦲 #9136", "ccby_nvidia_hifi_9136_F"),
    # v2 model for Lojban, pre-multilingual capabilities of xVASynth
    ("♟ Lojban", "x_selpahi"),
]

# order ranked by similarity to English due to the xVASynth's use of ARPAbet
# instead of IPA
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇳 HI", "hi"),
    ("🇨🇳 ZH", "zh"),
]

languages_more = [
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]

lojban_lang = [
    # There is no ISO 639-1 for Lojban, but jb is valid
    ('♟ Lojban', 'jb')
]

# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jb": ".i ca'e gusni",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}

# ---------------------------------------------------------------------------
# Component defaults (keyword arguments for the Gradio constructors)
# ---------------------------------------------------------------------------
input_textbox_init = {
    'label': "Input Text",
    'value': "This is what my voice sounds like.",
    'info': "Also accepts ARPAbet symbols placed within {} brackets.",
    'lines': 1,
    'max_lines': 5,
    'autofocus': True,
}

pacing_slider_init = {
    'value': 1.0,
    'minimum': 0.5,
    'maximum': 2.0,
    'step': 0.1,
    'label': "Duration",
}

pitch_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0.5,
    'step': 0.05,
    'label': "Pitch",
    'visible': False,
}

energy_slider_init = {
    'minimum': 0.1,
    'maximum': 1.0,
    'value': 1.0,
    'step': 0.05,
    'label': "Energy",
    'visible': False,
}

anger_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😠 Anger",
    'info': "Tread lightly beyond 0.9",
}

happy_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😃 Happiness",
    'info': "Tread lightly beyond 0.7",
}

sad_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😭 Sadness",
    'info': "Duration increased when beyond 0.2",
}

surprise_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😮 Surprise",
    'info': "Does not play well with Happiness with either being beyond 0.3",
}

voice_radio_init = {
    # The trailing "+N" entry expands the list when selected.
    'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
    'value': "ccby_nvidia_hifi_6671_M",
    'label': "Voice",
    'info': "NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
}

deepmoji_checkbox_init = {
    'label': "Use DeepMoji",
    'info': "Auto adjust emotional values for English",
    'value': True,
    'interactive': True
}

# languages component
language_radio_init = {
    # The trailing "+N" entry expands the list when selected.
    'choices': [*languages, (f'+{len(languages_more)}', 'more')],
    'value': "en",
    'label': "Language",
    'info': "Will be more monotone and have an English accent."
}

# examples component
en_examples = [
    "This is what my voice sounds like.",
    "If there is anything else you need, feel free to ask.",
    "Amazing! Could you do that again?",
    "Why, I would be more than happy to help you!",
    "That was unexpected.",
    "How dare you! . You have no right.",
    "Ahh, well, you see. There is more to it.",
    "I can't believe she is gone.",
    "Stay out of my way!!!",
    # ARPAbet example
    "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
]

en_examples_dropdown_init = {
    'choices': en_examples,
    'value': en_examples[0],
    'label': "Example dropdown",
    'show_label': False,
    'info': "English Examples",
    'visible': True
}


# ---------------------------------------------------------------------------
# Event callbacks
# ---------------------------------------------------------------------------
def more_lang_options(lang):
    """Expand the language radio to the full list when "more" is picked.

    Returns ``lang`` unchanged for any other selection; otherwise returns a
    new ``gr.Radio`` listing ``languages`` followed by ``languages_more``.
    """
    if lang != 'more':
        return lang

    radio_init = {**language_radio_init}
    radio_init['choices'] = [*languages, *languages_more]
    return gr.Radio(**radio_init)


def set_default_text(lang, deepmoji_checked):
    """Fill the input textbox with the sample sentence for ``lang``.

    Returns a ``(gr.Textbox, bool)`` pair: the refreshed textbox and the new
    DeepMoji checkbox state. The checkbox is forced off for non-English
    languages because DeepMoji only works on English text. The transient
    "more" selection keeps the English sample and the current checkbox state.
    """
    textbox_init = {**input_textbox_init}

    if lang == 'more':
        textbox_init['value'] = default_text['en']
        return gr.Textbox(**textbox_init), deepmoji_checked

    # Fall back to the English sample rather than failing on an unknown code.
    textbox_init['value'] = default_text.get(lang, default_text['en'])

    # DeepMoji only works on English Text
    # FIXME: returning a reconfigured gr.Checkbox here (info text,
    # non-interactive) conflicts with the toggle_deepmoji event listener, so
    # only the boolean value is updated.
    if lang != 'en':
        deepmoji_checked = False

    return gr.Textbox(**textbox_init), deepmoji_checked


def set_example_as_input(example_text):
    """Pass the chosen example sentence through to the input textbox."""
    return example_text


def toggle_example_dropdown(lang):
    """Show the English examples dropdown only while English is selected."""
    dropdown_init = {**en_examples_dropdown_init}
    dropdown_init['visible'] = lang == 'en'
    return gr.Dropdown(**dropdown_init)


def more_voice_options(voice):
    """Expand the voice radio to the full list when "more" is picked.

    Returns ``voice`` unchanged for any other selection; otherwise returns a
    new ``gr.Radio`` listing ``voice_models`` followed by
    ``voice_models_more``.
    """
    if voice != 'more':
        return voice

    radio_init = {**voice_radio_init}
    radio_init['choices'] = [*voice_models, *voice_models_more]
    return gr.Radio(**radio_init)


def reset_em_sliders(deepmoji_enabled, anger, happy, sad, surprise):
    """Zero the four emotion sliders while DeepMoji is enabled.

    Returns ``(0, 0, 0, 0)`` when ``deepmoji_enabled`` is truthy, otherwise
    the four given values unchanged, in (anger, happy, sad, surprise) order.
    """
    if deepmoji_enabled:
        return (0, 0, 0, 0)
    return (anger, happy, sad, surprise)


def toggle_deepmoji(checked, anger, happy, sad, surprise):
    """Checkbox handler; same contract as :func:`reset_em_sliders`."""
    return reset_em_sliders(checked, anger, happy, sad, surprise)


def set_lojban_language(voice, lang):
    """Switch the language radio to Lojban when the Lojban voice is chosen.

    Returns ``lang`` unchanged for every other voice; otherwise returns a
    ``gr.Radio`` whose choices start with Lojban followed by all languages,
    with Lojban selected.
    """
    if voice != 'x_selpahi':
        return lang

    radio_init = {**language_radio_init}
    radio_init['choices'] = [*lojban_lang, *languages, *languages_more]
    radio_init['value'] = lojban_lang[0][1]
    return gr.Radio(**radio_init)


# ---------------------------------------------------------------------------
# Server-response helpers
# ---------------------------------------------------------------------------
def _parse_server_response(response):
    """Decode the server's textual response into a dict.

    Strict JSON is tried first so that apostrophes inside values survive;
    single-quoted payloads fall back to swapping the quotes before decoding.
    """
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        return json.loads(response.replace("'", '"'))


def _build_arpabet_html(arpabet, durations):
    """Render ARPAbet symbols as a row of cells sized by relative duration.

    ``arpabet`` is a ``|``-separated symbol string and ``durations`` holds
    one duration per symbol position. Empty symbols are skipped.

    NOTE(review): the markup literals of this section were not recoverable
    from the reviewed copy. The tags below are reconstructed around the
    ``.arpabet`` CSS class and the per-symbol width percentage; confirm them
    against the deployed template.
    """
    cells = [
        (symbol, float(duration))
        for symbol, duration in zip(arpabet.split('|'), durations)
        if symbol != ''
    ]
    total_duration = sum(duration for _, duration in cells)

    parts = ['<h6>ARPAbet & Durations</h6>', '<table><tbody><tr>']
    for symbol, duration in cells:
        if total_duration > 0:
            cell_width = round(duration / total_duration * 100, 2)
        else:
            # All durations are zero: share the row equally instead of
            # dividing by zero.
            cell_width = round(100 / len(cells), 2)
        parts.append(
            f'<td class="arpabet" style="width: {cell_width}%">'
            f'{html.escape(symbol)}</td> '
        )
    parts.append('</tr></tbody></table>')
    return ''.join(parts)


_DESCRIPTION = '''
Duplicate Space for a personal CPU-run one
'''


class BlocksDemo:
    """xVASynth TTS interface; the built ``gr.Blocks`` is kept in ``block``."""

    def __init__(self, models_path, lojban_models_path):
        """Store the sample-audio path prefixes and build the interface.

        ``models_path`` and ``lojban_models_path`` are string prefixes that
        :meth:`set_default_audio` prepends to ``<voice_id>.wav``.
        """
        self.models_path = models_path
        self.lojban_models_path = lojban_models_path
        self.block = self.create_interface()

    def create_interface(self):
        """Lay out all components, wire their events and return the Blocks."""
        with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
            gr.Markdown("# xVASynth TTS")
            gr.HTML(label="description", value=_DESCRIPTION)

            with gr.Row():  # Main row for inputs and language selection
                with gr.Column():  # Input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio(**language_radio_init)

                    # remove autofocus: set_default_text reuses this dict for
                    # later textbox updates, which should not request focus.
                    input_textbox_init['autofocus'] = False

                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(
                                **en_examples_dropdown_init
                            )
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)
                with gr.Column():  # Control column
                    voice_radio = gr.Radio(**voice_radio_init)
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)

            with gr.Row():  # Emotion sliders
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)

            deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)

            # Event handling using click
            btn = gr.Button("Generate", variant="primary")

            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True
            )
            output_arpabet = gr.HTML(label="ARPAbet")

            # The four emotion sliders always travel together, in this order.
            em_sliders = [
                anger_slider,
                happy_slider,
                sad_slider,
                surprise_slider,
            ]
            em_reset_inputs = [deepmoji_checkbox, *em_sliders]

            btn.click(
                fn=self.predict,
                inputs=[
                    input_textbox,
                    voice_radio,
                    language_radio,
                    pacing_slider,
                    pitch_slider,
                    energy_slider,
                    *em_sliders,
                    deepmoji_checkbox
                ],
                outputs=[
                    output_wav,
                    output_arpabet,
                    *em_sliders,
                    # xVAServer response
                    gr.Textbox(visible=False)
                ]
            )

            # more languages option
            language_radio.change(
                more_lang_options,
                inputs=language_radio,
                outputs=language_radio,
                trigger_mode='once',
                show_progress='hidden',
            )

            # more voices option
            voice_radio.change(
                more_voice_options,
                inputs=voice_radio,
                outputs=voice_radio,
                trigger_mode='once',
                show_progress='hidden',
                queue=False,
            )

            # set default text
            language_radio.change(
                set_default_text,
                inputs=[language_radio, deepmoji_checkbox],
                outputs=[input_textbox, deepmoji_checkbox],
                show_progress='hidden',
                queue=False,
            )

            # toggle en examples
            language_radio.change(
                toggle_example_dropdown,
                inputs=language_radio,
                outputs=en_examples_dropdown,
                show_progress='hidden',
                queue=False,
            )

            en_examples_dropdown.change(
                set_example_as_input,
                inputs=[en_examples_dropdown],
                outputs=[input_textbox],
                show_progress='hidden',
                queue=False,
            )

            deepmoji_checkbox.change(
                toggle_deepmoji,
                inputs=em_reset_inputs,
                outputs=em_sliders,
                show_progress='hidden',
                queue=False,
            )

            input_textbox.change(
                reset_em_sliders,
                inputs=em_reset_inputs,
                outputs=em_sliders,
                show_progress='hidden',
                queue=False,
            )

            voice_radio.change(
                reset_em_sliders,
                inputs=em_reset_inputs,
                outputs=em_sliders,
                show_progress='hidden',
                queue=False,
            )

            # Replace output with voice audio sample
            voice_radio.change(
                self.set_default_audio,
                inputs=voice_radio,
                outputs=output_wav,
                queue=True,
                trigger_mode='once',
            )

            # Switched to Lojban voice
            voice_radio.change(
                set_lojban_language,
                inputs=[voice_radio, language_radio],
                outputs=[language_radio],
                trigger_mode='once',
                queue=True,
            )

        return demo

    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        deepmoji_checked
    ):
        """Synthesize ``input_text`` through the remote ``/predict`` endpoint.

        Uses the module-level ``client``, which is only created when this
        file runs as a script.

        Returns a list matching the click handler's outputs:
        ``[wav_path, arpabet_html, em_angry, em_happy, em_sad, em_surprise,
        response]``. For the Lojban voice the ARPAbet HTML is empty and the
        emotion values are zero. Otherwise the emotion values come from the
        server response when DeepMoji is checked, and echo the slider inputs
        when it is not.
        """
        # The server's own HTML and emotion values are bound to throwaway
        # names so they cannot shadow the slider parameters.
        (
            wav_path,
            _server_html,
            _angry,
            _happy,
            _sad,
            _surprise,
            response,
        ) = client.predict(
            input_text,  # str in 'Input Text' Textbox component
            voice,  # voice model id in 'Voice' Radio component
            lang,  # language code in 'Language' Radio component
            pacing,  # float between 0.5 and 2.0 in 'Duration' Slider
            pitch,  # float between 0 and 1.0 in 'Pitch' Slider
            energy,  # float between 0.1 and 1.0 in 'Energy' Slider
            anger,  # float between 0 and 1.0 in '😠 Anger' Slider
            happy,  # float between 0 and 1.0 in '😃 Happiness' Slider
            sad,  # float between 0 and 1.0 in '😭 Sadness' Slider
            surprise,  # float between 0 and 1.0 in '😮 Surprise' Slider
            deepmoji_checked,  # bool
            api_name="/predict"
        )

        arpabet_html = ''
        if voice == 'x_selpahi':
            # The Lojban model's response is not parsed: no ARPAbet table and
            # zeroed emotion values.
            em_angry = 0
            em_happy = 0
            em_sad = 0
            em_surprise = 0
        else:
            json_data = _parse_server_response(response)
            arpabet_html = _build_arpabet_html(
                json_data['arpabet'],
                json_data['durations'],
            )

            if deepmoji_checked:
                em_angry = round(json_data['em_angry'][0], 2)
                em_happy = round(json_data['em_happy'][0], 2)
                em_sad = round(json_data['em_sad'][0], 2)
                em_surprise = round(json_data['em_surprise'][0], 2)
            else:
                em_angry = anger
                em_happy = happy
                em_sad = sad
                em_surprise = surprise

        return [
            wav_path,
            arpabet_html,
            em_angry,
            em_happy,
            em_sad,
            em_surprise,
            response
        ]

    def set_default_audio(self, voice_id):
        """Return the sample audio for ``voice_id``.

        Returns ``None`` for the "more" pseudo-choice. When this file runs as
        a script the sample is requested from the remote
        ``/set_default_audio`` endpoint; otherwise a local path
        ``<prefix><voice_id>.wav`` is built from the configured prefix.
        """
        if voice_id == 'more':
            return None

        if voice_id == 'x_selpahi':
            sample_path = self.lojban_models_path
        else:
            sample_path = self.models_path

        if __name__ == "__main__":
            return client.predict(
                voice_id,
                api_name="/set_default_audio"
            )

        return sample_path + voice_id + '.wav'


if __name__ == "__main__":
    print('running Gradio interface')
    client = Client("Pendrokar/xVASynth")
    demo = BlocksDemo('', '')
    demo.block.launch()