Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import os | |
import json | |
import gradio as gr | |
from gradio_client import Client | |
# Voices listed up front in the "Voice" radio. Each entry is a
# (display label, voice model id) pair.
voice_models = [
    ("👨🦳 #6671", "ccby_nvidia_hifi_6671_M"),
    ("👱♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
]
# Additional voices; more_voice_options() adds these to the radio choices
# once the "+N" placeholder entry is selected.
voice_models_more = [
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("👨🦲 #9017", "ccby_nvidia_hifi_9017_M"),
    ("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("👩🦰 #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("👩🦲 #9136", "ccby_nvidia_hifi_9136_F"),
    ("♟ Lojban", "x_selpahi"), # v2 model for Lojban, pre-multilingual capabilities of xVASynth
]
# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
# Each entry is a (display label, language code) pair; the code is the value
# the language radio passes to the event handlers and to predict().
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇳 HI", "hi"),
    ("🇨🇳 ZH", "zh"),
]
# Languages revealed by more_lang_options() when the "+N" placeholder entry
# is selected.
languages_more = [
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]
# Single-entry list so it can be unpacked into the radio choices alongside
# the other language lists (see set_lojban_language).
lojban_lang = [
    # There is no ISO 639-1 for Lojban, but jb is valid
    ('♟ Lojban', 'jb')
]
# Translated from English by DeepMind's Gemini Pro
# Sample sentence per language code; set_default_text() looks the selected
# language up here to refill the input textbox.
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jb": ".i ca'e gusni",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
# Component defaults
# Each dict below is unpacked as keyword arguments into the matching Gradio
# component constructor (gr.Textbox / gr.Slider / gr.Radio / gr.Checkbox).

# Input textbox; 'value' repeats the English sample sentence.
input_textbox_init = {
    'label': "Input Text",
    'value': "This is what my voice sounds like.",
    'info': "Also accepts ARPAbet symbols placed within {} brackets.",
    'lines': 1,
    'max_lines': 5,
    'autofocus': True,
}
# Shown to the user as "Duration".
pacing_slider_init = {
    'value': 1.0,
    'minimum': 0.5,
    'maximum': 2.0,
    'step': 0.1,
    'label': "Duration",
}
# Pitch and energy sliders are created hidden ('visible': False).
pitch_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0.5,
    'step': 0.05,
    'label': "Pitch",
    'visible': False,
}
energy_slider_init = {
    'minimum': 0.1,
    'maximum': 1.0,
    'value': 1.0,
    'step': 0.05,
    'label': "Energy",
    'visible': False,
}
# Emotion sliders: all range over 0..1 in 0.05 steps and start at 0.
anger_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😠 Anger",
    'info': "Tread lightly beyond 0.9",
}
happy_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😃 Happiness",
    'info': "Tread lightly beyond 0.7",
}
sad_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😭 Sadness",
    'info': "Duration increased when beyond 0.2",
}
surprise_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😮 Surprise",
    'info': "Does not play well with Happiness with either being beyond 0.3",
}
# Voice radio: the short voice list plus a "+N" placeholder whose value is
# 'more' (N = number of entries in voice_models_more).
voice_radio_init = {
    'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
    'value': "ccby_nvidia_hifi_6671_M",
    'label': "Voice",
    'info': "NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
}
# DeepMoji checkbox, ticked by default.
deepmoji_checkbox_init = {
    'label': "Use DeepMoji",
    'info': "Auto adjust emotional values for English",
    'value': True,
    'interactive': True
}
def more_lang_options(lang):
    """Expand the language radio when its "+N" placeholder is selected.

    An ordinary language code is handed back unchanged. The placeholder
    value 'more' instead yields a new ``gr.Radio`` built from the default
    radio settings with every language (short list plus extended list) as
    its choices.
    """
    if lang == 'more':
        every_language = languages + languages_more
        return gr.Radio(**{**language_radio_init, 'choices': every_language})
    return lang
def set_default_text(lang, deepmoji_checked):
    """Refill the input textbox with the selected language's sample sentence.

    Args:
        lang: Value of the language radio: a language code, or 'more' for
            the "+N" placeholder entry.
        deepmoji_checked: Current state of the DeepMoji checkbox.

    Returns:
        A ``(gr.Textbox, bool)`` pair. The textbox is rebuilt from
        ``input_textbox_init`` with the sample sentence as its value. The
        bool is the new DeepMoji checkbox state: unchanged for English and
        for the 'more' placeholder, ``False`` for every other language.
    """
    textbox_init = {**input_textbox_init}

    if lang == 'more':
        # The placeholder is not a real language: keep the English sample
        # and leave the checkbox as it was.
        textbox_init['value'] = default_text['en']
        return gr.Textbox(**textbox_init), deepmoji_checked

    # Fall back to the English sample instead of raising KeyError when a
    # language code has no entry in default_text.
    textbox_init['value'] = default_text.get(lang, default_text['en'])

    # DeepMoji only works on English text, so untick it for anything else.
    # FIXME: also disabling the checkbox here (info text / interactive=False)
    # conflicts with the toggle_deepmoji event listener.
    if lang != 'en':
        deepmoji_checked = False

    return gr.Textbox(**textbox_init), deepmoji_checked
# examples component
# English example sentences offered in the examples dropdown.
en_examples = [
    "This is what my voice sounds like.",
    "If there is anything else you need, feel free to ask.",
    "Amazing! Could you do that again?",
    "Why, I would be more than happy to help you!",
    "That was unexpected.",
    "How dare you! . You have no right.",
    "Ahh, well, you see. There is more to it.",
    "I can't believe she is gone.",
    "Stay out of my way!!!",
    # ARPAbet example
    "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
]
# Keyword arguments for the examples gr.Dropdown; the first example is
# preselected and the component label is hidden in favour of the info text.
en_examples_dropdown_init = {
    'choices': en_examples,
    'value': en_examples[0],
    'label': "Example dropdown",
    'show_label': False,
    'info': "English Examples",
    'visible': True
}
def set_example_as_input(example_text):
    """Pass the chosen example through unchanged.

    Wired as the examples dropdown's change handler, with the input textbox
    as its output, so the selected sentence becomes the input text.
    """
    selected = example_text
    return selected
def toggle_example_dropdown(lang):
    """Show the English examples dropdown only while English is selected.

    Returns a ``gr.Dropdown`` built from the default dropdown settings with
    ``visible`` set to whether ``lang`` equals 'en'.
    """
    is_english = (lang == 'en')
    return gr.Dropdown(**{**en_examples_dropdown_init, 'visible': is_english})
def more_voice_options(voice):
    """Expand the voice radio when its "+N" placeholder is selected.

    A real voice id is handed back unchanged. The placeholder value 'more'
    instead yields a new ``gr.Radio`` built from the default radio settings
    with every voice (short list plus extended list) as its choices.
    """
    if voice == 'more':
        every_voice = voice_models + voice_models_more
        return gr.Radio(**{**voice_radio_init, 'choices': every_voice})
    return voice
def reset_em_sliders(deepmoji_enabled, anger, happy, sad, surprise):
    """Zero the four emotion values while DeepMoji is enabled.

    Returns an ``(anger, happy, sad, surprise)`` tuple: all zeros when
    ``deepmoji_enabled`` is truthy, otherwise the values that were passed in.
    """
    current = (anger, happy, sad, surprise)
    return (0, 0, 0, 0) if deepmoji_enabled else current
def toggle_deepmoji(checked, anger, happy, sad, surprise):
    """Change handler for the DeepMoji checkbox.

    When the box is unticked the four emotion values are returned as they
    are; when it is ticked they are all reset to zero. The result is an
    ``(anger, happy, sad, surprise)`` tuple.
    """
    if not checked:
        return (anger, happy, sad, surprise)
    return (0,) * 4
# languages component
# Keyword arguments for the "Language" gr.Radio: the short language list plus
# a "+N" placeholder whose value is 'more' (N = len(languages_more)).
language_radio_init = dict(
    choices=languages + [(f'+{len(languages_more)}', 'more')],
    value="en",
    label="Language",
    info="Will be more monotone and have an English accent.",
)
def set_lojban_language(voice, lang):
    """Switch the language radio to Lojban when the Lojban voice is chosen.

    For any other voice the current ``lang`` is returned untouched. For the
    'x_selpahi' voice a new ``gr.Radio`` is returned whose choices are the
    Lojban entry followed by every other language, with Lojban selected.
    """
    if voice == 'x_selpahi':
        lojban_code = lojban_lang[0][1]
        all_choices = lojban_lang + languages + languages_more
        return gr.Radio(**{
            **language_radio_init,
            'choices': all_choices,
            'value': lojban_code,
        })
    return lang
# HTML snippet rendered under the page title by create_interface(): badge
# links (GitHub, Nexus Mods, Discord) and a "Duplicate Space" link.
_DESCRIPTION = '''
<div>
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
<a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.4k-blue?logo=nexusmods'/></a>
<a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
<span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
</div>
'''
class BlocksDemo:
    """Gradio Blocks front-end that forwards synthesis requests to a remote client.

    The UI is built once in ``__init__`` and exposed as ``self.block``.
    ``predict`` and (when run as a script) ``set_default_audio`` call the
    module-level ``client`` object, which must exist before they are invoked.
    """

    def __init__(self, models_path, lojban_models_path, robotic_models_path):
        """Store the sample-audio path prefixes and build the interface.

        Args:
            models_path: Path prefix for the regular voices' sample audio.
            lojban_models_path: Path prefix for the Lojban voice's sample.
            robotic_models_path: Path prefix for the robotic voice's sample;
                a non-empty value also adds the robot voice to the choices.
        """
        self.models_path = models_path
        self.lojban_models_path = lojban_models_path
        self.robotic_models_path = robotic_models_path

        if robotic_models_path != '':
            # insert robotic voice as the third option
            robot_voice = ("🤖 Robot", "cnc_cabal")
            # These are module-level lists; guard against adding the entry
            # again when the class is instantiated more than once.
            if robot_voice not in voice_models:
                voice_models.append(robot_voice)
            if robot_voice not in voice_radio_init['choices']:
                voice_radio_init['choices'].insert(2, robot_voice)

        self.block = self.create_interface()

    def create_interface(self):
        """Build the Blocks layout, wire up all event listeners and return it."""
        with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
            gr.Markdown("# xVASynth TTS")
            gr.HTML(label="description", value=_DESCRIPTION)

            with gr.Row():  # Main row for inputs and language selection
                with gr.Column():  # Input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio(**language_radio_init)

                    # remove autofocus
                    # (textboxes rebuilt later from these defaults should
                    # not grab focus again)
                    input_textbox_init['autofocus'] = False

                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)

                with gr.Column():  # Control column
                    voice_radio = gr.Radio(**voice_radio_init)
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)

            with gr.Row():  # Emotion sliders, two per column
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)

            deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)

            # Event handling using click
            btn = gr.Button("Generate", variant="primary")

            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True
            )
            output_arpabet = gr.HTML(label="ARPAbet")

            # The same four sliders, in this order, are read and written by
            # several listeners below.
            emotion_sliders = [
                anger_slider,
                happy_slider,
                sad_slider,
                surprise_slider,
            ]

            btn.click(
                fn=self.predict,
                inputs=[
                    input_textbox,
                    voice_radio,
                    language_radio,
                    pacing_slider,
                    pitch_slider,
                    energy_slider,
                    *emotion_sliders,
                    deepmoji_checkbox
                ],
                outputs=[
                    output_wav,
                    output_arpabet,
                    *emotion_sliders,
                    # xVAServer response
                    gr.Textbox(visible=False)
                ]
            )

            # more languages option
            language_radio.change(
                more_lang_options,
                inputs=language_radio,
                outputs=language_radio,
                trigger_mode='once',
                show_progress='hidden',
            )

            # more voices option
            voice_radio.change(
                more_voice_options,
                inputs=voice_radio,
                outputs=voice_radio,
                trigger_mode='once',
                show_progress='hidden',
                queue=False,
            )

            # set default text
            language_radio.change(
                set_default_text,
                inputs=[language_radio, deepmoji_checkbox],
                outputs=[input_textbox, deepmoji_checkbox],
                show_progress='hidden',
                queue=False,
            )

            # toggle en examples
            language_radio.change(
                toggle_example_dropdown,
                inputs=language_radio,
                outputs=en_examples_dropdown,
                show_progress='hidden',
                queue=False,
            )

            en_examples_dropdown.change(
                set_example_as_input,
                inputs=[en_examples_dropdown],
                outputs=[input_textbox],
                show_progress='hidden',
                queue=False,
            )

            deepmoji_checkbox.change(
                toggle_deepmoji,
                inputs=[deepmoji_checkbox, *emotion_sliders],
                outputs=[*emotion_sliders],
                show_progress='hidden',
                queue=False,
            )

            input_textbox.change(
                reset_em_sliders,
                inputs=[deepmoji_checkbox, *emotion_sliders],
                outputs=[*emotion_sliders],
                show_progress='hidden',
                queue=False,
            )

            voice_radio.change(
                reset_em_sliders,
                inputs=[deepmoji_checkbox, *emotion_sliders],
                outputs=[*emotion_sliders],
                show_progress='hidden',
                queue=False,
            )

            # Replace output with voice audio sample
            voice_radio.change(
                self.set_default_audio,
                inputs=voice_radio,
                outputs=output_wav,
                queue=True,
                trigger_mode='once',
            )

            # Switched to Lojban voice
            voice_radio.change(
                set_lojban_language,
                inputs=[voice_radio, language_radio],
                outputs=[language_radio],
                trigger_mode='once',
                queue=True,
            )

        return demo

    @staticmethod
    def _render_arpabet_table(json_data):
        """Render the ARPAbet symbols as one table row sized by duration.

        ``json_data['arpabet']`` is a '|'-separated symbol string and
        ``json_data['durations']`` is indexed in step with it. '<PAD>'
        symbols are skipped; every other symbol gets a cell whose width is
        its share of the summed durations, as a percentage.
        """
        from html import escape

        symbols = json_data['arpabet'].split('|')
        durations = json_data['durations']
        cells = [
            (symbol, float(durations[idx]))
            for idx, symbol in enumerate(symbols)
            if symbol != '<PAD>'
        ]
        total_duration = sum(duration for _, duration in cells)

        parts = [
            '<h6>ARPAbet & Durations</h6>',
            '<table style="margin: 0 var(--size-2)"><tbody><tr>',
        ]
        for symbol, duration in cells:
            # Avoid ZeroDivisionError when every duration is zero.
            cell_width = round(duration / total_duration * 100, 2) if total_duration else 0
            # Symbols come from the server response; escape them so they
            # cannot inject markup into the page.
            parts.append(
                f'<td class="arpabet" style="width: {cell_width}%">{escape(symbol)}</td> '
            )
        parts.append('</tr></tbody></table>')
        return ''.join(parts)

    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        deepmoji_checked
    ):
        """Forward a synthesis request to the remote client and format the result.

        Args:
            input_text: Text from the 'Input Text' textbox.
            voice: Voice model id from the 'Voice' radio.
            lang: Language code from the 'Language' radio.
            pacing: 'Duration' slider value.
            pitch: 'Pitch' slider value.
            energy: 'Energy' slider value.
            anger, happy, sad, surprise: Emotion slider values.
            deepmoji_checked: Whether the DeepMoji checkbox is ticked.

        Returns:
            ``[wav_path, arpabet_html, anger, happy, sad, surprise, response]``.
            For the Lojban voice the HTML is empty and the emotions are 0.
            Otherwise the emotions are the DeepMoji values from the response
            (rounded to 2 decimals) when DeepMoji is ticked, or the caller's
            own slider values when it is not.
        """
        # Only the audio path and the raw response are used from the reply;
        # the remaining fields are bound to throwaway names so they do not
        # overwrite the caller's slider values.
        wav_path, _html, _angry, _happy, _sad, _surprise, response = client.predict(
            input_text,
            voice,
            lang,
            pacing,
            pitch,
            energy,
            anger,
            happy,
            sad,
            surprise,
            deepmoji_checked,
            api_name="/predict"
        )

        if voice == 'x_selpahi':
            # The Lojban branch never parses the response: no ARPAbet table
            # and all emotion values zero.
            return [wav_path, '', 0, 0, 0, 0, response]

        # NOTE(review): swapping quote characters assumes the response is a
        # dict repr without apostrophes inside its values - confirm against
        # the server.
        json_data = json.loads(response.replace("'", '"'))
        arpabet_html = self._render_arpabet_table(json_data)

        if deepmoji_checked:
            emotions = [
                round(json_data[key][0], 2)
                for key in ('em_angry', 'em_happy', 'em_sad', 'em_surprise')
            ]
        else:
            emotions = [anger, happy, sad, surprise]

        return [wav_path, arpabet_html, *emotions, response]

    def set_default_audio(self, voice_id):
        """Return the sample audio for ``voice_id``, or None for the "+N" entry.

        When the module runs as a script the sample is requested from the
        remote client; otherwise it is the path
        ``<prefix> + voice_id + '.wav'``, where the prefix depends on whether
        the voice is the Lojban, the robotic or a regular one.
        """
        if voice_id == 'more':
            return None

        if voice_id == 'x_selpahi':
            sample_path = self.lojban_models_path
        elif voice_id == 'cnc_cabal':
            sample_path = self.robotic_models_path
        else:
            sample_path = self.models_path

        if __name__ == "__main__":
            return client.predict(
                voice_id,
                api_name="/set_default_audio"
            )

        return sample_path + voice_id + '.wav'
# Script entry point: connect to the hosted Space and launch the UI.
if __name__ == "__main__":
    print('running Gradio interface')
    # Module-level name that BlocksDemo.predict and
    # BlocksDemo.set_default_audio look up when they call the remote API.
    client = Client("Pendrokar/xVASynth")
    # Empty path prefixes; an empty robotic path also means the robot voice
    # is not added to the voice choices.
    demo = BlocksDemo('', '', '')
    demo.block.launch()