# Quenya-TTS / run_controllable_GUI.py
# AnnieZzz's picture
# Update app.py and requirements.txt
# cd4e2cb verified
import gradio as gr
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm
class TTSWebUI:
    """Gradio front end around ``ControllableInterface``.

    Constructing an instance builds the ``gr.Interface`` and launches it
    right away; ``read`` is the callback Gradio invokes for every request.
    """

    # Single source of truth for both dropdowns, so the text-language and
    # accent lists cannot drift apart.
    _LANGUAGES = (
        "English",
        "German",
        "Greek",
        "Spanish",
        "Finnish",
        "Russian",
        "Hungarian",
        "Dutch",
        "French",
        "Polish",
        "Portuguese",
        "Italian",
        "Chinese",
        "Vietnamese",
    )
    _TEXT_CHOICES = [f"{language_name} Text" for language_name in _LANGUAGES]
    _ACCENT_CHOICES = [f"{language_name} Accent" for language_name in _LANGUAGES]

    # Voice seed that is pre-selected when the slider range allows it.
    _PREFERRED_VOICE_SEED = 279

    @staticmethod
    def _default_voice_seed(available_artificial_voices):
        """Return the preferred seed, clamped to ``[0, available_artificial_voices]``.

        The seed slider's maximum is caller-controlled; without clamping, a
        small ``available_artificial_voices`` would leave the slider's initial
        value outside its own range.
        """
        return max(0, min(TTSWebUI._PREFERRED_VOICE_SEED, available_artificial_voices))

    def __init__(self, gpu_id="cpu", title="Controllable Embeddings", article="", available_artificial_voices=1000):
        """Build the interface and launch the web server.

        Args:
            gpu_id: Forwarded to ``ControllableInterface`` as ``gpu_id``.
            title: Title shown at the top of the Gradio page.
            article: Text passed to ``gr.Interface`` as ``article``.
            available_artificial_voices: Forwarded to ``ControllableInterface``
                and used as the upper bound of the voice seed slider.
        """
        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices)

        # NOTE: the order of these components must match the positional
        # parameters of ``read`` (after ``self``), including the two
        # caption text boxes that ``read`` receives and ignores.
        input_components = [
            gr.Textbox(lines=2,
                       placeholder="write what you want the synthesis to read here...",
                       value="Colorless green ideas sleep furiously!",
                       label="Text input"),
            gr.Dropdown(list(self._TEXT_CHOICES),
                        type="value",
                        value='English Text',
                        label="Select the Language of the Text"),
            gr.Dropdown(list(self._ACCENT_CHOICES),
                        type="value",
                        value='English Accent',
                        label="Select the Accent of the Speaker"),
            # Text box used purely as a section caption inside the form.
            gr.Textbox(lines=3,
                       placeholder="\nThe sliders below control the speaker embedding",
                       value="\nThe sliders below control the speaker embedding",
                       label=" ",
                       show_label=False),
            gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                      value=self._default_voice_seed(available_artificial_voices),
                      label="Random Seed for the artificial Voice"),
            gr.Slider(minimum=-50.0, maximum=50.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
            gr.Slider(minimum=-30.0, maximum=30.0, step=0.1, value=0.0, label="Sibilance"),
            gr.Slider(minimum=-30.0, maximum=30.0, step=0.1, value=0.0, label="Accentuated High / Low Frequencies"),
            gr.Slider(minimum=-30.0, maximum=30.0, step=0.1, value=0.0, label="Loudness / Arousal / Calmness"),
            gr.Slider(minimum=-20.0, maximum=20.0, step=0.1, value=0.0, label="Tone / Timbre"),
            # Second caption-only text box.
            gr.Textbox(lines=3,
                       placeholder="\nThe sliders below directly control the TTS",
                       value="\nThe sliders below directly control the TTS",
                       label=" ",
                       show_label=False),
            gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Duration Scale"),
            gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Pause Duration Scale"),
            gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Pitch Variance Scale"),
            gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Energy Variance Scale"),
        ]

        self.iface = gr.Interface(fn=self.read,
                                  inputs=input_components,
                                  outputs=[gr.Audio(type="numpy", label="Speech"),
                                           gr.Image(label="Visualization")],
                                  title=title,
                                  theme="default",
                                  allow_flagging="never",
                                  article=article)
        # ``launch(enable_queue=True)`` is deprecated in Gradio 3.x and the
        # keyword no longer exists in 4.x; ``queue()`` is the supported way
        # to enable request queueing on both.
        self.iface.queue().launch()

    def read(self,
             prompt,
             language,
             accent,
             ignore_1,
             voice_seed,
             emb1,
             emb2,
             emb3,
             emb5,
             emb6,
             ignore_2,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale):
        """Gradio callback: synthesize ``prompt`` with the selected controls.

        Parameters arrive positionally in the order of the input components
        declared in ``__init__``.

        Args:
            prompt: Text to be read.
            language: Selected entry of the text-language dropdown.
            accent: Selected entry of the accent dropdown.
            ignore_1: Content of the first caption text box; unused.
            voice_seed: Value of the voice seed slider.
            emb1, emb2, emb3, emb5, emb6: Values of the speaker embedding
                sliders. There is no ``emb4``; that slot is always 0.0.
            ignore_2: Content of the second caption text box; unused.
            duration_scaling_factor: Value of the "Duration Scale" slider.
            pause_duration_scaling_factor: Value of the "Pause Duration Scale" slider.
            pitch_variance_scale: Value of the "Pitch Variance Scale" slider.
            energy_variance_scale: Value of the "Energy Variance Scale" slider.

        Returns:
            A pair ``((sr, pcm), fig)`` matching the two output components:
            ``sr`` and ``fig`` are passed through from
            ``ControllableInterface.read`` and ``pcm`` is its waveform
            converted with ``float2pcm``.
        """
        sr, wav, fig = self.controllable_ui.read(prompt,
                                                 language,
                                                 accent,
                                                 voice_seed,
                                                 duration_scaling_factor,
                                                 pause_duration_scaling_factor,
                                                 pitch_variance_scale,
                                                 energy_variance_scale,
                                                 emb1,
                                                 emb2,
                                                 emb3,
                                                 0.0,  # slider 4 did not have a meaningful interpretation, too many properties mixed
                                                 emb5,
                                                 emb6)
        return (sr, float2pcm(wav)), fig
if __name__ == "__main__":
    # Script entry point: constructing the UI also launches it, on CPU.
    TTSWebUI(gpu_id="cpu")