|
import os |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
|
|
from InferenceInterfaces.Meta_FastSpeech2 import Meta_FastSpeech2 |
|
|
|
os.system("pip uninstall -y gradio") |
|
os.system("pip install gradio==2.7.5.2") |
|
|
|
|
|
def float2pcm(sig, dtype='int16'): |
|
""" |
|
https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182 |
|
""" |
|
sig = np.asarray(sig) |
|
if sig.dtype.kind != 'f': |
|
raise TypeError("'sig' must be a float array") |
|
dtype = np.dtype(dtype) |
|
if dtype.kind not in 'iu': |
|
raise TypeError("'dtype' must be an integer type") |
|
i = np.iinfo(dtype) |
|
abs_max = 2 ** (i.bits - 1) |
|
offset = i.min + abs_max |
|
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) |
|
|
|
|
|
class TTS_Interface: |
|
|
|
def __init__(self): |
|
self.device = "cuda" if torch.cuda.is_available() else "cpu" |
|
self.model = Meta_FastSpeech2(device=self.device) |
|
self.current_speaker = "English Speaker's Voice" |
|
self.current_language = "English" |
|
self.current_accent = "English" |
|
self.language_id_lookup = { |
|
"English" : "en", |
|
"German" : "de", |
|
"Greek" : "el", |
|
"Spanish" : "es", |
|
"Finnish" : "fi", |
|
"Russian" : "ru", |
|
"Hungarian" : "hu", |
|
"Dutch" : "nl", |
|
"French" : "fr", |
|
'Polish' : "pl", |
|
'Portuguese': "pt", |
|
'Italian' : "it", |
|
} |
|
self.speaker_path_lookup = { |
|
"English Speaker's Voice" : "reference_audios/english.wav", |
|
"German Speaker's Voice" : "reference_audios/german.wav", |
|
"Greek Speaker's Voice" : "reference_audios/greek.wav", |
|
"Spanish Speaker's Voice" : "reference_audios/spanish.wav", |
|
"Finnish Speaker's Voice" : "reference_audios/finnish.wav", |
|
"Russian Speaker's Voice" : "reference_audios/russian.wav", |
|
"Hungarian Speaker's Voice" : "reference_audios/hungarian.wav", |
|
"Dutch Speaker's Voice" : "reference_audios/dutch.wav", |
|
"French Speaker's Voice" : "reference_audios/french.wav", |
|
"Polish Speaker's Voice" : "reference_audios/polish.flac", |
|
"Portuguese Speaker's Voice": "reference_audios/portuguese.flac", |
|
"Italian Speaker's Voice" : "reference_audios/italian.flac", |
|
} |
|
self.model.set_utterance_embedding(self.speaker_path_lookup[self.current_speaker]) |
|
|
|
|
|
def read(self, prompt, language, accent, speaker): |
|
language = language.split()[0] |
|
accent = accent.split()[0] |
|
if self.current_language != language: |
|
self.model.set_phonemizer_language(self.language_id_lookup[language]) |
|
self.current_language = language |
|
if self.current_accent != accent: |
|
self.model.set_accent_language(self.language_id_lookup[accent]) |
|
self.current_accent = accent |
|
if self.current_speaker != speaker: |
|
self.model.set_utterance_embedding(self.speaker_path_lookup[speaker]) |
|
self.current_speaker = speaker |
|
|
|
phones = self.model.text2phone.get_phone_string(prompt) |
|
if len(phones) > 1800: |
|
if language == "English": |
|
prompt = "Your input was too long. Please try either a shorter text or split it into several parts." |
|
elif language == "German": |
|
prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf." |
|
elif language == "Greek": |
|
prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη." |
|
elif language == "Spanish": |
|
prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes." |
|
elif language == "Finnish": |
|
prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan." |
|
elif language == "Russian": |
|
prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей." |
|
elif language == "Hungarian": |
|
prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre." |
|
elif language == "Dutch": |
|
prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen." |
|
elif language == "French": |
|
prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties." |
|
elif language == 'Polish': |
|
prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części." |
|
elif language == 'Portuguese': |
|
prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes." |
|
elif language == 'Italian': |
|
prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti." |
|
phones = self.model.text2phone.get_phone_string(prompt) |
|
|
|
wav = self.model(phones) |
|
return 48000, float2pcm(wav.cpu().numpy()) |
|
|
|
|
|
meta_model = TTS_Interface() |
|
|
|
iface = gr.Interface(fn=meta_model.read, |
|
inputs=[gr.inputs.Textbox(lines=2, |
|
placeholder="write what you want the synthesis to read here...)", |
|
label="Text input"), |
|
gr.inputs.Dropdown(['English Text', |
|
'German Text', |
|
'Greek Text', |
|
'Spanish Text', |
|
'Finnish Text', |
|
'Russian Text', |
|
'Hungarian Text', |
|
'Dutch Text', |
|
'French Text', |
|
'Polish Text', |
|
'Portuguese Text', |
|
'Italian Text'], type="value", default='English Text', label="Select the Language of the Text"), |
|
gr.inputs.Dropdown(['English Accent', |
|
'German Accent', |
|
'Greek Accent', |
|
'Spanish Accent', |
|
'Finnish Accent', |
|
'Russian Accent', |
|
'Hungarian Accent', |
|
'Dutch Accent', |
|
'French Accent', |
|
'Polish Accent', |
|
'Portuguese Accent', |
|
'Italian Accent'], type="value", default='English Accent', label="Select the Accent of the Speaker"), |
|
gr.inputs.Dropdown(["English Speaker's Voice", |
|
"German Speaker's Voice", |
|
"Greek Speaker's Voice", |
|
"Spanish Speaker's Voice", |
|
"Finnish Speaker's Voice", |
|
"Russian Speaker's Voice", |
|
"Hungarian Speaker's Voice", |
|
"Dutch Speaker's Voice", |
|
"French Speaker's Voice", |
|
"Polish Speaker's Voice", |
|
"Portuguese Speaker's Voice", |
|
"Italian Speaker's Voice"], type="value", default="English Speaker's Voice", label="Select the Voice of the Speaker")], |
|
outputs=gr.outputs.Audio(type="numpy", label=None), |
|
layout="vertical", |
|
title="Anonymous ACL Submission", |
|
theme="default", |
|
allow_flagging="never", |
|
allow_screenshot=False, |
|
article="") |
|
iface.launch(enable_queue=True) |
|
|