import os

# Pin the Gradio version before importing it, otherwise the 2.x input/output API used below may be unavailable.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==2.7.5.2")

import gradio as gr
import numpy as np
import torch

from InferenceInterfaces.Meta_FastSpeech2 import Meta_FastSpeech2


def float2pcm(sig, dtype='int16'):
    """
    Convert a float waveform in the range [-1.0, 1.0] to PCM integer samples.
    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


class TTS_Interface:

    def __init__(self):
        # Load the multilingual multispeaker model once; use the GPU if one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = Meta_FastSpeech2(device=self.device)

    def read(self, prompt, language, path_to_audio):
        # Map the human-readable dropdown entries to the language IDs the model expects.
        language_id_lookup = {
            "English"  : "en",
            "German"   : "de",
            "Greek"    : "el",
            "Spanish"  : "es",
            "Finnish"  : "fi",
            "Russian"  : "ru",
            "Hungarian": "hu",
            "Dutch"    : "nl",
            "French"   : "fr"
            }
        self.model.set_language(language_id_lookup[language])
        # If a reference recording was provided, condition the model on its speaker embedding;
        # fall back to the default voice if the recording cannot be processed.
        if path_to_audio is not None:
            try:
                self.model.set_utterance_embedding(path_to_audio)
            except RuntimeError:
                pass
        wav = self.model(prompt)
        # Gradio's numpy audio output expects a (sample rate, integer waveform) tuple.
        return 48000, float2pcm(wav.cpu().numpy())


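# Instantiate the model once at startup so the weights are loaded a single time and reused across requests.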
meta_model = TTS_Interface()
article = "<p style='text-align: left'>This is still a work in progress; the models will be replaced with better ones as soon as they are ready. All of these languages are spoken by a single model. Speakers can be transferred across languages. More languages will be added soon.</p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

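# Gradio 2.x interface: a text prompt, a language dropdown, and an optional microphone recording
# whose voice the synthesis should imitate; the output is the synthesized audio as a numpy waveform.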
iface = gr.Interface(fn=meta_model.read,
                     inputs=[gr.inputs.Textbox(lines=2, placeholder="write what you want the synthesis to read here...", label=" "),
                             gr.inputs.Dropdown(['English',
                                                 'German',
                                                 'Greek',
                                                 'Spanish',
                                                 'Finnish',
                                                 'Russian',
                                                 'Hungarian',
                                                 'Dutch',
                                                 'French'], type="value", default='English', label="Language Selection"),
                             gr.inputs.Audio(source="microphone",
                                             optional=True,
                                             label="Make the TTS imitate your Voice (optional, press once to start recording and again to stop)",
                                             type="filepath")],
                     outputs=gr.outputs.Audio(type="numpy", label=None),
                     layout="vertical",
                     title="IMS Toucan Multilingual Multispeaker Demo",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False,
                     article=article)
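# Queue incoming requests so long-running synthesis does not hit Gradio's request timeout.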
iface.launch(enable_queue=True)