File size: 9,918 Bytes
ff32d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os

import spaces

# Startup bootstrap: fetch the IMS-Toucan toolkit (branch/tag v3.1) into ./toucan_codebase.
os.system("git clone --branch v3.1 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
# Move the checked-out files into the current working directory. Presumably this is what makes
# the toolkit imports further down (run_model_downloader, Preprocessing, Utility, ...) resolvable
# - TODO confirm.
# NOTE(review): the return codes of both os.system calls are ignored, so a failed clone or move
# is not reported at this point.
os.system("mv toucan_codebase/* .")

from run_model_downloader import download_models

# Called before the remaining toolkit imports; presumably fetches the model files that the
# interface created later in this script loads - TODO confirm against run_model_downloader.
download_models()

import gradio as gr
import torch.cuda
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
from Utility.utils import float2pcm

import os

import torch

from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Utility.storage_config import MODELS_DIR


class ControllableInterface(torch.nn.Module):
    """
    Couples a ToucanTTSInterface with a GanWrapper so that a single call can
    select language and accent, shape an artificial voice and synthesize a prompt.
    """

    # Prompts whose phone string is longer than this are replaced by a notice.
    _PHONE_LIMIT = 1800

    # Notice that is read instead of a too long prompt, keyed by language code.
    _TOO_LONG_NOTICES = {
        "deu": "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf.",
        "ell": "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη.",
        "spa": "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes.",
        "fin": "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan.",
        "rus": "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей.",
        "hun": "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre.",
        "nld": "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen.",
        "fra": "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties.",
        "pol": "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części.",
        "por": "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes.",
        "ita": "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti.",
        "cmn": "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。",
        "vie": "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần.",
    }

    # Used for every language without an entry above; it is read with English settings.
    _TOO_LONG_NOTICE_ENG = "Your input was too long. Please try either a shorter text or split it into several parts."

    def __init__(self, available_artificial_voices=1000):
        """
        Create the TTS model and the embedding GAN, both on the CPU.

        Args:
            available_artificial_voices: stored on the instance; not used by this class itself.
        """
        super().__init__()
        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
        embedding_gan_path = os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt")
        self.wgan = GanWrapper(embedding_gan_path, device="cpu")
        self.generated_speaker_embeds = []
        self.available_artificial_voices = available_artificial_voices
        # Last language / accent handed to the model, so repeated calls can skip the setters.
        self.current_language = ""
        self.current_accent = ""

    def _apply_language(self, language, accent):
        """Hand language and accent to the model, each only if it differs from the current one."""
        if self.current_language != language:
            self.model.set_phonemizer_language(language)
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(accent)
            self.current_accent = accent

    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        """
        Synthesize `prompt` with the requested language, accent and voice settings.

        The six slider values are packed into a float32 vector that the GAN wrapper
        uses to modify the embedding selected through `voice_seed`. If the phone
        string of the prompt exceeds the length limit, a notice about the length
        is read instead of the prompt.

        Returns:
            The tuple (sr, wav, fig), reordered from the model's (wav, sr, fig).
        """
        self._apply_language(language, accent)

        # Voice: seed the latent, then shift the embedding with the slider values.
        self.wgan.set_latent(voice_seed)
        slider_values = [emb_slider_1,
                         emb_slider_2,
                         emb_slider_3,
                         emb_slider_4,
                         emb_slider_5,
                         emb_slider_6]
        controllability_vector = torch.tensor(slider_values, dtype=torch.float32)
        self.model.set_utterance_embedding(embedding=self.wgan.modify_embed(controllability_vector))

        # The length check is done on the phone string, not on the raw text.
        if len(self.model.text2phone.get_phone_string(prompt)) > self._PHONE_LIMIT:
            notice = self._TOO_LONG_NOTICES.get(language)
            if notice is None:
                # No notice in the selected language: read the English one as English.
                notice = self._TOO_LONG_NOTICE_ENG
                self._apply_language("eng", "eng")
            prompt = notice

        print(prompt)
        synthesis_options = dict(input_is_phones=False,
                                 duration_scaling_factor=duration_scaling_factor,
                                 pitch_variance_scale=pitch_variance_scale,
                                 energy_variance_scale=energy_variance_scale,
                                 pause_duration_scaling_factor=pause_duration_scaling_factor,
                                 return_plot_as_filepath=True,
                                 prosody_creativity=prosody_creativity,
                                 loudness_in_db=loudness_in_db)
        wav, sr, fig = self.model(prompt, **synthesis_options)
        return sr, wav, fig


# Texts shown by the demo page.
title = "Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"

# Upper end of the voice seed slider; also handed to the interface created below.
available_artificial_voices = 1000

# Dropdown entries have the form "<full language name> Text (<iso code>)";
# the code in parentheses is parsed back out of the selection when a request comes in.
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{full_name} Text ({iso_code})" for iso_code, full_name in iso_to_name.items()]

# One shared interface instance serves every request.
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)


@spaces.GPU
def read(prompt,
         language,
         voice_seed,
         prosody_creativity,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    """
    Gradio callback: synthesize `prompt` and return ((sr, pcm_audio), figure).

    `language` is a dropdown entry whose last space-separated token holds the
    language code in parentheses; that code is used as both language and accent.
    Settings that have no control in the UI are passed as fixed values.
    """
    if torch.cuda.is_available():
        controllable_ui.to("cuda")
        controllable_ui.device = "cuda"
    try:
        # Parsed inside the try block so the interface is moved back even if parsing fails.
        last_token = language.split(" ")[-1]
        iso_code = last_token.split("(")[1].split(")")[0]
        sr, wav, fig = controllable_ui.read(prompt=prompt,
                                            language=iso_code,
                                            accent=iso_code,
                                            voice_seed=voice_seed,
                                            prosody_creativity=prosody_creativity,
                                            duration_scaling_factor=duration_scaling_factor,
                                            pause_duration_scaling_factor=1.,
                                            pitch_variance_scale=pitch_variance_scale,
                                            energy_variance_scale=energy_variance_scale,
                                            emb_slider_1=emb1,
                                            emb_slider_2=emb2,
                                            emb_slider_3=0.,
                                            emb_slider_4=0.,
                                            emb_slider_5=0.,
                                            emb_slider_6=0.,
                                            loudness_in_db=-24.)
    finally:
        # Always move the interface back to the CPU, whether or not synthesis succeeded.
        controllable_ui.to("cpu")
        controllable_ui.device = "cpu"
    pcm_audio = float2pcm(wav)
    return (sr, pcm_audio), fig


# Input widgets, listed in the same order as the parameters of `read`.
_input_components = [
    gr.Textbox(label="Text input",
               lines=2,
               placeholder="write what you want the synthesis to read here...",
               value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep."),
    gr.Dropdown(text_selection,
                label="Select the Language of the Text (type on your keyboard to find it quickly)",
                type="value",
                value='English Text (eng)'),
    gr.Slider(label="Random Seed for the artificial Voice",
              minimum=0, maximum=available_artificial_voices, step=1, value=279),
    gr.Slider(label="Prosody Creativity", minimum=0.0, maximum=0.8, step=0.1, value=0.7),
    gr.Slider(label="Duration Scale", minimum=0.7, maximum=1.3, step=0.1, value=1.0),
    gr.Slider(label="Pitch Variance Scale", minimum=0.5, maximum=1.5, step=0.1, value=1.0),
    gr.Slider(label="Energy Variance Scale", minimum=0.5, maximum=1.5, step=0.1, value=1.0),
    gr.Slider(label="Femininity / Masculinity", minimum=-10.0, maximum=10.0, step=0.1, value=0.0),
    gr.Slider(label="Voice Depth", minimum=-10.0, maximum=10.0, step=0.1, value=0.0),
]

# Output widgets, matching the ((sr, audio), figure) pair returned by `read`.
_output_components = [
    gr.Audio(type="numpy", label="Speech"),
    gr.Image(label="Visualization"),
]

iface = gr.Interface(fn=read,
                     inputs=_input_components,
                     outputs=_output_components,
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)
iface.launch()