File size: 6,457 Bytes
68a11d5
 
 
 
8561621
 
185fc75
 
 
 
6cd09aa
 
 
 
 
 
 
 
 
 
 
62d7978
6cd09aa
c69f215
 
6cd09aa
 
 
 
185fc75
 
 
 
6cd09aa
185fc75
 
6cd09aa
185fc75
 
6cd09aa
 
 
 
 
 
 
185fc75
6cd09aa
fa66f8f
6cd09aa
 
 
 
 
 
 
 
 
 
 
 
 
 
fa66f8f
6cd09aa
 
 
 
 
 
 
 
 
 
 
 
 
99b69ce
6a66802
 
 
 
 
62d7978
6cd09aa
 
 
 
 
 
 
 
 
 
 
5ad4ca1
1b1179a
6cd09aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a66802
 
 
 
 
 
 
 
 
 
 
62d7978
 
 
 
 
 
 
6a66802
62d7978
6a66802
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os

from run_model_downloader import download_models

# Fetch the pretrained checkpoints on first start; the presence of the Meta
# checkpoint is used as the marker that the download already happened.
_META_CHECKPOINT_PATH = "Models/ToucanTTS_Meta/best.pt"
_checkpoint_present = os.path.exists(_META_CHECKPOINT_PATH)
if not _checkpoint_present:
    download_models()
import gradio as gr
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
from Utility.utils import float2pcm

import os

import torch

from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Utility.storage_config import MODELS_DIR


class ControllableInterface(torch.nn.Module):
    """Couple a ToucanTTS synthesizer with the embedding GAN that shapes its voice.

    The GAN wrapper turns a seed plus six control values into an utterance
    embedding, which is handed to the TTS model before every synthesis call.
    """

    # Prompts whose phoneme string is longer than this are rejected by `read`.
    MAX_PHONE_STRING_LENGTH = 1800

    def __init__(self, available_artificial_voices=1000, device=None):
        """Load the TTS model and the embedding GAN onto one device.

        Args:
            available_artificial_voices: Stored on the instance; not used
                further inside this class.
            device: Torch device string used for both models. When ``None``,
                ``"cuda"`` is used if CUDA is available, otherwise ``"cpu"``,
                so the class also works on hosts without a GPU.
        """
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        initial_language = "eng"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta", language=initial_language)
        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        # Remember which language the loaded model was built for, so that the
        # first request in that language does not trigger a redundant reload.
        self.current_language = initial_language
        self.current_accent = ""

    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        """Synthesize ``prompt`` with a GAN-generated voice.

        Args:
            prompt: Text to synthesize.
            language: Language identifier passed to ``ToucanTTSInterface``;
                a change of language rebuilds the TTS model.
            accent: Accepted for interface compatibility; this method does
                not apply it.
            voice_seed: Passed to ``GanWrapper.set_latent``.
            duration_scaling_factor: Forwarded to the TTS model.
            pause_duration_scaling_factor: Forwarded to the TTS model.
            pitch_variance_scale: Forwarded to the TTS model.
            energy_variance_scale: Forwarded to the TTS model.
            emb_slider_1 .. emb_slider_6: Six control values, packed into a
                float32 tensor and passed to ``GanWrapper.modify_embed``.
            loudness_in_db: Forwarded to the TTS model.

        Returns:
            ``(sr, wav, fig)`` as produced by the TTS model call, or ``None``
            when the phoneme string of ``prompt`` exceeds
            ``MAX_PHONE_STRING_LENGTH`` characters. Callers must handle the
            ``None`` case before unpacking.
        """
        if self.current_language != language:
            # A different language needs a freshly built model; keep it on the
            # same device as everything else instead of a hard-coded one.
            self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta", language=language)
            self.current_language = language

        self.wgan.set_latent(voice_seed)
        controllability_vector = torch.tensor([emb_slider_1,
                                               emb_slider_2,
                                               emb_slider_3,
                                               emb_slider_4,
                                               emb_slider_5,
                                               emb_slider_6], dtype=torch.float32)
        embedding = self.wgan.modify_embed(controllability_vector)
        self.model.set_utterance_embedding(embedding=embedding)

        # Refuse overly long inputs before running the expensive synthesis.
        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > self.MAX_PHONE_STRING_LENGTH:
            return None

        print(prompt)
        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
                                  return_plot_as_filepath=True,
                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig


# Static texts displayed on the demo page.
title = "Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"

# Upper bound of the voice seed slider; also handed to the synthesis wrapper.
available_artificial_voices = 1000

# Dropdown entries are rendered as "<language name> Text (<iso code>)"; the
# callback later recovers the ISO code from the trailing parentheses.
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{full_name} Text ({iso_code})" for iso_code, full_name in iso_to_name.items()]

# Single shared synthesis backend used by the Gradio callback.
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)


def read(prompt,
         language,
         voice_seed,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    """Gradio callback: synthesize ``prompt`` and return audio plus a plot.

    Args:
        prompt: Text to synthesize.
        language: Dropdown label of the form ``"<name> Text (<iso code>)"``.
        voice_seed: Seed selecting the artificial voice.
        duration_scaling_factor: Forwarded to the synthesis backend.
        pitch_variance_scale: Forwarded to the synthesis backend.
        energy_variance_scale: Forwarded to the synthesis backend.
        emb1: First voice control value.
        emb2: Second voice control value.

    Returns:
        ``((sr, pcm_wav), fig)`` for the two output components, or
        ``(None, None)`` when no language is selected or the backend rejects
        the prompt (it returns ``None`` for prompts that are too long).
    """
    if not language:
        # Nothing selected in the dropdown; there is no ISO code to extract.
        return None, None

    # The ISO code sits inside the parentheses of the last space-separated token.
    iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]

    with torch.no_grad():
        result = controllable_ui.read(prompt=prompt,
                                      language=iso_code,
                                      accent=iso_code,
                                      voice_seed=voice_seed,
                                      duration_scaling_factor=duration_scaling_factor,
                                      pause_duration_scaling_factor=1.,
                                      pitch_variance_scale=pitch_variance_scale,
                                      energy_variance_scale=energy_variance_scale,
                                      emb_slider_1=emb1,
                                      emb_slider_2=emb2,
                                      emb_slider_3=0.,
                                      emb_slider_4=0.,
                                      emb_slider_5=0.,
                                      emb_slider_6=0.,
                                      loudness_in_db=-24.)

    if result is None:
        # The backend refused the prompt; unpacking None would raise a
        # TypeError, so clear both outputs instead.
        return None, None

    sr, wav, fig = result
    return (sr, float2pcm(wav)), fig

# Input components, listed in the order of the parameters of `read`.
_interface_inputs = [
    gr.Textbox(lines=2,
               placeholder="write what you want the synthesis to read here...",
               value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
               label="Text input"),
    gr.Dropdown(text_selection,
                type="value",
                value='English Text (eng)',
                label="Select the Language of the Text (type on your keyboard to find it quickly)"),
    gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
              value=279,
              label="Random Seed for the artificial Voice"),
    gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth"),
]

# Output components, matching the two values returned by `read`.
_interface_outputs = [
    gr.Audio(type="numpy", label="Speech"),
    gr.Image(label="Visualization"),
]

iface = gr.Interface(fn=read,
                     inputs=_interface_inputs,
                     outputs=_interface_outputs,
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)

# Start serving the demo.
iface.launch()