import os

os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
os.system("mv toucan_codebase/* .")

from run_model_downloader import download_models

# Download the pretrained checkpoints used below.
download_models()

import gradio as gr
import numpy as np
import torch

from InferenceInterfaces.UtteranceCloner import UtteranceCloner
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator


def float2pcm(sig, dtype='int16'):
    """
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


class TTS_Interface:
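    """
    Wraps IMS-Toucan's UtteranceCloner for one fixed German poem: prosody (durations,
    pitch, energy) is extracted once from a reference reading in __init__, and read()
    re-synthesizes the poem with user-controlled lengthening, pauses and pitch raises.
    """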

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
        self.utterance_cloner.tts.set_language("de")
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
        reference_audio = "reference_audios/2.wav"
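        # Extract phone durations, pitch and energy from the reference reading once,
        # so they can be reused and manipulated for every request.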
        self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
        self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
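        # Print the phone sequence with indices; the hard-coded indices in read() refer to these positions.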
        print(self.phones)
        for index, phone in enumerate(self.phones):
            print(index, phone)

        # Cloned speech: resynthesize the poem with the prosody extracted from the reference, for both voices.
        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
        self.current_voice = "male"
        self.cloned_speech_male = self.utterance_cloner.tts(self.phones,
                                                            view=False,
                                                            durations=self.duration,
                                                            pitch=self.pitch,
                                                            energy=self.energy,
                                                            input_is_phones=True).cpu().numpy()
        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
        self.current_voice = "female"
        self.cloned_speech_female = self.utterance_cloner.tts(self.phones,
                                                              view=False,
                                                              durations=self.duration,
                                                              pitch=self.pitch,
                                                              energy=self.energy,
                                                              input_is_phones=True).cpu().numpy()

        # Regular speech: plain prose-style TTS readings of the poem without the reference prosody.
        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
        self.current_voice = "male"
        self.reg_speech_male = self.utterance_cloner.tts(
            "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
            view=False).cpu().numpy()
        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
        self.current_voice = "female"
        self.reg_speech_female = self.utterance_cloner.tts(
            "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
            view=False).cpu().numpy()

    def read(self, _, speaker, lengthening, pause_dur, pitch_up):
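        # The transcript dropdown has a single fixed entry, so its value (the first argument) is ignored.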

        if speaker == "Female Voice" and self.current_voice != "female":
            self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
            self.current_voice = "female"
        elif speaker == "Male Voice" and self.current_voice != "male":
            self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
            self.current_voice = "male"

        duration = self.duration.clone()
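        # Work on copies so the baseline prosody extracted in __init__ stays untouched across requests.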
        # lengthening
        lengthening_candidates = [
            # ('f', 33),
            # ('l', 34),
            ('ʏ', 35),
            ('ç', 36),
            # ('t', 37),
            ('ɪ', 38),
            # ('ɡ', 39),
            ('ə', 40),
            ('n', 41),

            # ('z', 79),
            ('ɑ', 80),
            # ('ə', 81),
            ('n', 82),

            # ('b', 103),
            ('e', 104),
            # ('p', 105),
            # ('t', 106),
            ('ə', 107)
            ]

        for lengthening_candidate in lengthening_candidates:
            duration[lengthening_candidate[1]] = duration[lengthening_candidate[1]] + lengthening

        # pauses
        pause_candidates = [('~', 42),
                            ('~', 83),
                            ('~', 108)]

        for pause_candidate in pause_candidates:
            duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur

        pitch = self.pitch.clone()
        # pitch raise

        pitch_candidates = [
            # ('k', 44),
            ('y', 45),
            ('l', 46),
            ('ə', 47),
            ('ʃ', 49),
            ('a', 50),
            ('t', 51),
            # ('ə', 52),
            # ('n', 53),

            ('a', 85),
            ('l', 86),

            ('v', 118),
            ('ɛ', 119),
            ('l', 120),
            # ('ə', 121),
            # ('n', 122)
            ]

        for pitch_candidate in pitch_candidates:
            pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up

        # Counterpart to the raise: lower the pitch on the phones of "Flüchtigen" at the end of
        # the first verse by the same amount, so the raise on the following verse stands out.
        fixme = [('f', 33),
                 ('l', 34),
                 ('ʏ', 35),
                 ('ç', 36),
                 ('t', 37),
                 ('ɪ', 38),
                 ('ɡ', 39),
                 ('ə', 40),
                 ('n', 41)
                 ]
        for pitch_candidate in fixme:
            pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)

        # Resynthesize with the adjusted durations and pitch; the energy contour is reused unchanged.
        manipulated_speech = self.utterance_cloner.tts(self.phones,
                                                       view=False,
                                                       durations=duration,
                                                       pitch=pitch,
                                                       energy=self.energy,
                                                       input_is_phones=True).cpu()

        if self.current_voice == "female":
            cloned_speech = self.cloned_speech_female
            reg_speech = self.reg_speech_female
        else:
            cloned_speech = self.cloned_speech_male
            reg_speech = self.reg_speech_male

        return (24000, float2pcm(reg_speech)), (24000, float2pcm(cloned_speech)), (24000, float2pcm(manipulated_speech.numpy()))


# Instantiate the model once at startup; this precomputes the cloned and regular readings for both voices.
poem_model = TTS_Interface()
article = "<p style='text-align: left'>This is still a work in progress; models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning and more controllability. For example, we are still trying to incorporate more singing data.</p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

iface = gr.Interface(fn=poem_model.read,
                     inputs=[gr.inputs.Dropdown([
                         "Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild."],
                         type="value",
                         default="Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild.",
                         label="Poem Transcript"),
                         gr.inputs.Dropdown(["Female Voice", "Male Voice"],
                                            type="value",
                                            default="Female Voice",
                                            label="Select a Speaker"),
                         gr.inputs.Slider(minimum=0, maximum=4, step=1, default=2, label="Lengthening on verse end"),
                         gr.inputs.Slider(minimum=0, maximum=20, step=1, default=8, label="Length of Pause after verse end"),
                         gr.inputs.Slider(minimum=-0.4, maximum=0.4, step=0.01, default=0.2, label="Raise Pitch on new verse")
                         ],
                     outputs=[gr.outputs.Audio(type="numpy", label="Poem read with prose reading"),
                              gr.outputs.Audio(type="numpy", label="Poem cloned from a reference"),
                              gr.outputs.Audio(type="numpy", label="Poem after human-in-the-loop adjustments")],
                     layout="vertical",
                     title="PoeticTTS - Customizing Poetry",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False,
                     description="Customize how a poem is read by a text-to-speech system with intuitive high-level controls. You can control phrasing markers to go from prose style syntactic phrasing to verse aware poetry style phrasing with the sliders below.",
                     article=article)
iface.launch(enable_queue=True)