File size: 16,903 Bytes
2cb106d
 
 
 
52c413f
2cb106d
 
52c413f
2cb106d
f360342
 
 
 
 
 
 
 
2cb106d
 
 
f360342
52c413f
2cb106d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52c413f
f360342
2cb106d
 
 
 
 
 
 
 
 
 
52c413f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cb106d
 
 
 
 
 
0d40f57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52c413f
 
 
0d40f57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cb106d
42616ca
 
 
 
 
464096d
2cb106d
 
 
 
 
 
 
 
 
 
52c413f
2cb106d
 
 
52c413f
 
2cb106d
 
 
 
 
 
 
 
 
 
 
 
 
52c413f
2cb106d
 
52c413f
2cb106d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52c413f
 
2cb106d
52c413f
2cb106d
 
 
 
 
 
 
 
 
 
 
 
42616ca
 
 
2cb106d
 
52c413f
2cb106d
 
 
 
5468186
2cb106d
52c413f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import os

import gradio as gr
import numpy as np
import soundfile
import soundfile as sf
import torch
from tqdm import tqdm

# Fetch the IMS-Toucan v2.3 sources at startup and move their content into the current working
# directory, so that the toolkit packages imported further down can be resolved from here.
# NOTE(review): the exit codes of both shell commands are ignored; a failed clone or move would
# presumably only surface later as an ImportError - confirm whether that is intended.
os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
os.system("mv toucan_codebase/* .")

# presumably run_model_downloader is part of the cloned repository, which is why this import
# has to come after the clone - TODO confirm
from run_model_downloader import download_models

# make the model files available before any model is constructed below
download_models()

from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
from Preprocessing.AudioPreprocessor import AudioPreprocessor
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
from InferenceInterfaces.UtteranceCloner import UtteranceCloner
from Preprocessing.articulatory_features import get_feature_to_index_lookup


def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal into integer PCM samples.

    The signal is scaled by half of the value range of the target integer type, shifted so that
    unsigned targets are centered in their range, clipped to the representable range and finally
    cast to the target type.

    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    :param sig: array-like of floating point samples
    :param dtype: name or numpy dtype of the desired integer type
    :return: numpy array of the requested integer dtype
    :raises TypeError: if sig is not a float array or dtype is not an integer type
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")

    target_type = np.dtype(dtype)
    if target_type.kind not in ('i', 'u'):
        raise TypeError("'dtype' must be an integer type")

    limits = np.iinfo(target_type)
    half_range = 2 ** (limits.bits - 1)
    # zero for signed types, the middle of the value range for unsigned types
    center = limits.min + half_range
    scaled = samples * half_range + center
    return scaled.clip(limits.min, limits.max).astype(target_type)


class TTS_Interface:
    """
    Backend of the speech customization demo.

    On construction the aligner is fine-tuned for a few steps on the demo recording, the recording is
    cut into one audio file per sentence, and every sentence is re-synthesized once with each of the
    reference voices. read() afterwards only selects the pre-computed waves that match the requested
    speakers and concatenates them.
    """

    # the recording that gets customized and its transcript
    _REFERENCE_AUDIO = "reference_audios/clone_me_5.wav"
    _PROMPT = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
    # the n-th label (counting from 1) belongs to the attributes part_<k>_voice_<n>
    _VOICE_LABELS = ("Voice 1", "Voice 2", "Voice 3")

    def __init__(self):
        """
        Load all models, fine-tune the aligner, split the demo recording and pre-compute every
        combination of sentence and reference voice.

        The results are stored in the attributes part_<k>_voice_<n> for k and n in 1..3.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
        self.speaker_path_lookup = {
            "Voice 1": "reference_audios/voice_1.flac",
            "Voice 2": "reference_audios/voice_2.wav",
            "Voice 3": "reference_audios/voice_3.wav",
            }
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        self.tf = ArticulatoryCombinedTextFrontend(language="en")
        example_audio, sr = soundfile.read(self._REFERENCE_AUDIO)
        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000, )

        self._finetune_aligner(example_audio, steps=10)

        text_list = self._PROMPT.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
        # we don't split on the punctuation marks because we want to retain them.

        self.split_audio(self._REFERENCE_AUDIO, text_list)
        # at this point, split_1.wav, split_2.wav and split_3.wav should exist.

        # every sentence is cloned with every reference voice, so that read() never has to run a model
        for part_number in (1, 2, 3):
            for voice_number, voice_label in enumerate(self._VOICE_LABELS, start=1):
                self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[voice_label])
                cloned_wave = self.utterance_cloner.clone_utterance(path_to_reference_audio=f"split_{part_number}.wav",
                                                                    reference_transcription=text_list[part_number - 1],
                                                                    clone_speaker_identity=False,
                                                                    lang="en")
                setattr(self, f"part_{part_number}_voice_{voice_number}", cloned_wave)

    def _finetune_aligner(self, example_audio, steps):
        """
        Run a few SGD steps with the CTC loss of the aligner on the demo recording and its transcript.

        :param example_audio: the samples of the demo recording as returned by soundfile.read
        :param steps: number of optimization steps
        """
        word_boundary_index = get_feature_to_index_lookup()["word-boundary"]
        tokens = list()  # we need an ID sequence for training rather than a sequence of phonological features
        for vector in self.tf.string_to_tensor(self._PROMPT):
            if vector[word_boundary_index] == 0:
                # we don't include word boundaries when performing alignment, since they are not always present in audio.
                phone_features = vector.numpy().tolist()[13:]
                for phone in self.tf.phone_to_vector:
                    if phone_features == self.tf.phone_to_vector[phone][13:]:
                        # the leading dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup
                        # NOTE(review): an earlier comment spoke of 12 modifier dimensions while the slice skips 13 - confirm which is intended
                        tokens.append(self.tf.phone_to_id[phone])
                        # this is terribly inefficient, but it's fine
                        break
        tokens = torch.LongTensor(tokens).squeeze().to(self.device)
        tokens_len = torch.LongTensor([len(tokens)]).to(self.device)
        mel = self.ap.audio_to_mel_spec_tensor(example_audio, normalize=True).transpose(0, 1).unsqueeze(0).to(self.device)
        mel.requires_grad = True
        mel_len = torch.LongTensor([len(mel[0])]).to(self.device)
        # actual fine-tuning starts here
        optim_asr = torch.optim.SGD(self.acoustic_model.parameters(), lr=0.1)
        self.acoustic_model.train()
        for _ in tqdm(range(steps)):
            pred = self.acoustic_model(mel)
            loss = self.acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
            optim_asr.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.acoustic_model.parameters(), 1.0)
            optim_asr.step()
        self.acoustic_model.eval()

    def _cloned_part(self, part_number, speaker):
        """
        Return the pre-computed wave of the given sentence in the given voice.

        :param part_number: 1-based index of the sentence
        :param speaker: one of the labels in _VOICE_LABELS
        :raises ValueError: if speaker is not a known voice label
        """
        if speaker not in self._VOICE_LABELS:
            raise ValueError(f"Unknown speaker {speaker!r}, expected one of {list(self._VOICE_LABELS)}")
        voice_number = self._VOICE_LABELS.index(speaker) + 1
        return getattr(self, f"part_{part_number}_voice_{voice_number}")

    def read(self, _, speaker_1, speaker_2, speaker_3):
        """
        Assemble the customized utterance from the pre-computed sentence waves.

        :param _: unused, only present because the interface passes the selected utterance first
        :param speaker_1: voice label for the first sentence
        :param speaker_2: voice label for the second sentence
        :param speaker_3: voice label for the third sentence
        :return: tuple of the path to the alignment plot, the path to the original audio, the paths to the
                 three reference voices and a (sampling rate, int16 wave) pair with the customized audio
        :raises ValueError: if one of the speakers is not a known voice label
        """
        part_1 = self._cloned_part(1, speaker_1)
        part_2 = self._cloned_part(2, speaker_2)
        part_3 = self._cloned_part(3, speaker_3)

        customized_wave = torch.cat([torch.tensor(part_1), torch.tensor(part_2), torch.tensor(part_3)], dim=0).numpy()

        # presumably 48000 is the sampling rate of the synthesized waves - TODO confirm
        return "alignment.png", \
               self._REFERENCE_AUDIO, \
               self.speaker_path_lookup["Voice 1"], \
               self.speaker_path_lookup["Voice 2"], \
               self.speaker_path_lookup["Voice 3"], \
               (48000, float2pcm(customized_wave))

    def split_audio(self, path_to_audio, text_list):
        """
        Cut an audio file into one file per transcript segment, using the aligner to find the boundaries.

        The resulting segments are written to split_1.wav, split_2.wav, ... in the current working
        directory with a sampling rate of 16000. As a side effect, the alignment is plotted to alignment.png.

        :param path_to_audio: path to the audio file that should be split
        :param text_list: the transcript of the audio, one entry per segment; blank entries are skipped
        """
        # extract audio
        audio, sr = sf.read(path_to_audio)
        ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
        norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=audio)
        melspec = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1)

        # extract phonemes
        lines = list()
        self.tf.use_word_boundaries = False  # this causes problems when splitting otherwise
        try:
            for segment in text_list:
                if segment.strip() != "":
                    lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
        finally:
            # the frontend is shared, so the setting has to be restored even if phonemization fails
            self.tf.use_word_boundaries = True

        # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
        processed_lines = list()
        for index, line in enumerate(lines):
            if index == 0:
                processed_lines.append(line[:-1])
            else:
                processed_lines.append(line[1:-1])
        lines = processed_lines
        joined_phonemes = torch.cat(lines, dim=0)

        # get durations of each phone in audio as average of an ensemble
        alignment_paths = list()
        ensemble_of_durations = list()
        for ensemble in range(1):
            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
                                                                 tokens=joined_phonemes.to(self.device),
                                                                 save_img_for_debug="alignment.png" if ensemble == 0 else None,
                                                                 return_ctc=False))
        for alignment_path in alignment_paths:
            ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
        durations = list()
        for i, _ in enumerate(ensemble_of_durations[0]):
            duration_of_phone = list()
            for ensemble_member in ensemble_of_durations:
                duration_of_phone.append(ensemble_member.squeeze()[i])
            durations.append(sum(duration_of_phone) / len(duration_of_phone))

        # cut audio according to duration sum of each line in transcript
        line_lens = [len(x) for x in lines]
        index = 0
        segment_durations = list()
        for num_phones in line_lens:
            segment_durations.append(sum(durations[index: index + num_phones]))
            index += num_phones
        # durations are counted in spectrogram frames, this factor converts them into numbers of samples
        spec_to_wave_factor = len(norm_wave) / sum(segment_durations)
        wave_segment_lens = [int(x * spec_to_wave_factor) for x in segment_durations]
        start_index = 0
        wave_segments = list()
        for index, segment_len in enumerate(wave_segment_lens):
            if index == len(wave_segment_lens) - 1:
                # the last segment takes everything that is left, so no samples get lost to rounding
                wave_segments.append(norm_wave[start_index:])
            else:
                wave_segments.append(norm_wave[start_index: start_index + segment_len])
                start_index += segment_len

        # write the audio segments into new files
        for index, wave_segment in enumerate(wave_segments):
            sf.write(f"split_{index + 1}.wav", wave_segment, 16000)


# Constructing the interface object already does all of the heavy lifting (see TTS_Interface.__init__),
# the web interface below only selects among pre-computed results.
meta_model = TTS_Interface()
article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

_demo_utterance = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
_demo_description = "In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they will be displayed together with the resulting cloned speech."


def _speaker_dropdown(preselected_voice, ordinal):
    """
    Build the dropdown that selects the voice for one sentence.

    :param preselected_voice: the voice label that is selected initially
    :param ordinal: position of the sentence as a word, used in the label of the dropdown
    """
    return gr.inputs.Dropdown(["Voice 1", "Voice 2", "Voice 3"],
                              type="value",
                              default=preselected_voice,
                              label=f"Speaker selection for the {ordinal} sentence")


# one dropdown for the utterance (there is only a single choice) followed by one dropdown per sentence
_input_components = [gr.inputs.Dropdown([_demo_utterance],
                                        type="value",
                                        default=_demo_utterance,
                                        label="Select which utterance should be customized"),
                     _speaker_dropdown("Voice 1", "first"),
                     _speaker_dropdown("Voice 2", "second"),
                     _speaker_dropdown("Voice 3", "third")]

# the order has to match the order of the values returned by TTS_Interface.read
_output_components = [gr.outputs.Image(label="Alignment of Phonemes to Audio"),
                      gr.outputs.Audio(type="file", label="Original Audio")]
for _voice_number in (1, 2, 3):
    _output_components.append(gr.outputs.Audio(type="file", label=f"Reference-Voice {_voice_number}"))
_output_components.append(gr.outputs.Audio(type="numpy", label="Customized Audio"))

iface = gr.Interface(fn=meta_model.read,
                     inputs=_input_components,
                     outputs=_output_components,
                     layout="vertical",
                     title="Speech Customization",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False,
                     description=_demo_description,
                     article=article)
iface.launch(enable_queue=True)