import os
import gradio as gr
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm
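
# The IMS-Toucan v2.5 codebase is not installed as a package; it is cloned into the working
# directory and its pretrained models are downloaded, so that the modules further below can
# be imported straight from it.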
os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
os.system("mv toucan_codebase/* .")
from run_model_downloader import download_models
download_models()
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
from Preprocessing.AudioPreprocessor import AudioPreprocessor
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
from InferenceInterfaces.UtteranceCloner import UtteranceCloner
from Preprocessing.articulatory_features import get_feature_to_index_lookup

def float2pcm(sig, dtype='int16'):
    """
    Convert a float waveform in the range [-1.0, 1.0] to integer PCM samples.
    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
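# Example for float2pcm with the default int16 target: an input of [0.0, 0.5, -0.5]
# maps to [0, 16384, -16384], since abs_max is 2 ** 15 and the offset for a signed
# integer type works out to 0.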

class TTS_Interface:

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
        self.speaker_path_lookup = {
            "Voice 1": "reference_audios/voice_1.flac",
            "Voice 2": "reference_audios/voice_2.wav",
            "Voice 3": "reference_audios/voice_3.wav",
        }
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        self.tf = ArticulatoryCombinedTextFrontend(language="en")
        example_audio, sr = sf.read("reference_audios/clone_me_5.wav")
        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
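        # The aligner is fine-tuned on the reference recording itself below; this presumably
        # tightens the phoneme-to-frame alignment for exactly this audio, which the sentence
        # splitting in split_audio relies on.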
        # fine-tune the aligner
        steps = 10
        tokens = list()  # we need an ID sequence for training rather than a sequence of phonological features
        for vector in self.tf.string_to_tensor(
                "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."):
            if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
                # we don't include word boundaries when performing alignment, since they are not always present in the audio.
                for phone in self.tf.phone_to_vector:
                    if vector.numpy().tolist()[13:] == self.tf.phone_to_vector[phone][13:]:
                        # the leading dimensions encode modifiers, so we ignore them when looking up the phoneme in the ID lookup
                        tokens.append(self.tf.phone_to_id[phone])
                        # this linear search is inefficient, but it only has to run once
                        break
        tokens = torch.LongTensor(tokens).squeeze().to(self.device)
        tokens_len = torch.LongTensor([len(tokens)]).to(self.device)
        mel = self.ap.audio_to_mel_spec_tensor(example_audio, normalize=True).transpose(0, 1).unsqueeze(0).to(self.device)
        mel.requires_grad = True
        mel_len = torch.LongTensor([len(mel[0])]).to(self.device)
        # the actual fine-tuning: a few SGD steps on the aligner's CTC loss
        optim_asr = torch.optim.SGD(self.acoustic_model.parameters(), lr=0.1)
        self.acoustic_model.train()
        for _ in tqdm(list(range(steps))):
            pred = self.acoustic_model(mel)
            loss = self.acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
            optim_asr.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.acoustic_model.parameters(), 1.0)
            optim_asr.step()
        self.acoustic_model.eval()
        # done fine-tuning
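        # The reference recording is now split into its three sentences, and every sentence is
        # pre-synthesized with every one of the three voices, so that the Gradio callback only
        # has to concatenate cached waveforms instead of running synthesis per request.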
        reference_audio = "reference_audios/clone_me_5.wav"
        prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
        text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
        # we don't split on the punctuation marks because we want to retain them.
        self.split_audio(reference_audio, text_list)
        # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
        self.cloned_parts = dict()  # maps (sentence_index, voice_name) to the cloned waveform of that sentence
        for voice_name, voice_path in self.speaker_path_lookup.items():
            self.utterance_cloner.tts.set_utterance_embedding(voice_path)
            for sentence_index, transcription in enumerate(text_list[:3]):
                self.cloned_parts[(sentence_index, voice_name)] = self.utterance_cloner.clone_utterance(
                    path_to_reference_audio=f"split_{sentence_index + 1}.wav",
                    reference_transcription=transcription,
                    clone_speaker_identity=False,
                    lang="en")
    def read(self, _, speaker_1, speaker_2, speaker_3):
        reference_audio = "reference_audios/clone_me_5.wav"
        part_1 = self.cloned_parts[(0, speaker_1)]
        part_2 = self.cloned_parts[(1, speaker_2)]
        part_3 = self.cloned_parts[(2, speaker_3)]
        return "alignment.png", \
               reference_audio, \
               self.speaker_path_lookup["Voice 1"], \
               self.speaker_path_lookup["Voice 2"], \
               self.speaker_path_lookup["Voice 3"], \
               (24000, float2pcm(torch.cat([torch.tensor(part_1), torch.tensor(part_2), torch.tensor(part_3)], dim=0).numpy()))
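
    # split_audio uses the fine-tuned aligner to estimate per-phoneme durations for the whole
    # recording, sums them per sentence, converts those spectrogram-frame counts into sample
    # counts, and writes one wav file per sentence (split_1.wav, split_2.wav, split_3.wav).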
    def split_audio(self, path_to_audio, text_list):
        # extract audio
        audio, sr = sf.read(path_to_audio)
        ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
        norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=audio)
        melspec = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1)
        # extract phonemes
        lines = list()
        self.tf.use_word_boundaries = False  # this causes problems when splitting otherwise
        for segment in text_list:
            if segment.strip() != "":
                lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
        self.tf.use_word_boundaries = True
        # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
        processed_lines = list()
        for index, line in enumerate(lines):
            if index == 0:
                processed_lines.append(line[:-1])
            else:
                processed_lines.append(line[1:-1])
        lines = processed_lines
        joined_phonemes = torch.cat(lines, dim=0)
        # get durations of each phone in audio as average of an ensemble
        alignment_paths = list()
        ensemble_of_durations = list()
        for ensemble in range(1):
            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
                                                                 tokens=joined_phonemes.to(self.device),
                                                                 save_img_for_debug="alignment.png" if ensemble == 0 else None,
                                                                 return_ctc=False))
        for alignment_path in alignment_paths:
            ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
        durations = list()
        for i, _ in enumerate(ensemble_of_durations[0]):
            duration_of_phone = list()
            for ensemble_member in ensemble_of_durations:
                duration_of_phone.append(ensemble_member.squeeze()[i])
            durations.append(sum(duration_of_phone) / len(duration_of_phone))
        # cut audio according to duration sum of each line in transcript
        line_lens = [len(x) for x in lines]
        index = 0
        segment_durations = list()
        for num_phones in line_lens:
            segment_durations.append(sum(durations[index: index + num_phones]))
            index += num_phones
        spec_to_wave_factor = len(norm_wave) / sum(segment_durations)
        wave_segment_lens = [int(x * spec_to_wave_factor) for x in segment_durations]
        start_index = 0
        wave_segments = list()
        for index, segment_len in enumerate(wave_segment_lens):
            if index == len(wave_segment_lens) - 1:
                wave_segments.append(norm_wave[start_index:])
            else:
                wave_segments.append(norm_wave[start_index: start_index + segment_len])
                start_index += segment_len
        # write the audio segments into new files
        for index, wave_segment in enumerate(wave_segments):
            sf.write(f"split_{index + 1}.wav", wave_segment, 16000)
meta_model = TTS_Interface()

article = "<p style='text-align: left'>This is still a work in progress; the models will be exchanged for better ones as soon as they are ready. More diverse training data can help with more exact cloning. For example, we are still trying to incorporate more singing data.</p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

iface = gr.Interface(
    fn=meta_model.read,
    inputs=[gr.inputs.Dropdown(
                ["Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
                type="value",
                default="Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.",
                label="Select which utterance should be customized"),
            gr.inputs.Dropdown(["Voice 1", "Voice 2", "Voice 3"], type="value", default="Voice 1", label="Speaker selection for the first sentence"),
            gr.inputs.Dropdown(["Voice 1", "Voice 2", "Voice 3"], type="value", default="Voice 2", label="Speaker selection for the second sentence"),
            gr.inputs.Dropdown(["Voice 1", "Voice 2", "Voice 3"], type="value", default="Voice 3", label="Speaker selection for the third sentence")],
    outputs=[gr.outputs.Image(label="Alignment of Phonemes to Audio"),
             gr.outputs.Audio(type="file", label="Original Audio"),
             gr.outputs.Audio(type="file", label="Reference-Voice 1"),
             gr.outputs.Audio(type="file", label="Reference-Voice 2"),
             gr.outputs.Audio(type="file", label="Reference-Voice 3"),
             gr.outputs.Audio(type="numpy", label="Customized Audio")],
    layout="vertical",
    title="Speech Customization",
    thumbnail="Utility/toucan.png",
    theme="default",
    allow_flagging="never",
    allow_screenshot=False,
    description="In this demo, a recording is automatically split into individual sentences. Each sentence is then re-synthesized with the exact same prosody, but in a voice that you can choose. This allows customizing existing read speech while retaining as much of the original reading as possible. The reference audio and the reference voices cannot be shown ahead of time, so they are displayed together with the resulting cloned speech.",
    article=article)

# use the queue so that long-running synthesis requests do not time out
iface.launch(enable_queue=True)