Spaces:

Flux9665
/

SpeechCloning

Running

App Files Files

SpeechCloning / app.py

Flux9665

Update app.py

ab21602 about 1 year ago

raw history blame

No virus

16.9 kB

	import os

	import gradio as gr
	import numpy as np
	import soundfile
	import soundfile as sf
	import torch
	from tqdm import tqdm

	# The IMS-Toucan sources are fetched at start-up and moved into the working
	# directory, so that the project imports further down can be resolved.
	# The commands run in this order; their exit codes are not inspected.
	for shell_command in (
	    "git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase",
	    "mv toucan_codebase/* .",
	):
	    os.system(shell_command)

	# This module only exists once the code base has been moved into place above.
	from run_model_downloader import download_models

	download_models()

	from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
	from Preprocessing.AudioPreprocessor import AudioPreprocessor
	from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
	from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
	from InferenceInterfaces.UtteranceCloner import UtteranceCloner
	from Preprocessing.articulatory_features import get_feature_to_index_lookup


	def float2pcm(sig, dtype='int16'):
	    """
	    Convert a floating point signal to integer PCM samples.

	    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

	    The signal is multiplied by ``2 ** (bits - 1)`` of the target type, shifted
	    so that 0.0 maps to the middle of the target range, clipped to the range
	    the target type can represent and finally cast (which truncates fractions).

	    Args:
	        sig: array-like of floating point samples.
	        dtype: the integer numpy dtype (or its name) to convert to.

	    Returns:
	        A numpy array of the requested integer dtype.

	    Raises:
	        TypeError: if ``sig`` is not of a float type or ``dtype`` is not an
	            integer type.
	    """
	    sig = np.asarray(sig)
	    if sig.dtype.kind != 'f':
	        raise TypeError("'sig' must be a float array")
	    dtype = np.dtype(dtype)
	    if dtype.kind not in 'iu':
	        raise TypeError("'dtype' must be an integer type")
	    i = np.iinfo(dtype)
	    abs_max = 2 ** (i.bits - 1)
	    # 0 for signed types, half of the range for unsigned types
	    offset = i.min + abs_max
	    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


	class TTS_Interface:
	    """
	    Backend of the demo: splits one reference recording into sentences and
	    pre-computes a clone of every sentence for each of the reference voices.

	    All expensive work happens once in the constructor; ``read`` only picks
	    the pre-computed parts that were requested and concatenates them.
	    """

	    # recording that gets customized, and the text that is spoken in it
	    REFERENCE_AUDIO = "reference_audios/clone_me_5.wav"
	    PROMPT = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."

	    # voice label as used in the UI --> suffix of the attribute that caches the clones of that voice
	    _VOICE_ATTRIBUTE_SUFFIXES = {
	        "Voice 1": "voice_1",
	        "Voice 2": "voice_2",
	        "Voice 3": "voice_3",
	    }

	    def __init__(self):
	        """
	        Load the models, adapt the aligner to the reference recording, split
	        the recording into sentences and cache the clones of all sentences
	        in all voices as ``self.part_<n>_voice_<m>``.
	        """
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"

	        self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
	        self.speaker_path_lookup = {
	            "Voice 1": "reference_audios/voice_1.flac",
	            "Voice 2": "reference_audios/voice_2.wav",
	            "Voice 3": "reference_audios/voice_3.wav",
	        }
	        self.acoustic_model = Aligner()
	        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
	        self.acoustic_model = self.acoustic_model.to(self.device)
	        self.dc = DurationCalculator(reduction_factor=1)
	        self.tf = ArticulatoryCombinedTextFrontend(language="en")
	        example_audio, sr = sf.read(self.REFERENCE_AUDIO)
	        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000)

	        # the aligner has to be adapted before it is used for splitting
	        self._finetune_aligner(example_audio, self.PROMPT, steps=10)

	        text_list = self._split_into_sentences(self.PROMPT)
	        self.split_audio(self.REFERENCE_AUDIO, text_list)
	        # at this point, split_1.wav, split_2.wav and split_3.wav should exist.

	        self._precompute_clones(text_list)

	    @staticmethod
	    def _split_into_sentences(text):
	        """
	        Split ``text`` after every '.', '?' and '!'.

	        We don't split on the punctuation marks themselves because we want to
	        retain them, so a marker is inserted behind each of them and the text
	        is split on the marker. Text that ends in punctuation therefore
	        yields a trailing empty string.
	        """
	        marker = "\\|"  # a literal backslash followed by a pipe
	        for punctuation_mark in ".?!":
	            text = text.replace(punctuation_mark, punctuation_mark + marker)
	        return text.split(marker)

	    def _text_to_token_ids(self, text):
	        """
	        Turn ``text`` into a list of phone IDs.

	        We need an ID sequence for training rather than a sequence of
	        phonological features, so every feature vector is looked up in the
	        phone inventory of the text frontend.
	        """
	        word_boundary_index = get_feature_to_index_lookup()["word-boundary"]
	        token_ids = list()
	        for vector in self.tf.string_to_tensor(text):
	            if vector[word_boundary_index] == 0:
	                # we don't include word boundaries when performing alignment, since they are not always present in audio.
	                # Only the entries from index 13 onward are compared, the leading ones are for modifiers.
	                # NOTE(review): the original comment spoke of 12 modifier dimensions while the slice skips 13 - confirm.
	                features = vector.numpy().tolist()[13:]
	                for phone in self.tf.phone_to_vector:
	                    if features == self.tf.phone_to_vector[phone][13:]:
	                        token_ids.append(self.tf.phone_to_id[phone])
	                        # this is terribly inefficient, but it's fine
	                        break
	        return token_ids

	    def _finetune_aligner(self, audio, transcript, steps=10):
	        """
	        Run ``steps`` SGD updates of the aligner with its CTC loss on the
	        single pair of ``audio`` and ``transcript``, then switch the aligner
	        back to evaluation mode.
	        """
	        tokens = torch.LongTensor(self._text_to_token_ids(transcript)).squeeze().to(self.device)
	        tokens_len = torch.LongTensor([len(tokens)]).to(self.device)
	        mel = self.ap.audio_to_mel_spec_tensor(audio, normalize=True).transpose(0, 1).unsqueeze(0).to(self.device)
	        mel.requires_grad = True
	        mel_len = torch.LongTensor([len(mel[0])]).to(self.device)
	        # actual fine-tuning starts here
	        optim_asr = torch.optim.SGD(self.acoustic_model.parameters(), lr=0.1)
	        self.acoustic_model.train()
	        for _ in tqdm(range(steps)):
	            pred = self.acoustic_model(mel)
	            loss = self.acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
	            optim_asr.zero_grad()
	            loss.backward()
	            torch.nn.utils.clip_grad_norm_(self.acoustic_model.parameters(), 1.0)
	            optim_asr.step()
	        self.acoustic_model.eval()

	    def _precompute_clones(self, text_list):
	        """
	        Clone each of the first three sentences in each reference voice and
	        store the results in the attributes ``part_<n>_voice_<m>``.

	        ``split_<n>.wav`` has to exist for n in 1..3 when this is called.
	        """
	        for part_number in (1, 2, 3):
	            for voice, path_to_voice in self.speaker_path_lookup.items():
	                self.utterance_cloner.tts.set_utterance_embedding(path_to_voice)
	                cloned_part = self.utterance_cloner.clone_utterance(path_to_reference_audio=f"split_{part_number}.wav",
	                                                                    reference_transcription=text_list[part_number - 1],
	                                                                    clone_speaker_identity=False,
	                                                                    lang="en")
	                setattr(self, f"part_{part_number}_{self._VOICE_ATTRIBUTE_SUFFIXES[voice]}", cloned_part)

	    def _cloned_part(self, part_number, speaker):
	        """
	        Return the cached clone of sentence ``part_number`` in voice ``speaker``.

	        Raises:
	            ValueError: if ``speaker`` is not one of the known voice labels.
	        """
	        suffix = self._VOICE_ATTRIBUTE_SUFFIXES.get(speaker)
	        if suffix is None:
	            raise ValueError(f"Unknown voice {speaker!r}, expected one of {list(self._VOICE_ATTRIBUTE_SUFFIXES)}")
	        return getattr(self, f"part_{part_number}_{suffix}")

	    def read(self, _, speaker_1, speaker_2, speaker_3):
	        """
	        Assemble the customized utterance from the cached parts.

	        Args:
	            _: the selected utterance, ignored since there is only one.
	            speaker_1: voice label for the first sentence.
	            speaker_2: voice label for the second sentence.
	            speaker_3: voice label for the third sentence.

	        Returns:
	            A tuple of the path to the alignment image, the path to the
	            original audio, the paths to the three reference voices and a
	            pair of 24000 and the concatenated parts as int16 PCM samples.

	        Raises:
	            ValueError: if one of the voice labels is unknown.
	        """
	        selected_parts = [self._cloned_part(part_number, speaker)
	                          for part_number, speaker in enumerate((speaker_1, speaker_2, speaker_3), start=1)]
	        customized = torch.cat([torch.tensor(part) for part in selected_parts], dim=0).numpy()

	        return "alignment.png", \
	               self.REFERENCE_AUDIO, \
	               self.speaker_path_lookup["Voice 1"], \
	               self.speaker_path_lookup["Voice 2"], \
	               self.speaker_path_lookup["Voice 3"], \
	               (24000, float2pcm(customized))

	    def split_audio(self, path_to_audio, text_list):
	        """
	        Cut the audio into one file per text segment.

	        The phones of all segments are aligned to the audio, the durations of
	        the phones of each segment are summed up and the waveform is cut
	        proportionally. The pieces are written to ``split_<n>.wav`` with n
	        starting at 1 and the alignment plot is saved as ``alignment.png``.

	        Args:
	            path_to_audio: path to the audio file that should be split.
	            text_list: the text segments in the order in which they are
	                spoken, segments that are empty or only whitespace are skipped.
	        """
	        # extract audio
	        audio, sr = sf.read(path_to_audio)
	        ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
	        norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=audio)
	        melspec = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1)

	        # extract phonemes
	        self.tf.use_word_boundaries = False  # this causes problems when splitting otherwise
	        try:
	            lines = [self.tf.string_to_tensor(segment, handle_missing=False).squeeze()
	                     for segment in text_list if segment.strip() != ""]
	        finally:
	            # switch them back on even if the phonemization fails
	            self.tf.use_word_boundaries = True

	        # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
	        lines = [line[:-1] if index == 0 else line[1:-1] for index, line in enumerate(lines)]
	        joined_phonemes = torch.cat(lines, dim=0)

	        # get durations of each phone in audio as average of an ensemble
	        ensemble_size = 1
	        alignment_paths = list()
	        for ensemble in range(ensemble_size):
	            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
	                                                                 tokens=joined_phonemes.to(self.device),
	                                                                 save_img_for_debug="alignment.png" if ensemble == 0 else None,
	                                                                 return_ctc=False))
	        ensemble_of_durations = [self.dc(torch.LongTensor(alignment_path), vis=None).squeeze()
	                                 for alignment_path in alignment_paths]
	        durations = list()
	        for i, _ in enumerate(ensemble_of_durations[0]):
	            duration_of_phone = [ensemble_member.squeeze()[i] for ensemble_member in ensemble_of_durations]
	            durations.append(sum(duration_of_phone) / len(duration_of_phone))

	        # cut audio according to duration sum of each line in transcript
	        segment_durations = list()
	        first_phone = 0
	        for num_phones in [len(line) for line in lines]:
	            segment_durations.append(sum(durations[first_phone: first_phone + num_phones]))
	            first_phone += num_phones
	        spec_to_wave_factor = len(norm_wave) / sum(segment_durations)
	        wave_segment_lens = [int(x * spec_to_wave_factor) for x in segment_durations]
	        start_index = 0
	        wave_segments = list()
	        for index, segment_len in enumerate(wave_segment_lens):
	            if index == len(wave_segment_lens) - 1:
	                # the last segment takes whatever is left, so rounding does not drop samples
	                wave_segments.append(norm_wave[start_index:])
	            else:
	                wave_segments.append(norm_wave[start_index: start_index + segment_len])
	                start_index += segment_len

	        # write the audio segments into new files
	        for index, wave_segment in enumerate(wave_segments):
	            sf.write(f"split_{index + 1}.wav", wave_segment, 16000)


	# Building the backend runs all of the model loading and pre-computation.
	meta_model = TTS_Interface()
	article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

	# The only utterance that is offered, it is both the single choice and the default.
	demo_utterance = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."

	# Inputs: the utterance followed by one voice selection per sentence.
	# Outputs: the alignment plot, the original audio, the three reference voices and the customized audio.
	iface = gr.Interface(
	    fn=meta_model.read,
	    inputs=[
	        gr.inputs.Dropdown([demo_utterance],
	                           type="value",
	                           default=demo_utterance,
	                           label="Select which utterance should be customized"),
	        *(gr.inputs.Dropdown(["Voice 1", "Voice 2", "Voice 3"], type="value", default=preselected_voice, label=caption)
	          for preselected_voice, caption in (
	              ("Voice 1", "Speaker selection for the first sentence"),
	              ("Voice 2", "Speaker selection for the second sentence"),
	              ("Voice 3", "Speaker selection for the third sentence"),
	          )),
	    ],
	    outputs=[
	        gr.outputs.Image(label="Alignment of Phonemes to Audio"),
	        *(gr.outputs.Audio(type="file", label=caption)
	          for caption in ("Original Audio", "Reference-Voice 1", "Reference-Voice 2", "Reference-Voice 3")),
	        gr.outputs.Audio(type="numpy", label="Customized Audio"),
	    ],
	    layout="vertical",
	    title="Speech Customization",
	    thumbnail="Utility/toucan.png",
	    theme="default",
	    allow_flagging="never",
	    allow_screenshot=False,
	    description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they will be displayed together with the resulting cloned speech.",
	    article=article,
	)
	iface.launch(enable_queue=True)