import os

# Pin the Gradio version before importing it; reinstalling after the import
# would have no effect on the module that is already loaded in this process.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==2.7.5.2")

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
from Preprocessing.AudioPreprocessor import AudioPreprocessor
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
from run_utterance_cloner import UtteranceCloner
def float2pcm(sig, dtype='int16'):
    """
    Convert a float waveform in [-1.0, 1.0] to integer PCM samples.
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
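
# Example of the conversion above: for int16, abs_max = 32768 and offset = 0, so
#   float2pcm(np.array([0.0, 0.5, -0.5]))  ->  array([0, 16384, -16384], dtype=int16)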
class TTS_Interface:

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.utterance_cloner = UtteranceCloner(device=self.device)
        self.speaker_path_lookup = {
            "Voice 1": "reference_audios/evandro3.wav",
            "Voice 2": "reference_audios/bruno8.wav",
        }
        # the aligner maps phoneme sequences to spectrogram frames, which is
        # what lets us locate sentence boundaries in the reference audio
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        self.tf = ArticulatoryCombinedTextFrontend(language="pt")
    def read(self, prompt, speaker_1, speaker_2, speaker_3):
        if prompt == "Era o fato de que você não tem mão de obra, né?":
            reference_audio = "reference_audios/bruno8.wav"
        elif prompt == "Gente um aviso muito importante: ontem eu fiz todo o trâmite para me inscrever no bbb e na hora que cliquei no botão de enviar, as inscrições estavam encerradas. Então é, lamento informar, mas o jeito vai ser eu licensiar conteúdo para o onlyfans.":
            reference_audio = "reference_audios/evandro3.wav"
        else:
            raise ValueError("no reference audio is available for this prompt")
        # we split after the punctuation marks rather than on them, because we
        # want to retain them in the segments; empty segments are dropped
        text_list = [segment for segment in
                     prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
                     if segment.strip() != ""]
        self.split_audio(reference_audio, text_list)
        # at this point, split_1.wav ... split_n.wav exist, one file per sentence
        speaker_choices = [speaker_1, speaker_2, speaker_3]
        parts = list()
        for index, sentence in enumerate(text_list):
            # if the prompt has fewer than three sentences, the surplus speaker
            # selections are ignored; if it has more, the last one is reused
            speaker = speaker_choices[min(index, len(speaker_choices) - 1)]
            self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker])
            parts.append(self.utterance_cloner.clone_utterance(path_to_reference_audio=f"split_{index + 1}.wav",
                                                               reference_transcription=sentence,
                                                               clone_speaker_identity=False,
                                                               lang="pt"))
        return "alignment.png", \
               reference_audio, \
               self.speaker_path_lookup["Voice 1"], \
               self.speaker_path_lookup["Voice 2"], \
               (48000, float2pcm(torch.cat(parts, dim=0).numpy()))
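    # Example of the sentence split above (hypothetical input): the prompt
    # "Tudo bem? Sim." becomes ["Tudo bem?", " Sim."], i.e. the punctuation
    # marks stay attached to their segments and the trailing empty string
    # produced by split("|") is filtered out.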
    def split_audio(self, path_to_audio, text_list):
        # extract audio
        audio, sr = sf.read(path_to_audio)
        ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
        norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=audio)
        melspec = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1)
        # extract phonemes
        lines = list()
        for segment in text_list:
            if segment.strip() != "":
                lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
        # postprocess phonemes: [~ sentence ~ #] --> [sentence ~],
        # except for the first one, which becomes [~ sentence ~]
        processed_lines = list()
        for index, line in enumerate(lines):
            if index == 0:
                processed_lines.append(line[:-1])
            else:
                processed_lines.append(line[1:-1])
        lines = processed_lines
        joined_phonemes = torch.cat(lines, dim=0)
        # get the duration of each phone in the audio as the average of an ensemble of alignment runs
        alignment_paths = list()
        ensemble_of_durations = list()
        for ensemble in range(2):
            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
                                                                 tokens=joined_phonemes.to(self.device),
                                                                 save_img_for_debug="alignment.png" if ensemble == 1 else None,
                                                                 return_ctc=False))
        for alignment_path in alignment_paths:
            ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
        durations = list()
        for i, _ in enumerate(ensemble_of_durations[0]):
            duration_of_phone = list()
            for ensemble_member in ensemble_of_durations:
                duration_of_phone.append(ensemble_member.squeeze()[i])
            durations.append(sum(duration_of_phone) / len(duration_of_phone))
        # cut the audio according to the duration sum of each line in the transcript
        line_lens = [len(x) for x in lines]
        index = 0
        segment_durations = list()
        for num_phones in line_lens:
            segment_durations.append(sum(durations[index: index + num_phones]))
            index += num_phones
        spec_to_wave_factor = len(norm_wave) / sum(segment_durations)
        wave_segment_lens = [int(x * spec_to_wave_factor) for x in segment_durations]
        start_index = 0
        wave_segments = list()
        for index, segment_len in enumerate(wave_segment_lens):
            if index == len(wave_segment_lens) - 1:
                # the last segment gets the remainder, so rounding leaves no gap
                wave_segments.append(norm_wave[start_index:])
            else:
                wave_segments.append(norm_wave[start_index: start_index + segment_len])
            start_index += segment_len
        # write the audio segments into new files
        for index, wave_segment in enumerate(wave_segments):
            sf.write(f"split_{index + 1}.wav", wave_segment, 16000)
meta_model = TTS_Interface()
article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
iface = gr.Interface(fn=meta_model.read,
                     inputs=[gr.inputs.Dropdown(
                         ["Era o fato de que você não tem mão de obra, né?",
                          "Gente um aviso muito importante: ontem eu fiz todo o trâmite para me inscrever no bbb e na hora que cliquei no botão de enviar, as inscrições estavam encerradas. Então é, lamento informar, mas o jeito vai ser eu licensiar conteúdo para o onlyfans."],
                         type="value",
                         default="Era o fato de que você não tem mão de obra, né?",
                         label="Select which utterance should be customized"),
                         gr.inputs.Dropdown(["Voice 1",
                                             "Voice 2"], type="value", default="Voice 1", label="Speaker selection for the first sentence"),
                         gr.inputs.Dropdown(["Voice 1",
                                             "Voice 2"], type="value", default="Voice 2", label="Speaker selection for the second sentence"),
                         # only two reference voices exist, so the third sentence also defaults to one of them
                         gr.inputs.Dropdown(["Voice 1",
                                             "Voice 2"], type="value", default="Voice 1", label="Speaker selection for the third sentence")],
                     outputs=[gr.outputs.Image(label="Alignment of Phonemes to Audio"),
                              gr.outputs.Audio(type="file", label="Original Audio"),
                              gr.outputs.Audio(type="file", label="Reference-Voice 1"),
                              gr.outputs.Audio(type="file", label="Reference-Voice 2"),
                              gr.outputs.Audio(type="numpy", label="Customized Audio")],
                     layout="vertical",
                     title="IMS Toucan - Speech Customization through Voice Cloning",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False,
                     description="In this demo, an audio clip is automatically split into individual sentences. Each sentence is then re-synthesized with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much of the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they are displayed together with the resulting cloned speech.",
                     article=article)

iface.launch(enable_queue=True)