import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

title = "SpeechT5: Voice Conversion" |
|
description = """ This space can "clone voice" and can do "text to speech" . Special credit goes to "Microsoft" and |
|
"Speechbrain". Because without their models its not possible to create this space. Enjoy ! """ |
|
|
|
article = """ """ |
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
checkpoint = "microsoft/speecht5_vc" |
|
processor_vc = SpeechT5Processor.from_pretrained(checkpoint) |
|
model_vc = SpeechT5ForSpeechToSpeech.from_pretrained(checkpoint) |
|
checkpoint_tts = "microsoft/speecht5_tts" |
|
processor_tts = SpeechT5Processor.from_pretrained(checkpoint_tts) |
|
model_tts = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_tts) |
|
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") |
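
# Note: the voice-conversion model, the TTS model, and the shared HiFi-GAN vocoder stay on CPU here.
# If a GPU is available, moving them to `device` (e.g. model_vc.to(device)) and moving the inputs
# accordingly would likely speed up inference; CPU inference also works, just more slowly.
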
model_embed = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}
checkpoint_embed = "speechbrain/spkrec-xvect-voxceleb"
size_embed = model_embed[checkpoint_embed]
embedding_classifier = EncoderClassifier.from_hparams(source=checkpoint_embed, run_opts={"device": device}, savedir="/tmp/speaker_embed")
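
# SpeechT5 expects 512-dimensional x-vector speaker embeddings, so the x-vector model is the one
# that matches the checkpoints above; the ECAPA entry in model_embed yields 192-dimensional
# embeddings and is listed only as a reference.
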
examples_pt = 'examples'
allowed_extensions = ['.mp3', '.wav']
examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}
default_voice = list(examples.keys())[0]
verse = """Hey, how are you doing today?"""


def process_audio(sampling_rate, waveform, target_sr=16000):
    # Gradio's numpy audio is int16 PCM; scale to float in [-1, 1].
    waveform = waveform / 32768.0

    # Collapse stereo (or multi-channel) audio to mono.
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # Resample to the 16 kHz rate expected by SpeechT5.
    if sampling_rate != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=target_sr)

    # Keep at most the first 30 seconds.
    waveform = waveform[:target_sr * 30]

    waveform = torch.tensor(waveform)
    return waveform
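
# Rough sanity check (hypothetical values): a one-second 44.1 kHz stereo int16 buffer should come
# back as a mono 16 kHz float tensor of length 16000.
#   wav = process_audio(44100, np.zeros((44100, 2), dtype=np.int16))
#   assert wav.shape == (16000,)
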

def f2embed(waveform, sz):
    # Compute an L2-normalized speaker embedding from the reference waveform.
    with torch.no_grad():
        embeddings = embedding_classifier.encode_batch(waveform)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == sz, embeddings.shape[0]
    return embeddings
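
# The returned embedding is a plain 1-D numpy array of length `sz` (512 for x-vectors), which is
# easy to keep in gr.State and pass between Gradio callbacks.
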

def on_voicedropdown(x):
    return examples[x]

def on_voiceload(audio, sz=size_embed):
    print("on_voiceload")

    if audio is not None:
        sampling_rate, waveform = audio
    else:
        return np.zeros(sz)
    waveform = process_audio(sampling_rate, waveform)
    embed = f2embed(waveform, sz)
    print("Generated embedding", embed[:5])
    return embed

def voice_clone(audio, speaker_embedding, target_sr=16000):
    if audio is None or speaker_embedding is None:
        return (target_sr, np.zeros(0).astype(np.int16))
    else:
        sampling_rate, waveform = audio

    waveform = process_audio(sampling_rate, waveform)
    inputs = processor_vc(audio=waveform, sampling_rate=target_sr, return_tensors="pt")

    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model_vc.generate_speech(inputs["input_values"], speaker_embedding, vocoder=vocoder)

    speech = (speech.numpy() * 32767).astype(np.int16)
    return (target_sr, speech)
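
# Gradio's Audio component accepts a (sample_rate, numpy int16 array) tuple as output, which is why
# the generated float waveform is scaled by 32767 and cast to int16 before returning.
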

def text_to_speech(text, speaker_embedding, target_sr=16000):
    if len(text.strip()) == 0 or speaker_embedding is None:
        return (target_sr, np.zeros(0).astype(np.int16))

    inputs = processor_tts(text=text, return_tensors="pt")

    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model_tts.config.max_text_positions]

    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model_tts.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    speech = (speech.numpy() * 32767).astype(np.int16)
    return (target_sr, speech)
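
# Note that input_ids are truncated to model_tts.config.max_text_positions tokens above, so very
# long prompts are silently cut off rather than raising an error.
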

theme = gr.themes.Monochrome()
with gr.Blocks(theme=theme) as demo:
    voice_embedding = gr.State(None)

    def activate(*args):
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
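
    # activate/deactivate toggle the buttons' `interactive` flag so users cannot queue a second
    # request while an embedding or synthesis step is still running.
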
    gr.Markdown(f"# {title}\n\n{description}")

with gr.Accordion("Voice to clone", open=False) as accordion: |
|
gr.Markdown("Upload target voice...") |
|
with gr.Row(equal_height=True): |
|
voice_upload = gr.Audio(label="Upload target voice", source="upload", type="numpy") |
|
voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True) |
|
|
|
|
|
|
|
|
|
|
|
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                text_to_convert = gr.Textbox(verse, label="Text to convert")
                voice_to_convert = gr.Audio(label="Upload voice to convert", source="upload", type="numpy")
            with gr.Row(equal_height=True):
                button_text = gr.Button("Text to speech", interactive=False)
                button_audio = gr.Button("Convert audio", interactive=False)
            with gr.Row(equal_height=True):
                speech = gr.Audio(label="Converted Speech", type="numpy", visible=True, interactive=False)

    kwargs = dict(fn=on_voiceload, inputs=voice_upload, outputs=voice_embedding)
    voice_upload.upload(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])
    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=text_to_speech, inputs=[text_to_convert, voice_embedding], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=voice_clone, inputs=[voice_to_convert, voice_embedding], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
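
    # Every handler follows the same pattern: disable both buttons, run the work, then re-enable
    # them, so the UI cannot fire overlapping requests while a step is still running.
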
    gr.HTML(article)

demo.launch(share=False)