# Speech-t5 / app.py
import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
title = "SpeechT5: Voice Conversion"
description = """ This space can "clone voice" and can do "text to speech" . Special credit goes to "Microsoft" and
"Speechbrain". Because without their models its not possible to create this space. Enjoy ! """
article = """ """
device = "cuda" if torch.cuda.is_available() else "cpu"
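# SpeechT5 checkpoints: speech-to-speech (voice conversion), text-to-speech,
# and the shared HiFi-GAN vocoder that turns generated spectrograms into waveforms.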
checkpoint = "microsoft/speecht5_vc"
processor_vc = SpeechT5Processor.from_pretrained(checkpoint)
model_vc = SpeechT5ForSpeechToSpeech.from_pretrained(checkpoint)
checkpoint_tts = "microsoft/speecht5_tts"
processor_tts = SpeechT5Processor.from_pretrained(checkpoint_tts)
model_tts = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_tts)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
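# Speaker-embedding extractors and their output dimensions; SpeechT5 expects 512-dim x-vectors.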
model_embed = {
"speechbrain/spkrec-xvect-voxceleb": 512,
"speechbrain/spkrec-ecapa-voxceleb": 192,
}
checkpoint_embed = "speechbrain/spkrec-xvect-voxceleb"
size_embed = model_embed[checkpoint_embed]
embedding_classifier = EncoderClassifier.from_hparams(source=checkpoint_embed, run_opts={"device": device}, savedir="/tmp/speaker_embed")
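# Example reference voices: any .mp3/.wav file found in the `examples` folder.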
examples_pt = 'examples'
allowed_extensions = ['.mp3', '.wav']
examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}
default_voice = list(examples.keys())[0]
verse = """Hey, how are you doing today?"""
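# Normalise uploaded/recorded audio: int16 -> float, mono, 16 kHz, at most 30 seconds.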
def process_audio(sampling_rate, waveform, target_sr=16000):
    # convert from int16 to float32 in [-1, 1]
    waveform = waveform.astype(np.float32) / 32768.0
# convert to mono if stereo
if len(waveform.shape) > 1:
waveform = librosa.to_mono(waveform.T)
# resample to 16 kHz if necessary
if sampling_rate != target_sr:
waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=target_sr)
# limit to 30 seconds
waveform = waveform[:target_sr * 30]
# make PyTorch tensor
waveform = torch.tensor(waveform)
return waveform
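# Extract an x-vector speaker embedding from a preprocessed waveform.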
def f2embed(waveform, sz):
with torch.no_grad():
        embeddings = embedding_classifier.encode_batch(waveform)
embeddings = F.normalize(embeddings, dim=2)
embeddings = embeddings.squeeze().cpu().numpy()
assert embeddings.shape[0] == sz, embeddings.shape[0]
return embeddings
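# Selecting an example from the dropdown loads the corresponding file into the upload widget.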
def on_voicedropdown(x):
return examples[x]
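# Recompute the speaker embedding whenever a reference voice is loaded.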
def on_voiceload(audio, sz=size_embed):
print("on_voiceload")
# audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
if audio is not None:
sampling_rate, waveform = audio
else:
return np.zeros(sz)
waveform = process_audio(sampling_rate, waveform)
embed = f2embed(waveform, sz)
print("Generated embedding", embed[:5])
return embed
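# Speech-to-speech: re-synthesise the uploaded audio in the reference speaker's voice.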
def voice_clone(audio, speaker_embedding, target_sr=16000):
# audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
if audio is None or speaker_embedding is None:
return (target_sr, np.zeros(0).astype(np.int16))
else:
sampling_rate, waveform = audio
waveform = process_audio(sampling_rate, waveform)
inputs = processor_vc(audio=waveform, sampling_rate=target_sr, return_tensors="pt")
speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
speech = model_vc.generate_speech(inputs["input_values"], speaker_embedding, vocoder=vocoder)
speech = (speech.numpy() * 32767).astype(np.int16)
return (target_sr, speech)
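# Text-to-speech in the reference speaker's voice; token ids are truncated to the model's maximum text length.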
def text_to_speech(text, speaker_embedding, target_sr=16000):
if len(text.strip()) == 0 or speaker_embedding is None:
return (target_sr, np.zeros(0).astype(np.int16))
inputs = processor_tts(text=text, return_tensors="pt")
# limit input length
input_ids = inputs["input_ids"]
input_ids = input_ids[..., :model_tts.config.max_text_positions]
speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
speech = model_tts.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
speech = (speech.numpy() * 32767).astype(np.int16)
return (target_sr, speech)
theme = gr.themes.Monochrome()
with gr.Blocks(theme=theme) as demo:
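    # Holds the speaker embedding of the currently selected reference voice.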
voice_embedding = gr.State(None)
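    # Helpers to enable/disable the action buttons while a job is running.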
def activate(*args):
return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
def deactivate(*args):
return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
gr.Markdown(description)
with gr.Accordion("Voice to clone", open=False) as accordion:
gr.Markdown("Upload target voice...")
with gr.Row(equal_height=True):
voice_upload = gr.Audio(label="Upload target voice", source="upload", type="numpy")
            voice_dropdown = gr.Dropdown(choices=list(examples.keys()), label='Examples', interactive=True)
# TODO: couldn't catch microphone stop event
# mic = gr.Audio(label="Record Speech", source="microphone", type="numpy")
# mic.stop(fn=lambda x: print('mic stop'), inputs=None, outputs=None)
with gr.Row(equal_height=True):
with gr.Column(scale=2):
with gr.Row(equal_height=True):
                text_to_convert = gr.Textbox(verse, label="Text to convert")
voice_to_convert = gr.Audio(label="Upload voice to convert", source="upload", type="numpy")
with gr.Row(equal_height=True):
button_text = gr.Button("Text to speech", interactive=False)
button_audio = gr.Button("Convert audio", interactive=False)
with gr.Row(equal_height=True):
speech = gr.Audio(label="Converted Speech", type="numpy", visible=True, interactive=False)
# actions
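    # Each chain disables the buttons, runs the model, then re-enables the buttons,
    # so a second job cannot be triggered while one is still running.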
kwargs = dict(fn=on_voiceload, inputs=voice_upload, outputs=voice_embedding)
voice_upload.upload(deactivate, [button_text, button_audio], [button_text, button_audio]).\
then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])
voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])
button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
then(fn=text_to_speech, inputs=[text_to_convert, voice_embedding], outputs=speech).\
then(activate, [button_text, button_audio], [button_text, button_audio])
button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
then(fn=voice_clone, inputs=[voice_to_convert, voice_embedding], outputs=speech).\
then(activate, [button_text, button_audio], [button_text, button_audio])
gr.HTML(article)
demo.launch(share=False)