import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *
from hifigan.models import Generator
import torch
import numpy as np
import json
import os
from matplotlib import pyplot as plt

# Adjust vertical spacing between subplots
plt.subplots_adjust(hspace=0.15)  # You can adjust the value as needed
# Adjust the white space (margins) around the plot
plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed

torch.manual_seed(1234)
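
# Full scale of 16-bit PCM audio: the vocoder output in [-1, 1] is multiplied by this value before casting to int16.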
MAX_WAV_VALUE = 32768.0


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict
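

# Builds separate matplotlib figures for the mel spectrogram and the attention alignment.
# Note: the synthesize() callback below returns normalized numpy arrays instead, so this helper is not wired into the UI.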
def plot_spec_align_sep(mel, align):
    plt.figure(figsize=(4, 3))

    fig_mel = plt.figure()
    ax_mel = fig_mel.add_subplot(111)
    fig_mel.tight_layout()
    ax_mel.imshow(mel)
    # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    fig_align = plt.figure()
    ax_align = fig_align.add_subplot(111)
    fig_align.tight_layout()
    ax_align.imshow(align)
    # fig_align.set_title('Alignment', fontsize=12)

    return fig_mel, fig_align


# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()
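# Inference runs on CPU in this demo (map_location="cpu"); the .to('cuda') call above is left commented out.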

# load pre-trained HiFi-GAN model for mel2audio:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
    data = f.read()

json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")

generator = Generator(h).to(device)
state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
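# Weight norm is only needed during training, so it is removed before inference (standard HiFi-GAN practice).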
generator.remove_weight_norm()


def synthesize(text, gst_1, gst_2, gst_3, voc):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
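
    # The three slider values become the GST head scores passed to model.inference;
    # the notes in the UI below recommend that they add up to a value close to 1.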
    # gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
    if voc == 0:
        # mel2wav inference:
        with torch.no_grad():
            y_g_hat = generator(mel_outputs_postnet)
        audio = y_g_hat.squeeze()
        audio = audio * MAX_WAV_VALUE
        audio_numpy = audio.cpu().numpy().astype('int16')
        # audio = vocoder_model.inference(mel_outputs_postnet)
        # audio_numpy = audio.data.cpu().detach().numpy()
    else:
        # Griffin-Lim vocoder synthesis:
        griffin_iters = 60
        taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
                                 sampling_rate=hparams['sampling_rate'])
        mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
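
        # Approximately map the mel spectrogram back to a linear-frequency spectrogram via the mel filterbank,
        # then estimate the phase with the iterative Griffin-Lim algorithm.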
        spec_from_mel_scaling = 60
        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
        audio = audio.squeeze()
        audio_numpy = audio.cpu().numpy()

    # prepare plot for the output:
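    # Flip the mel-bin axis so that low frequencies appear at the bottom of the displayed image.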
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
    mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()

    # normalize numpy arrays between [-1, 1]
    min_val = np.min(mel_outputs_postnet)
    max_val = np.max(mel_outputs_postnet)
    scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
    normalized_mel = 2 * scaled_mel - 1

    min_val = np.min(alignments)
    max_val = np.max(alignments)
    scaled_align = (alignments - min_val) / (max_val - min_val)
    normalized_align = 2 * scaled_align - 1
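
    # Render the synthesized audio (22050 Hz) as an animated waveform video for the output panel.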
    aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
                          bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)

    return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align


with gr.Blocks() as demo:
    gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
    # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
    #             "with Tacotron2</center>")
    with gr.Row():
        with gr.Column(scale=1):
            # , value="Speech synthesis has evolved dramatically since the development of neural architectures capable of generating high quality samples."
            inp = gr.Textbox(label="Input Text")
            clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
            # gr.Markdown("Next, we calibrate the *style token* weights:")
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Tab("Global Style Tokens"):
                        gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
                        gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
                        gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
                with gr.Column(scale=0):
                    with gr.Tab("Vocoder"):
                        vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
                                           container=False, value=0, min_width=300)  # label="Vocoder")
            greet_btn = gr.Button("Synthesize!", scale=1)
        with gr.Column():
            with gr.Tab("Spectrogram"):
                spec_plot = gr.Image(container=False)
            with gr.Tab("Alignment"):
                align_plot = gr.Image(container=False)
            wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)

            def display_video():
                return wave_video
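
    # Wire the "Synthesize!" button to the TTS pipeline; the outputs feed the waveform video and the two image tabs.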
    greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                    outputs=[wave_video, spec_plot, align_plot],
                    api_name="synthesize")

    with gr.Row():
        with gr.Column():
            gr.Examples(examples=infer_from_text_examples,
                        inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                        outputs=[wave_video, spec_plot, align_plot],
                        fn=synthesize,
                        cache_examples=False)
gr.Markdown(""" | |
### Details and Indications | |
This is a Text-to-Speech (TTS) system that consists of two modules: 1) a replicated Tacotron2 model, which generates | |
the spectrogram of the speech corresponding to the input text. And 2) a pre-trained HiFiGAN vocoder that maps | |
spectrograms to a digital waveforms. Global Style Tokens (GST) have been implemented to catch style information from | |
the female speaker with which the model has been trained (see the links below for more information). | |
Please, feel free to play with the GST scores and observe how the synthetic voice spells the input text. | |
Keep in mind that GSTs have been trained in an unsupervised way, so there is no specific control of | |
style attributes. Moreover, try to balance the GST scores by making them add up to a value close to 1. Below or | |
higher than 1 may cause low energy, mispronunciations or distortion. | |
You can choose between the HiFiGAN trained vocoder and the iterative algorithm Griffin-Lim, which does not need | |
to be trained but produces a "robotic" effect. | |
### More Information
The spectrogram generator was adapted and trained from [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2
implementation of the model published in
<a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
<br>
The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
alt="HiFiGAN"></a>
<br>
Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display:
inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
alt="Global Style Tokens"></a>
<br>
""")
demo.launch()