from huggingface_hub import from_pretrained_keras

import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras import layers

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt


class MelSpec(layers.Layer):
    """Keras layer that converts a raw waveform into a log-mel spectrogram."""

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Matrix that projects linear-frequency STFT bins onto the mel scale.
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        # Short-time Fourier transform of the squeezed waveform.
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Power spectrogram mapped through the mel filterbank, then scaled to dB.
        magnitude = tf.abs(stft)
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config


# Load the pre-trained MelGAN spectrogram-inversion model from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/MelGAN-spectrogram-inversion")


def inference(audio, model):
    # Load the uploaded audio file; librosa resamples to 22050 Hz by default,
    # which matches the sampling rate used by the MelSpec layer.
    wav, sr = librosa.load(audio)
    # Compute the log-mel spectrogram and add a batch dimension.
    x = tf.expand_dims(wav, axis=-1)
    mel = MelSpec()(x)
    audio_sample = tf.expand_dims(mel, axis=0)
    # Invert the mel spectrogram back to a waveform with the MelGAN generator.
    pred = model.predict(audio_sample, batch_size=1, verbose=0)
    return wav, pred.squeeze(), sr


def predict(audio):
    x, x_pred, sr = inference(audio, model)
    fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8), dpi=120)

    # Spectrogram of the original waveform.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(x)), ref=np.max)
    librosa.display.specshow(D, y_axis="linear", x_axis="time", sr=sr, ax=ax[0])
    ax[0].set(title="Spectrogram of the original audio sample")
    ax[0].label_outer()

    # Spectrogram of the waveform reconstructed by MelGAN.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(x_pred)), ref=np.max)
    librosa.display.specshow(D, y_axis="linear", x_axis="time", sr=sr, ax=ax[1])
    ax[1].set(title="Spectrogram of the synthesized audio sample")
    ax[1].label_outer()
    return fig


inputs = [
    gr.Audio(source="upload", label="Upload audio file", type="filepath"),
]

examples = ["sample_1.wav", "sample_2.wav"]

gr.Interface(
    fn=predict,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    inputs=inputs,
    examples=examples,
    outputs=gr.Plot(),
    cache_examples=False,
    article='Author: <a href="https://huggingface.co/vumichien">Vu Minh Chien</a>. Based on the keras example from <a href="https://keras.io/examples/audio/melgan_spectrogram_inversion/">Darshan Deshpande</a>',
).launch(debug=False, enable_queue=True)