from huggingface_hub import from_pretrained_keras

import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras import layers

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt


class MelSpec(layers.Layer):
    """Keras layer that converts a raw waveform into a log-mel spectrogram."""

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Matrix that projects linear-frequency STFT bins onto the mel scale.
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        # Short-time Fourier transform of the squeezed waveform.
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Power spectrogram mapped through the mel filterbank, then scaled to dB.
        magnitude = tf.abs(stft)
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config


# Load the pre-trained MelGAN spectrogram-inversion model from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/MelGAN-spectrogram-inversion")


def inference(audio, model):
    # Load the uploaded audio file; librosa resamples to 22050 Hz by default,
    # which matches the sampling rate used by the MelSpec layer.
    wav, sr = librosa.load(audio)
    # Compute the log-mel spectrogram and add a batch dimension.
    x = tf.expand_dims(wav, axis=-1)
    mel = MelSpec()(x)
    audio_sample = tf.expand_dims(mel, axis=0)
    # Invert the mel spectrogram back to a waveform with the MelGAN generator.
    pred = model.predict(audio_sample, batch_size=1, verbose=0)
    return wav, pred.squeeze(), sr


def predict(audio):
    x, x_pred, sr = inference(audio, model)
    fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8), dpi=120)

    # Spectrogram of the original waveform.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(x)), ref=np.max)
    librosa.display.specshow(D, y_axis="linear", x_axis="time", sr=sr, ax=ax[0])
    ax[0].set(title="Spectrogram of the original audio sample")
    ax[0].label_outer()

    # Spectrogram of the waveform reconstructed by MelGAN.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(x_pred)), ref=np.max)
    librosa.display.specshow(D, y_axis="linear", x_axis="time", sr=sr, ax=ax[1])
    ax[1].set(title="Spectrogram of the synthesized audio sample")
    ax[1].label_outer()
    return fig


inputs = [
    gr.Audio(source="upload", label="Upload audio file", type="filepath"),
]

examples = ["sample_1.wav", "sample_2.wav"]

gr.Interface(
    fn=predict,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    inputs=inputs,
    examples=examples,
    outputs=gr.Plot(),
    cache_examples=False,
    article='Author: <a href="https://huggingface.co/vumichien">Vu Minh Chien</a>. Based on the keras example from <a href="https://keras.io/examples/audio/melgan_spectrogram_inversion/">Darshan Deshpande</a>',
).launch(debug=False, enable_queue=True)