from huggingface_hub import from_pretrained_keras
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_io as tfio

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt

class MelSpec(layers.Layer):
    """Keras layer that turns raw audio into a dB-scaled mel spectrogram.

    Args:
        frame_length: STFT window length, in samples.
        frame_step: Hop between successive STFT frames, in samples.
        fft_length: FFT size handed to ``tf.signal.stft``. When ``None``,
            TensorFlow uses the smallest power of two enclosing
            ``frame_length``.
        sampling_rate: Sample rate used to build the mel filterbank.
        num_mel_channels: Number of mel bins in the output.
        freq_min: Lower edge of the mel filterbank, in Hz.
        freq_max: Upper edge of the mel filterbank, in Hz.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max

        # The STFT yields ``fft_length // 2 + 1`` frequency bins, not
        # ``frame_length // 2 + 1``. Mirror tf.signal.stft's default (smallest
        # power of two enclosing frame_length) so the filterbank always has
        # the same number of rows as the spectrogram has bins.
        if fft_length is None:
            effective_fft_length = 1 << (int(frame_length) - 1).bit_length()
        else:
            effective_fft_length = int(fft_length)

        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=effective_fft_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        """Compute the dB-scaled mel spectrogram of ``audio``.

        Args:
            audio: Tensor whose last axis has size 1; that axis is squeezed
                away before the STFT is taken.

        Returns:
            Tensor of mel energies on a dB scale (``top_db=80``), with
            ``num_mel_channels`` entries along the last axis.
        """
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )

        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)

        # Project the power spectrogram (squared magnitude) onto the mel
        # filterbank, then convert the result to the dB scale.
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        """Return the layer config, including every constructor argument."""
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config

# Hugging Face Hub repository id of the Keras MelGAN spectrogram-inversion model.
MODEL_REPO_ID = "keras-io/MelGAN-spectrogram-inversion"

# Loaded once at import time and reused by every `predict` call.
model = from_pretrained_keras(MODEL_REPO_ID)

def inference(audio, model):
    """Reconstruct a waveform from the mel spectrogram of an audio file.

    Args:
        audio: Audio source accepted by ``librosa.load`` (the app passes a
            file path).
        model: Object exposing ``predict``; it receives the batched mel
            spectrogram.

    Returns:
        Tuple ``(waveform, predicted, sample_rate)``: the loaded samples, the
        model output with singleton axes squeezed out, and the sample rate
        reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio)

    # Add a trailing channel axis for MelSpec, then a leading batch axis for
    # the model.
    channelled = tf.expand_dims(waveform, axis=-1)
    mel_batch = tf.expand_dims(MelSpec()(channelled), axis=0)

    reconstructed = model.predict(mel_batch, batch_size=1, verbose=0)
    return waveform, reconstructed.squeeze(), sample_rate

def _draw_spectrogram(ax, signal, sr, title):
    """Draw the dB-scaled, linear-frequency STFT spectrogram of ``signal`` on ``ax``."""
    spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
    librosa.display.specshow(spec_db, y_axis='linear', x_axis='time',
                             sr=sr, ax=ax)
    ax.set(title=title)
    ax.label_outer()


def predict(audio):
    """Plot spectrograms of an audio file and of its MelGAN reconstruction.

    Args:
        audio: Path of the uploaded audio file, as delivered by the Gradio
            audio input.

    Returns:
        A matplotlib ``Figure`` with two stacked spectrograms sharing the
        time axis: the original audio on top, the synthesized audio below.

    Raises:
        ValueError: If no audio was provided.
    """
    if audio is None:
        raise ValueError("No audio file provided; please upload an audio file.")

    # Build the figure without pyplot: pyplot keeps a global reference to
    # every figure it creates, which leaks memory in a long-running server,
    # and its "current figure" is shared state between concurrent requests.
    from matplotlib.figure import Figure

    x, x_pred, sr = inference(audio, model)

    fig = Figure(figsize=(10, 8), dpi=120)
    top_ax, bottom_ax = fig.subplots(nrows=2, ncols=1, sharex=True)

    _draw_spectrogram(top_ax, x, sr, 'Spectrogram of Original  sample audio')
    _draw_spectrogram(bottom_ax, x_pred, sr, 'Spectrogram of synthesis  sample audio ')
    return fig

# Single input widget: an uploaded audio file, passed to `predict` as a file path.
inputs = [gr.Audio(source="upload", label='Upload audio file', type="filepath")]

# Example audio files offered in the interface.
examples = ["sample_1.wav", "sample_2.wav"]

# Assemble the interface first, then start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=gr.Plot(),
    examples=examples,
    cache_examples=False,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/audio/melgan_spectrogram_inversion/\">Darshan Deshpande</a>",
)
demo.launch(debug=False, enable_queue=True)