import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
from huggingface_hub import hf_hub_download

# Mel-spectrogram front-end configuration.
sampling_rate: int = 16_000       # audio sample rate in Hz
n_fft: int = 512                  # FFT window length in samples (32 ms at 16 kHz)
hop_length: int = 160             # samples between successive frames (10 ms at 16 kHz)
n_mels: int = 80                  # number of Mel bands
fmin: float = 0.0                 # lowest Mel filter frequency in Hz
fmax: float = sampling_rate / 2   # highest Mel filter frequency in Hz (Nyquist: 8000.0)

def extract_mel_spectrogram(audio) -> np.ndarray:
    """Compute a log-scaled (dB) Mel spectrogram of a waveform.

    The analysis settings come from the module-level constants
    ``sampling_rate``, ``n_fft``, ``hop_length``, ``n_mels``, ``fmin`` and
    ``fmax``.

    Args:
        audio: Waveform handed to ``librosa.feature.melspectrogram`` as ``y``.

    Returns:
        The power Mel spectrogram converted to decibels, using the
        spectrogram's own maximum as the 0 dB reference.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=sampling_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        power=2.0,
    )
    # ref=np.max pins the loudest bin at 0 dB, so values are relative per clip.
    return librosa.power_to_db(mel_power, ref=np.max)

# Fetch the trained checkpoint from the Hugging Face Hub and load it once at
# import time so every request reuses the same model instance.
model_path = hf_hub_download(
    repo_id="kobrasoft/kobraspeech-rnn-cs",
    filename="kobraspeech.17-40.19.keras",
)
model = tf.keras.models.load_model(model_path)

def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of model outputs and convert them to text.

    Args:
        pred: Batch of model outputs. ``pred.shape[0]`` is used as the batch
            size and ``pred.shape[1]`` as the number of frames per item.

    Returns:
        A list holding one ``label_to_string`` result per decoded sequence.
    """
    # Every item is decoded over the full frame axis (no per-item lengths).
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, beam search can be used instead.
    # Accessed through `tf.keras` because this module only imports
    # `tensorflow as tf`; a bare `keras` name is not in scope here.
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # NOTE(review): label_to_string is not defined in the visible part of this
    # file -- confirm it is defined or imported elsewhere.
    return [label_to_string(result) for result in results]

def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* with the loaded model.

    Args:
        audio_path: Path to an audio file, or ``None``/empty when nothing was
            recorded or uploaded.

    Returns:
        The decoded text for the clip, or an empty string when there is no
        audio to transcribe.
    """
    # Gradio passes None when the form is submitted without any audio; return
    # early instead of letting librosa.load fail on it.
    if not audio_path:
        return ""

    # Load the file, resampled to the rate the feature extractor is set up for.
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    if audio.size == 0:
        # A zero-length clip has no frames to run through the model.
        return ""

    features = extract_mel_spectrogram(audio)

    # The model expects a leading batch dimension.
    # NOTE(review): librosa returns the spectrogram as (n_mels, frames);
    # confirm the model was trained with this axis order -- TODO.
    batch = np.expand_dims(features, axis=0)

    prediction = model.predict(batch)

    # A single clip was submitted, so take the only decoded item.
    return decode_batch_predictions(prediction)[0]

# Create Gradio interface
iface = gr.Interface(
    fn=transcribe,
    # The legacy `gr.inputs.*` namespace was removed in Gradio 4; `gr.Audio`
    # with `sources=[...]` is the supported component (requires Gradio >= 4).
    # Both sources are enabled so the UI matches the description below.
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Kobraspeech RNN ASR demo (cs)",
    description="Upload an audio file or record your voice to get the transcription.",
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()