"""Gradio demo: transcribe audio with the Lara fine-tuned Whisper models."""
import gradio as gr
import torch
from transformers import pipeline
import numpy as np

# Load both fine-tuned Whisper checkpoints once at startup so each request
# only pays for inference, not model loading.
pipe_base = pipeline("automatic-speech-recognition", model="aitor-medrano/whisper-base-lara")
pipe_small = pipeline("automatic-speech-recognition", model="aitor-medrano/whisper-small-lara")


def greet(modelo, grabacion):
    """Transcribe *grabacion* with the selected Whisper pipeline.

    Args:
        modelo: ``"Base"`` selects the base checkpoint; anything else
            (the dropdown's other option, ``"Small"``) selects the small one.
        grabacion: Gradio audio value, a ``(sample_rate, samples)`` tuple
            where ``samples`` is a NumPy array (1-D mono or 2-D stereo).

    Returns:
        ``"<modelo>:<transcription>"`` string.
    """
    sr, y = grabacion
    # Convert the sample array to 32-bit float, as the ASR pipeline expects.
    y = y.astype(np.float32)
    # Down-mix stereo (2-D) recordings to mono — the pipeline takes a 1-D signal.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalize to [-1, 1]; guard against silent input (peak == 0),
    # which would otherwise turn the whole signal into NaNs via 0/0.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    pipe = pipe_base if modelo == "Base" else pipe_small
    return modelo + ":" + pipe({"sampling_rate": sr, "raw": y})["text"]


demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Dropdown(
            ["Base", "Small"],
            label="Modelo",
            info="Modelos de Lara entrenados",
        ),
        gr.Audio(),
    ],
    outputs="text",
)

demo.launch()