import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Load the Canary ASR pipeline; use the GPU when one is available.
canary_pipe = pipeline(
    "automatic-speech-recognition",
    model="nvidia/canary-1b",
    device=0 if torch.cuda.is_available() else -1,
)

def convert_speech(audio):
    # Gradio's numpy audio format is a (sample_rate, samples) tuple.
    sr, y = audio
    # Convert to float32 and normalize to [-1, 1], guarding against silent input.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    return canary_pipe({"sampling_rate": sr, "raw": y})["text"]

iface = gr.Interface(
    fn=convert_speech,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="text",
)
iface.launch()