import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Load the Canary ASR pipeline; use the GPU when one is available.
canary_pipe = pipeline(
    "automatic-speech-recognition",
    model="nvidia/canary-1b",
    device=0 if torch.cuda.is_available() else -1,
)

def convert_speech(audio):
    # Gradio's numpy audio format is a (sample_rate, samples) tuple.
    sr, y = audio
    # Convert to float32 and normalize to [-1, 1], guarding against silent input.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    return canary_pipe({"sampling_rate": sr, "raw": y})["text"]

iface = gr.Interface(
    fn=convert_speech,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="text",
)
iface.launch()