import torch
import torchaudio
import gradio as gr
from transformers import pipeline

# Romanian speech-to-text pipeline built on the fine-tuned wav2vec2 checkpoint
asr = pipeline("automatic-speech-recognition", model="gigant/romanian-wav2vec2")


def f(audio):
    # Gradio's "numpy" audio type hands the function a (sample_rate, samples) tuple
    rate, sample = audio
    # wav2vec2 expects 16 kHz audio, so resample the uploaded clip if needed
    resampler = torchaudio.transforms.Resample(rate, 16_000)
    sample_16 = resampler(torch.Tensor(sample)).numpy()
    return asr(sample_16)["text"]


# gr.inputs / gr.outputs were removed in recent Gradio releases;
# the components are now passed to gr.Interface directly
app = gr.Interface(
    fn=f,
    inputs=gr.Audio(sources=["upload"], type="numpy", label="Audio"),
    outputs=gr.Textbox(label="Predicted text"),
)
app.launch()
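
# Optional local sanity check (a sketch, not part of the original demo):
# "sample.wav" is a hypothetical mono recording. torchaudio.load returns a
# (waveform, sample_rate) pair, so squeeze the channel dimension and pass f()
# the same (rate, numpy array) tuple that Gradio would supply.
# waveform, rate = torchaudio.load("sample.wav")
# print(f((rate, waveform.squeeze().numpy())))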