# Author: gigant
# Commit: "Create app.py" (2e6a157)
import gradio as gr
import numpy as np
import torchaudio
import torch
from transformers import pipeline
# Speech-recognition pipeline, constructed once at module import and reused
# by every call to the transcription function below.
asr = pipeline(
    task="automatic-speech-recognition",
    model="gigant/romanian-wav2vec2",
)
def f(audio):
    """Transcribe an audio clip with the module-level ``asr`` pipeline.

    Args:
        audio: A ``(rate, sample)`` pair -- the sampling rate in Hz and the
            waveform as an array -- or ``None`` when no clip was supplied.
            NOTE(review): presumably this is what the Gradio ``Audio``
            input with ``type="numpy"`` delivers, i.e. ``sample`` shaped
            ``(samples,)`` or ``(samples, channels)``; confirm against the
            installed Gradio version.

    Returns:
        The ``"text"`` entry of the pipeline result, or an empty string when
        ``audio`` is ``None``.
    """
    # Nothing to transcribe; avoid a TypeError on tuple unpacking below.
    if audio is None:
        return ""

    rate, sample = audio
    waveform = np.asarray(sample)

    # Integer PCM is rescaled to roughly [-1, 1] so the result does not
    # depend on the bit depth of the uploaded file.
    if np.issubdtype(waveform.dtype, np.integer):
        full_scale = np.iinfo(waveform.dtype).max
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Down-mix multi-channel audio to mono. Resample works on the LAST
    # dimension, so a (samples, channels) array would otherwise be resampled
    # across channels instead of across time.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # The pipeline is fed 16 kHz audio, as in the original implementation.
    resampler = torchaudio.transforms.Resample(rate, 16_000)
    sample_16 = resampler(torch.from_numpy(waveform)).numpy()
    return asr(sample_16)["text"]
# Declare the input and output widgets first, then wire them to the
# transcription function and start the app.
audio_input = gr.inputs.Audio(source="upload", type="numpy", label="Audio")
text_output = gr.outputs.Textbox(type="str", label="Predicted text")
app = gr.Interface(fn=f, inputs=audio_input, outputs=text_output)
app.launch()