# w2v2asr / app.py
# clr's picture
# Update app.py
# history blame
# 2.68 kB
import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Run inference on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load each model and its processor once, at import time, so recis()/recfo() can reuse them per request.
# NOTE(review): MODEL_IS and MODEL_FO are not defined anywhere in the visible source —
# presumably the Hugging Face model ids of the Icelandic and Faroese checkpoints; confirm
# they are assigned before these lines, otherwise this raises NameError on import.
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
def readwav(a_f, target_sr=16000):
    """Load an audio file as a mono float32 waveform sampled at ``target_sr`` Hz.

    Args:
        a_f: Path (or other object accepted by ``soundfile.read``) of the
            audio file to load.
        target_sr: Sampling rate, in Hz, of the returned waveform. Defaults
            to 16000, the rate the original implementation hard-coded.

    Returns:
        A 1-D NumPy array of samples; resampled only when the file's own
        sampling rate differs from ``target_sr``.
    """
    wav, sr = sf.read(a_f, dtype=np.float32)
    # 2-D data means several channels (soundfile lays it out as
    # frames x channels); average across channels to get mono.
    if wav.ndim == 2:
        wav = wav.mean(1)
    if sr != target_sr:
        # Exact integer arithmetic: the float form ``n / sr * target_sr`` can
        # land just below a whole number and then truncate one sample short.
        wlen = int(wav.shape[0] * target_sr // sr)
        wav = signal.resample(wav, wlen)
    return wav
def recc(audio_file, model, processor):
    """Transcribe one audio file with the given model/processor pair.

    Args:
        audio_file: Path of the audio file; it is loaded through ``readwav``.
        model: Callable model whose output exposes ``.logits``.
        processor: Matching processor, used both to build ``input_values``
            from the waveform and to decode predicted ids back to text.

    Returns:
        The first (and only) decoded transcription string.
    """
    waveform = readwav(audio_file)
    with torch.inference_mode():
        features = processor(waveform, sampling_rate=16000).input_values[0]
        # Add a leading batch dimension of size one and move to the model's device.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        # Greedy decoding: pick the highest-scoring id along the last axis.
        token_ids = model(batch).logits.argmax(dim=-1)
    return processor.batch_decode(token_ids)[0]
def recis(audio_file):
    """Transcribe ``audio_file`` using ``model_is`` and ``processor_is``."""
    return recc(audio_file, model=model_is, processor=processor_is)
def recfo(audio_file):
    """Transcribe ``audio_file`` using ``model_fo`` and ``processor_fo``."""
    return recc(audio_file, model=model_fo, processor=processor_fo)
# Gradio UI: a Markdown header followed by one tab per language, each with an
# audio input, a text output and a button wired to the matching recogniser.
bl = gr.Blocks()
with bl:
    # The header text must be passed to gr.Markdown; left bare inside the
    # ``with`` block it is not valid Python.
    # NOTE(review): the sentence "Upload a file for recognition with / or <url>"
    # appears to be missing a link (presumably to the Icelandic model) between
    # its two lines — restore it if known.
    gr.Markdown(
        "# W2V2 speech recognition\n"
        "Upload a file for recognition with\n"
        "or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h\n"
        "- For some reason, the huggingface 'Hosted inference API' on the model page does not currently work, but this does.\n"
        "- There is no language model (yet), so it can generate non-words.\n"
        "- Send errors/bugs to caitlinr@ru.is\n"
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            # NOTE(review): nesting reconstructed — input and output share a
            # row, with the button placed below it; confirm against the
            # intended layout.
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise Icelandic")
            text_button.click(recis, inputs=audio_file, outputs=text_output)
        with gr.TabItem("Faroese"):
            # The names are reused from the previous tab; each click handler
            # was already bound to its own components above, so rebinding the
            # names here does not affect the Icelandic tab.
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise Faroese")
            text_button.click(recfo, inputs=audio_file, outputs=text_output)