|
import gradio as gr |
|
import soundfile as sf |
|
import numpy as np |
|
import torch, torchaudio |
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor |
|
|
|
# Hugging Face Hub identifiers of the two fine-tuned wav2vec2 checkpoints.
MODEL_IS = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"  # Icelandic
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"  # Faroese
|
|
|
# Seed torch's default RNG, then run on the GPU when CUDA is available and
# fall back to the CPU otherwise.
torch.manual_seed(0)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
|
def _load_recogniser(model_id):
    """Return the CTC model (moved to ``device``) and the processor for *model_id*."""
    ctc_model = Wav2Vec2ForCTC.from_pretrained(model_id).to(device)
    return ctc_model, Wav2Vec2Processor.from_pretrained(model_id)


# Both checkpoints are loaded once at start-up, Icelandic first.
model_is, processor_is = _load_recogniser(MODEL_IS)
model_fo, processor_fo = _load_recogniser(MODEL_FO)
|
|
|
def readwav(a_f):
    """Read an audio file and return it as a mono waveform at 16 kHz.

    Args:
        a_f: Path of the audio file; passed straight to ``sf.read``.

    Returns:
        A one-dimensional NumPy array of samples. Multi-channel input is
        averaged to a single channel, and input whose sample rate is not
        16000 Hz is resampled to 16000 Hz.
    """
    wav, sr = sf.read(a_f, dtype=np.float32)
    if wav.ndim == 2:
        # Two-dimensional data means several channels: average over axis 1.
        wav = wav.mean(1)
    if sr != 16000:
        # ``signal`` was never imported at module level, so this branch used
        # to raise NameError for every file that was not already 16 kHz.
        # Import it here, on the only path that needs it.
        from scipy import signal

        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
|
|
|
def recis(audio_file):
    """Transcribe an audio file with the Icelandic wav2vec2 model.

    Args:
        audio_file: Path of the recording; loaded through ``readwav``.

    Returns:
        The result of ``processor_is.batch_decode`` applied to the
        per-frame argmax of the model's logits (greedy decoding, no
        language model).
    """
    samples = readwav(audio_file)
    with torch.inference_mode():
        features = processor_is(samples, sampling_rate=16000).input_values[0]
        # The model expects a batch, so add a leading dimension of size one.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        token_ids = model_is(batch).logits.argmax(dim=-1)
        return processor_is.batch_decode(token_ids)
|
|
|
def recfo(audio_file):
    """Transcribe an audio file with the Faroese wav2vec2 model.

    Args:
        audio_file: Path of the recording; loaded through ``readwav``.

    Returns:
        The result of ``processor_fo.batch_decode`` applied to the
        per-frame argmax of the model's logits (greedy decoding, no
        language model).
    """
    samples = readwav(audio_file)
    with torch.inference_mode():
        features = processor_fo(samples, sampling_rate=16000).input_values[0]
        # The model expects a batch, so add a leading dimension of size one.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        token_ids = model_fo(batch).logits.argmax(dim=-1)
        return processor_fo.batch_decode(token_ids)
|
|
|
# Demo page: a short description followed by one tab per language. Each tab
# has an audio input, a text output, and a button that runs the recogniser.
# NOTE(review): the nesting below (button under the row, inside the tab) is
# the conventional Gradio layout — confirm it matches the deployed page.
with gr.Blocks() as bl:

    gr.Markdown(
        """
    # W2V2 speech recognition
    Upload a file for recognition with
    https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
    or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

    - For some reason, the huggingface 'Hosted inference API' on the model page does not work, but this demo does.
    - There is no language model (yet), so it can generate non-words.
    """
    )

    with gr.Tabs():
        # Same components, in the same order, for each language tab.
        for tab_label, recogniser in (("Icelandic", recis), ("Faroese", recfo)):
            with gr.TabItem(tab_label):
                with gr.Row():
                    audio_file = gr.Audio(type="filepath")
                    text_output = gr.Textbox()
                text_button = gr.Button("Recognise")
                text_button.click(recogniser, inputs=audio_file, outputs=text_output)

bl.launch()
|
|
|
|