File size: 2,831 Bytes
dd2b02c 56f1ec9 a64c958 56f1ec9 88ec444 6b73084 0e89078 72390b0 56f1ec9 0e89078 d4d3d57 5adac55 56f1ec9 5adac55 56f1ec9 5adac55 509f052 5adac55 a16e474 5adac55 e462854 c793959 5adac55 f9a5e8b d4d3d57 0e89078 d4d3d57 0e89078 5adac55 0e89078 5adac55 79be1a5 0e89078 79be1a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import gradio as gr
import soundfile as sf
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Hugging Face Hub ids of the two fine-tuned wav2vec2 CTC checkpoints.
MODEL_IS = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"

# Seed torch's RNG before anything is loaded.
torch.random.manual_seed(0)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Icelandic: acoustic model (moved to `device`) and its processor.
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)

# Faroese: acoustic model (moved to `device`) and its processor.
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
def readwav(a_f, target_sr: int = 16000):
    """Read an audio file as a mono float32 waveform sampled at ``target_sr``.

    Args:
        a_f: Whatever ``soundfile.read`` accepts as its file argument
            (the callers in this file pass a path string).
        target_sr: Sampling rate, in Hz, of the returned waveform. Defaults
            to 16000, the rate the callers hand to the processors.

    Returns:
        A 1-D NumPy array holding the (possibly down-mixed and resampled)
        samples.
    """
    wav, sr = sf.read(a_f, dtype=np.float32)

    # A 2-D result means several channels (soundfile lays data out as
    # frames x channels): average over axis 1 to get a mono signal.
    if wav.ndim == 2:
        wav = wav.mean(axis=1)

    if sr != target_sr:
        # `signal` was never imported at module level, so the original code
        # raised NameError for every file that was not already at 16 kHz.
        # Importing here keeps the fix local to this function.
        from scipy import signal

        # Number of samples the same duration occupies at the target rate.
        wlen = int(wav.shape[0] / sr * target_sr)
        wav = signal.resample(wav, wlen)

    return wav
def recis(audio_file):
    """Transcribe ``audio_file`` with the Icelandic wav2vec2 model.

    The waveform is loaded through ``readwav``, normalised by
    ``processor_is``, run through ``model_is`` and greedily decoded
    (arg-max over the last logits axis).

    Returns:
        The result of ``processor_is.batch_decode`` for a batch of one.
    """
    samples = readwav(audio_file)
    with torch.inference_mode():
        features = processor_is(samples, sampling_rate=16000).input_values[0]
        # Add the leading batch dimension the model expects.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        token_ids = model_is(batch).logits.argmax(dim=-1)
        # NOTE(review): batch_decode yields one entry per batch item, so the
        # Textbox presumably receives a one-element list rather than a plain
        # string -- confirm whether the first element was intended.
        transcripts = processor_is.batch_decode(token_ids)
    return transcripts
def recfo(audio_file):
    """Transcribe ``audio_file`` with the Faroese wav2vec2 model.

    The waveform is loaded through ``readwav``, normalised by
    ``processor_fo``, run through ``model_fo`` and greedily decoded
    (arg-max over the last logits axis).

    Returns:
        The result of ``processor_fo.batch_decode`` for a batch of one.
    """
    samples = readwav(audio_file)
    with torch.inference_mode():
        features = processor_fo(samples, sampling_rate=16000).input_values[0]
        # Add the leading batch dimension the model expects.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        token_ids = model_fo(batch).logits.argmax(dim=-1)
        # NOTE(review): batch_decode yields one entry per batch item, so the
        # Textbox presumably receives a one-element list rather than a plain
        # string -- confirm whether the first element was intended.
        transcripts = processor_fo.batch_decode(token_ids)
    return transcripts
# Gradio front end: a short description followed by one tab per language.
# Each tab has an audio input, a text output and a button that runs the
# matching recogniser.
# NOTE(review): the original indentation was lost; the buttons are placed
# directly under each row, following the usual Blocks layout -- confirm.
with gr.Blocks() as bl:
    gr.Markdown(
        """
# W2V2 speech recognition
Upload a file for recognition with
https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
- For some reason, the huggingface 'Hosted inference API' on the model page does not work, but this demo does.
- There is no language model (yet), so it can generate non-words.
"""
    )
    with gr.Tabs():
        # Icelandic tab, wired to recis.
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise")
            text_button.click(fn=recis, inputs=audio_file, outputs=text_output)
        # Faroese tab, wired to recfo.
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise")
            text_button.click(fn=recfo, inputs=audio_file, outputs=text_output)

# Start the web server for the demo.
bl.launch()
|