import gradio as gr |
import soundfile as sf |
from scipy import signal |
import numpy as np |
import torch, torchaudio |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, WhisperForConditionalGeneration, WhisperProcessor |
# Hugging Face Hub identifiers of the checkpoints this app loads.
MODEL_IS = "language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS = "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"

# Seed torch's RNG before any model is instantiated.
torch.random.manual_seed(0)

# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Explicit model/processor pairs for the two Wav2Vec2 checkpoints.
# Only the models are moved to `device`; the processors are left as loaded.
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)

# Pipelines built from the same checkpoint ids, with no device argument.
# NOTE(review): presumably each pipeline loads its own copy of the weights in
# addition to model_is / model_fo above -- confirm whether both are needed.
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)

# Whisper processor and model; the model is not moved to `device` here.
whisperprocessor = WhisperProcessor.from_pretrained(MODEL_WHIS)
whispermodel = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS)
def readwav(a_f):
    """Load an audio file as a 1-D waveform sampled at 16 kHz.

    The file is read with ``soundfile`` as float32. A 2-D result is averaged
    over axis 1 (presumably the channel axis) to give a single track, and any
    sample rate other than 16000 is converted with ``scipy.signal.resample``.

    Args:
        a_f: Path or file-like object accepted by ``soundfile.read``.

    Returns:
        The waveform array, resampled to 16000 Hz when necessary.
    """
    samples, rate = sf.read(a_f, dtype=np.float32)

    # Collapse multi-column audio to one track by averaging the columns.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    if sr_matches := (rate == 16000):
        return samples

    # Keep the duration constant: new length = old length * (16000 / rate).
    target_len = int(samples.shape[0] / rate * 16000)
    return signal.resample(samples, target_len)
def recc(audio_file, model, processor):
    """Transcribe an audio file by arg-max decoding of a model's logits.

    The waveform from ``readwav`` is passed through ``processor``, the first
    entry of ``input_values`` is turned into a single-item batch on the
    module-level ``device``, and the highest-scoring id along the last axis of
    ``model(...).logits`` is decoded back to text.

    Args:
        audio_file: Audio path handed to ``readwav``.
        model: Callable model whose output exposes ``.logits``.
        processor: Processor providing ``__call__`` and ``batch_decode``.

    Returns:
        The first decoded transcription string.
    """
    samples = readwav(audio_file)
    with torch.inference_mode():
        features = processor(samples, sampling_rate=16000).input_values[0]
        # Add the leading batch dimension the model call is given.
        batch = torch.tensor(features, device=device).unsqueeze(0)
        best_ids = torch.argmax(model(batch).logits, dim=-1)
        transcripts = processor.batch_decode(best_ids)
    return transcripts[0]
def whrecc(audio_file, whisperprocessor, whispermodel):
    """Transcribe an audio file with a Whisper processor/model pair.

    Args:
        audio_file: Audio path handed to ``readwav``.
        whisperprocessor: Processor used both to build ``input_features`` and
            to decode the generated ids.
        whispermodel: Model whose ``generate`` method produces the token ids.

    Returns:
        The first decoded transcription, with special tokens skipped.
    """
    samples = readwav(audio_file)
    encoded = whisperprocessor(samples, sampling_rate=16000, return_tensors="pt")
    token_ids = whispermodel.generate(encoded.input_features)
    # NOTE(review): `language_id` is not among the decode parameters I know of;
    # confirm it has an effect (language is normally selected at generate time).
    texts = whisperprocessor.batch_decode(
        token_ids, skip_special_tokens=True, language_id='is'
    )
    return texts[0]
def recis(audio_file):
    """Recognise Icelandic speech with the Wav2Vec2 pipeline.

    Args:
        audio_file: Path of the audio to transcribe, or ``None`` when the
            Gradio audio component is empty.

    Returns:
        The ``'text'`` field of the pipeline result.

    Raises:
        gr.Error: If no audio was supplied, so the UI shows a readable message
            instead of an opaque pipeline failure.
    """
    if audio_file is None:
        raise gr.Error("No audio provided: upload a file or record a clip first.")
    # The pipeline processes the audio in 4-second chunks.
    return pipe_is(audio_file, chunk_length_s=4)['text']
def recfo(audio_file):
    """Recognise Faroese speech with the Wav2Vec2 pipeline.

    Args:
        audio_file: Path of the audio to transcribe, or ``None`` when the
            Gradio audio component is empty.

    Returns:
        The ``'text'`` field of the pipeline result.

    Raises:
        gr.Error: If no audio was supplied, so the UI shows a readable message
            instead of an opaque pipeline failure.
    """
    if audio_file is None:
        raise gr.Error("No audio provided: upload a file or record a clip first.")
    # The pipeline processes the audio in 4-second chunks.
    return pipe_fo(audio_file, chunk_length_s=4)['text']
def recwhis(audio_file):
    """Recognise Icelandic speech with the module-level Whisper model.

    Args:
        audio_file: Path of the audio to transcribe, or ``None`` when the
            Gradio audio component is empty.

    Returns:
        The transcription produced by ``whrecc``.

    Raises:
        gr.Error: If no audio was supplied; otherwise ``None`` would be passed
            down to the audio reader and fail with an unhelpful error.
    """
    if audio_file is None:
        raise gr.Error("No audio provided: upload a file or record a clip first.")
    return whrecc(audio_file, whisperprocessor, whispermodel)
def pick_asrc(au_src):
    """Build the update that switches an audio widget's input source.

    Args:
        au_src: The selected source name coming from the radio control.

    Returns:
        A ``gr.update`` payload setting ``source`` to ``au_src`` and clearing
        the current value.
    """
    # NOTE(review): the gr.Audio components in this file are constructed with
    # `sources=[...]` while this update uses `source=` -- confirm which
    # spelling the installed Gradio version expects.
    audio_update = gr.update(source=au_src, value=None)
    return audio_update
# Two-tab UI: Icelandic (Wav2Vec2 + Whisper) and Faroese (Wav2Vec2 only).
# Components and event handlers are created in a fixed order inside the
# Blocks context; that order is kept deliberately.
with gr.Blocks() as bl:
    gr.Markdown(
        """
        # Speech recognition
        ### Users logged in to a Huggingface account can use each model's normal hosted inference API instead.
        ## * * * * * * * *
        Upload a file for recognition with
        https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
        or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
        - Wav2Vec2 models have no language model (yet), so it can generate non-words.
        - Whisper can hallucinate.
        - Send errors/bugs to caitlinr@ru.is
        """
    )

    with gr.Tabs():
        # --- Icelandic tab: one audio input, two recognisers ---------------
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(
                        ["upload", "microphone"],
                        value="upload",
                        label="Audio input",
                    )
                    audio_file = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                with gr.Column():
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")
                    whisper_output = gr.Textbox(label="Whisper recognition")

            w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
            whi_button = gr.Button("Recognise Icelandic with Whisper")

            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
            # Changing the radio selection sends an update to the audio widget.
            asrc.change(pick_asrc, asrc, audio_file)

        # --- Faroese tab: one audio input, one recogniser ------------------
        # The names asrc / audio_file / chunk_output are rebound here; the
        # Icelandic handlers above were already wired to the earlier objects.
        with gr.TabItem("Faroese"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(
                        ["upload", "microphone"],
                        value="upload",
                        label="Audio input",
                    )
                    audio_file = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                with gr.Column():
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")

            text_button = gr.Button("Recognise Faroese")

            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
            asrc.change(pick_asrc, asrc, audio_file)

bl.launch()