import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, WhisperForConditionalGeneration, WhisperProcessor
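
# Fine-tuned ASR checkpoints on the Hugging Face Hub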
MODEL_IS = "language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS = "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
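
# Load the Wav2Vec2 CTC models and their processors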
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
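
# Chunking-capable ASR pipelines, built from the models loaded above
# rather than loading each checkpoint a second time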
pipe_is = pipeline("automatic-speech-recognition", model=model_is, tokenizer=processor_is.tokenizer,
                   feature_extractor=processor_is.feature_extractor, device=device)
pipe_fo = pipeline("automatic-speech-recognition", model=model_fo, tokenizer=processor_fo.tokenizer,
                   feature_extractor=processor_fo.feature_extractor, device=device)
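
# Whisper processor and model for Icelandic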
whisperprocessor = WhisperProcessor.from_pretrained(MODEL_WHIS)
whispermodel = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS).to(device)

def readwav(a_f):
    """Read an audio file, mix down to mono, and resample to 16 kHz."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
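
# Whole-file Wav2Vec2 recognition. The UI below uses the chunked pipelines
# instead, but this is kept for reference.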
def recc(audio_file, model, processor):
    """Transcribe a whole file at once with a Wav2Vec2 CTC model."""
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]

def whrecc(audio_file, whisperprocessor, whispermodel):
    """Transcribe a whole file with the fine-tuned Icelandic Whisper model."""
    wav = readwav(audio_file)
    input_features = whisperprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features.to(device)
    # Force Icelandic transcription; batch_decode takes no language argument.
    forced_ids = whisperprocessor.get_decoder_prompt_ids(language="icelandic", task="transcribe")
    predicted_ids = whispermodel.generate(input_features, forced_decoder_ids=forced_ids)
    dec = whisperprocessor.batch_decode(predicted_ids, skip_special_tokens=True)
    return dec[0]

def recis(audio_file):
    """Icelandic Wav2Vec2 recognition, chunked into 4-second segments."""
    return pipe_is(audio_file, chunk_length_s=4)['text']

def recfo(audio_file):
    """Faroese Wav2Vec2 recognition, chunked into 4-second segments."""
    return pipe_fo(audio_file, chunk_length_s=4)['text']

def recwhis(audio_file):
    """Icelandic Whisper recognition over the whole file."""
    return whrecc(audio_file, whisperprocessor, whispermodel)

def pick_asrc(au_src):
    """Limit the Audio component to the chosen input source and clear its value."""
    return gr.update(sources=[au_src], value=None)
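
# Build the Gradio interface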
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# Speech recognition
### Users logged in to a Hugging Face account can use each model's normal hosted inference API instead.
## * * * * * * * *
Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
- The Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to caitlinr@ru.is
"""
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"], value="upload", label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")
                    whisper_output = gr.Textbox(label="Whisper recognition")
            w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
            whi_button = gr.Button("Recognise Icelandic with Whisper")
            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
            asrc.change(pick_asrc, asrc, audio_file)
with gr.TabItem("Faroese"):
with gr.Row():
with gr.Column():
asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
with gr.Column():
#whole_output = gr.Textbox(label="whole-file recognition")
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
text_button = gr.Button("Recognise Faroese")
#text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
asrc.change(pick_asrc,asrc,audio_file)
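
# Launch the Gradio app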
bl.launch()