import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from faster_whisper import WhisperModel

MODEL_IS = "language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS = "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"

torch.random.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)

# Chunking pipelines, used for the main recognition path
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)

wdevice = "cuda" if torch.cuda.is_available() else "cpu"
whm_is = WhisperModel(model_size_or_path=MODEL_WHIS, device=wdevice)


def readwav(a_f):
    """Read an audio file as mono float32 at 16 kHz."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav


def recc(audio_file, model, processor):
    """Whole-file recognition with a Wav2Vec2 CTC model."""
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]


def whrecc(audio_file, lang, wmodel):
    """Recognition with a faster-whisper model."""
    xcps, info = wmodel.transcribe(
        audio=audio_file, language=lang, no_repeat_ngram_size=5
    )
    txts = [xcp.text for xcp in xcps]
    txt = ' '.join(txts)
    return txt


def recis(audio_file):
    #single_output = recc(audio_file,model_is,processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output


def recfo(audio_file):
    #single_output = recc(audio_file,model_fo,processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output


def recwhis(audio_file):
    wh_output = whrecc(audio_file, "is", whm_is)
    return wh_output


def pick_asrc(au_src):
    # Restrict the Audio component to the chosen input source and clear any loaded clip
    return gr.update(sources=[au_src], value=None)


bl = gr.Blocks()

with bl:
    gr.Markdown(
        """
# Speech recognition

### Users logged in to a Hugging Face account can use each model's normal hosted inference API instead.

## * * * * * * * *

Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

- The Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to caitlinr@ru.is
"""
    )

    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(
                        ["upload", "microphone"], value="upload", label="Audio input"
                    )
                    audio_file = gr.Audio(
                        sources=["upload", "microphone"], type="filepath"
                    )
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
                    whisper_output = gr.Textbox(label="Whisper recognition")
            w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
            whi_button = gr.Button("Recognise Icelandic with Whisper")
            #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
            asrc.change(pick_asrc, asrc, audio_file)
        with gr.TabItem("Faroese"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(
                        ["upload", "microphone"], value="upload", label="Audio input"
                    )
                    audio_file = gr.Audio(
                        sources=["upload", "microphone"], type="filepath"
                    )
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
            text_button = gr.Button("Recognise Faroese")
            #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
            asrc.change(pick_asrc, asrc, audio_file)

bl.launch()