import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, WhisperForConditionalGeneration, WhisperProcessor

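# Hugging Face Hub IDs of the fine-tuned ASR checkpoints used by this demo.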
MODEL_IS = "language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS = "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

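# Load each Wav2Vec2 model and its processor once at startup.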
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)

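# Chunking ASR pipelines built from the models and processors already loaded
# above, so each checkpoint is not fetched and held in memory a second time.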
pipe_is = pipeline(
    task="automatic-speech-recognition",
    model=model_is,
    tokenizer=processor_is.tokenizer,
    feature_extractor=processor_is.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
)
pipe_fo = pipeline(
    task="automatic-speech-recognition",
    model=model_fo,
    tokenizer=processor_fo.tokenizer,
    feature_extractor=processor_fo.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
)

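# Whisper is used directly via its processor and generate(), not wrapped in a
# pipeline.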
whisperprocessor = WhisperProcessor.from_pretrained(MODEL_WHIS)
whispermodel = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS).to(device)


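# Read an audio file as mono float32 at 16 kHz, the sampling rate all three
# models expect.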
def readwav(a_f):
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        # resample to 16 kHz, keeping the original duration
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav


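# Single-pass Wav2Vec2 recognition with greedy CTC decoding. The whole file
# goes through the model in one forward pass, so this suits short clips only;
# the chunking pipelines above handle longer recordings.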
def recc(audio_file, model, processor):
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]


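# Whisper recognition: encode audio to log-mel features, generate token IDs,
# then decode them to text.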
def whrecc(audio_file, whisperprocessor, whispermodel):
    wav = readwav(audio_file)
    input_features = whisperprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features.to(device)
    with torch.inference_mode():
        predicted_ids = whispermodel.generate(input_features)
    # skip_special_tokens drops Whisper's task/language/timestamp tokens; the
    # checkpoint is fine-tuned on Icelandic, so no language needs to be forced.
    dec = whisperprocessor.batch_decode(predicted_ids, skip_special_tokens=True)
    return dec[0]


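# Button handlers for the Wav2Vec2 tabs. chunk_length_s=4 makes the pipeline
# split long recordings into 4-second windows, so files of arbitrary length
# can be transcribed.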
def recis(audio_file):
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    return chunk_output


def recfo(audio_file):
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    return chunk_output


def recwhis(audio_file):
    wh_output = whrecc(audio_file, whisperprocessor, whispermodel)
    return wh_output


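# Handler for the input-source radio button: swap the Audio component between
# file upload and microphone input, clearing any previously loaded clip.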
def pick_asrc(au_src):
    # gr.Audio in this Gradio version takes a list of allowed sources
    return gr.update(sources=[au_src], value=None)


bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# Speech recognition

### Users logged in to a Hugging Face account can use each model's normal hosted inference API instead.
## * * * * * * * *

Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

- Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to caitlinr@ru.is
"""
    )

    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"], value="upload", label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")
                    whisper_output = gr.Textbox(label="Whisper recognition")
                    w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
                    whi_button = gr.Button("Recognise Icelandic with Whisper")

            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
            asrc.change(pick_asrc, asrc, audio_file)

        with gr.TabItem("Faroese"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"], value="upload", label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")
                    text_button = gr.Button("Recognise Faroese")

            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
            asrc.change(pick_asrc, asrc, audio_file)

bl.launch()