# w2v2asr / app.py
import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
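# Hugging Face model IDs for the Icelandic and Faroese wav2vec 2.0 CTC checkpoints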
MODEL_IS = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
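# Use the GPU when available; greedy CTC decoding also runs on CPU, just more slowly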
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
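# Read an audio file as mono float32 and resample it to the 16 kHz rate the models expect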
def readwav(a_f):
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
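# Transcribe one file with greedy CTC decoding: take the argmax token at each frame,
# then let the processor's batch_decode collapse repeats and blanks.
# Example (hypothetical filename): recc("sample.wav", model_is, processor_is) -> transcription string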
def recc(audio_file,model,processor):
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]
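# Per-language wrappers wired to the Gradio buttons below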
def recis(audio_file):
    return recc(audio_file, model_is, processor_is)
def recfo(audio_file):
    return recc(audio_file, model_fo, processor_fo)
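# Build the Gradio UI: a short description plus one tab per language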
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# W2V2 speech recognition

Upload a file for recognition with
https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

- For some reason, the Hugging Face 'Hosted inference API' on the model pages does not currently work, but this Space does.
- There is no language model (yet), so the output can contain non-words.
- Send errors/bugs to caitlinr@ru.is
        """
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise Icelandic")
            text_button.click(recis, inputs=audio_file, outputs=text_output)
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise Faroese")
            text_button.click(recfo, inputs=audio_file, outputs=text_output)
bl.launch()
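# When run locally (e.g. `python app.py`), Gradio serves the demo at http://127.0.0.1:7860 by default.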