Spaces:

akhaliq
/

espnet2_asr

Runtime error

App Files Files Community

espnet2_asr / app.py

Ahsen Khaliq

Update app.py

b31525e almost 3 years ago

raw

history blame

No virus

1.7 kB

	import time
	import torch
	import string
	from espnet_model_zoo.downloader import ModelDownloader
	from espnet2.bin.asr_inference import Speech2Text


	import soundfile
	import librosa.display
	import matplotlib.pyplot as plt
	import gradio as gr

	# Static configuration for the demo.
	lang = 'multilingual'
	fs = 16000  # sampling rate that inference() requires of the uploaded audio
	tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'

	# Decoding options forwarded verbatim to Speech2Text as keyword arguments.
	_decode_options = dict(
	    device="cpu",
	    minlenratio=0.0,
	    maxlenratio=0.0,
	    ctc_weight=0.3,
	    beam_size=10,
	    batch_size=0,
	    nbest=1,
	)

	# download_and_unpack(tag) returns a mapping that supplies the remaining
	# (model-specific) keyword arguments for the recognizer.
	d = ModelDownloader()
	speech2text = Speech2Text(**d.download_and_unpack(tag), **_decode_options)

	def text_normalizer(text):
	    """Return *text* upper-cased with every ASCII punctuation character removed."""
	    # Mapping each punctuation character to None makes translate() delete it.
	    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
	    return text.upper().translate(strip_punctuation)

	def inference(audio):
	    """Transcribe an uploaded audio file and return the normalized hypothesis.

	    Args:
	        audio: Object exposing a ``name`` attribute holding the path of the
	            uploaded file (what the ``type="file"`` Audio input provides).

	    Returns:
	        A string of the form ``"ASR hypothesis: <TEXT>"`` where ``<TEXT>`` is
	        the top hypothesis passed through ``text_normalizer``.

	    Raises:
	        ValueError: If the file's sampling rate differs from ``fs``.
	    """
	    path = audio.name
	    speech, rate = soundfile.read(path)
	    # Explicit raise instead of `assert` so the check survives `python -O`.
	    if rate != fs:
	        raise ValueError("mismatch in sampling rate")

	    # Only the first field of the top-ranked result (the text) is used.
	    nbests = speech2text(speech)
	    text, *_ = nbests[0]

	    # The original logged an undefined name (`file_name`), raising NameError
	    # on every request; log the path that was actually read.
	    print(f"Input Speech: {path}")
	    return f"ASR hypothesis: {text_normalizer(text)}"

	# Gradio widgets: the audio upload is handed to inference() as a file object
	# (inference reads its .name), and the result is shown in a text box.
	inputs = gr.inputs.Audio(label="Input Audio", type="file")
	outputs = gr.outputs.Textbox(label="Output Text")

	# Page metadata. The previous text described wav2vec 2.0 / fairseq (left over
	# from another demo) and advertised examples that are never passed to
	# gr.Interface; it now describes the ESPnet2 model this app actually loads.
	title = "ESPnet2 ASR"
	description = (
	    f"Gradio demo for ESPnet2 automatic speech recognition using the pretrained model {tag}. "
	    f"To use it, simply upload an audio file sampled at {fs} Hz. "
	    "Read more at the links below."
	)
	# Plain "|" separators: the former "\|" was an invalid escape sequence and
	# rendered a literal backslash in the page.
	article = (
	    "<p style='text-align: center'>"
	    "<a href='https://arxiv.org/abs/1804.00015'>ESPnet: End-to-End Speech Processing Toolkit</a>"
	    " | <a href='https://github.com/espnet/espnet'>Github Repo</a>"
	    " | <a href='https://github.com/espnet/espnet_model_zoo'>ESPnet Model Zoo</a>"
	    "</p>"
	)


	gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()