'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace
2022-02-23 jkang first created
2022-02-28 jkang char added (bpe, word, char now all included)
'''
import os
from difflib import Differ
from glob import glob
from loguru import logger
import librosa
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text
# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"
MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
# examples = [[example] for example in examples]
d = Differ()
# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')
# ---------- Model ----------
logger.info('loading pretrained models (downloaded on first run)')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
model_char = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_char")
logger.info('models loaded')
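# Optional sketch (assumption about the installed ESPnet2 version): the models
# above decode on CPU by default. If Speech2Text.from_pretrained forwards extra
# keyword arguments to the Speech2Text constructor (which accepts `device`),
# GPU decoding could be enabled when GPU_ID != '-1' by passing device=DEVICE,
# e.g.:
#
#   model_bpe = Speech2Text.from_pretrained(
#       "jkang/espnet2_librispeech_100_conformer", device=DEVICE)
#
# Verify against the installed espnet2 API before enabling this.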
def predict(wav_file):
    logger.info('wav file received')
    # Load audio at 16 kHz (the sampling rate the models were trained on)
    speech, rate = librosa.load(wav_file, sr=16000)
    logger.info(f'--- speech.shape: {speech.shape}')
    logger.info(f'--- speech.dtype: {speech.dtype}')
    logger.info(f'--- speech rate: {rate}')
    # Run inference; each model returns an n-best list whose first entry is a
    # (text, tokens, token_ids, hypothesis) tuple
    W = model_word(speech)[0]
    B = model_bpe(speech)[0]
    C = model_char(speech)[0]
    word_decoded = W[0]
    bpe_decoded = B[0]
    char_decoded = C[0]
    # comparison = ''.join(list(d.compare([word_decoded+'\n'], [bpe_decoded+'\n'])))
    logger.info(f'--- word-based model decoded: {word_decoded}')
    logger.info(f'--- BPE-based model decoded: {bpe_decoded}')
    logger.info(f'--- char-based model decoded: {char_decoded}')
    logger.info('finished')
    # return word_decoded, bpe_decoded, comparison
    return bpe_decoded, char_decoded, word_decoded
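# Quick local sanity check (sketch; assumes at least one .wav file exists in
# ./examples). Run manually, e.g. from a REPL, to confirm all three models
# decode without going through the Gradio UI:
#
#   if examples:
#       bpe_out, char_out, word_out = predict(examples[0])
#       logger.info(f'BPE: {bpe_out} | char: {char_out} | word: {word_out}')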
iface = gr.Interface(
    predict,
    title='Comparison of word vs BPE vs char tokens in ESPnet2 ASR models',
    description='Three models (word / BPE / char) were trained on LibriSpeech train-clean-100',
    inputs=[
        gr.inputs.Audio(label='wav file', source='upload', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (BPE-based model)'),
        gr.outputs.Textbox(label='Decoding result (char-based model)'),
        gr.outputs.Textbox(label='Decoding result (word-based model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    css='.examples-gallery {display: grid !important;}',
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)
if __name__ == '__main__':
    try:
        iface.launch(debug=True,
                     # server_name=SERVER_NAME,
                     # server_port=SERVER_PORT,
                     enable_queue=True,
                     )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        iface.close()