'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace
2022-02-23 jkang first created
2022-02-28 jkang char added (bpe, word, char now all included)
'''
import os
from difflib import Differ
from glob import glob
from loguru import logger
import librosa
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text
# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"
MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
# examples = [[example] for example in examples]
d = Differ()
# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')
# ---------- Model ----------
logger.info('loading pretrained models (downloaded on first run)')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
model_char = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_char")
logger.info('models loaded')
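# Optional sketch (assumption about the installed ESPnet2 version): the models
# above decode on CPU by default. If Speech2Text.from_pretrained forwards extra
# keyword arguments to the Speech2Text constructor (which accepts `device`),
# GPU decoding could be enabled when GPU_ID != '-1' by passing device=DEVICE,
# e.g.:
#
#   model_bpe = Speech2Text.from_pretrained(
#       "jkang/espnet2_librispeech_100_conformer", device=DEVICE)
#
# Verify against the installed espnet2 API before enabling this.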
def predict(wav_file):
    logger.info('wav file received')
    # Load audio at 16 kHz (the sampling rate the models were trained on)
    speech, rate = librosa.load(wav_file, sr=16000)
    logger.info(f'--- speech.shape: {speech.shape}')
    logger.info(f'--- speech.dtype: {speech.dtype}')
    logger.info(f'--- speech rate: {rate}')
    # Run inference; each model returns an n-best list whose first entry is a
    # (text, tokens, token_ids, hypothesis) tuple
    W = model_word(speech)[0]
    B = model_bpe(speech)[0]
    C = model_char(speech)[0]
    word_decoded = W[0]
    bpe_decoded = B[0]
    char_decoded = C[0]
    # comparison = ''.join(list(d.compare([word_decoded+'\n'], [bpe_decoded+'\n'])))
    logger.info(f'--- word-based model decoded: {word_decoded}')
    logger.info(f'--- BPE-based model decoded: {bpe_decoded}')
    logger.info(f'--- char-based model decoded: {char_decoded}')
    logger.info('finished')
    # return word_decoded, bpe_decoded, comparison
    return bpe_decoded, char_decoded, word_decoded
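# Quick local sanity check (sketch; assumes at least one .wav file exists in
# ./examples). Run manually, e.g. from a REPL, to confirm all three models
# decode without going through the Gradio UI:
#
#   if examples:
#       bpe_out, char_out, word_out = predict(examples[0])
#       logger.info(f'BPE: {bpe_out} | char: {char_out} | word: {word_out}')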
iface = gr.Interface(
    predict,
    title='Comparison of word vs BPE vs char tokens in ESPnet2 ASR models',
    description='Three models (word / BPE / char) were trained on LibriSpeech train-clean-100',
    inputs=[
        gr.inputs.Audio(label='wav file', source='upload', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (BPE-based model)'),
        gr.outputs.Textbox(label='Decoding result (char-based model)'),
        gr.outputs.Textbox(label='Decoding result (word-based model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    css='.examples-gallery {display: grid !important;}',
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)
if __name__ == '__main__':
    try:
        iface.launch(debug=True,
                     # server_name=SERVER_NAME,
                     # server_port=SERVER_PORT,
                     enable_queue=True,
                     )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        iface.close()