# Scraped Hugging Face Spaces page header (status banner), not source code:
# Spaces: Runtime error / Runtime error
'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace
2022-02-23 jkang first created
'''
import os
from difflib import Differ
from glob import glob

from loguru import logger
# import librosa
import wavio
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text
# ---------- Settings ----------
GPU_ID = '-1'  # '-1' disables CUDA; set to a GPU index string (e.g. '0') to enable
# Must be exported before any CUDA-aware library touches the device
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"
MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
# Gradio wants each example row as a list of input values: [[wav_path], ...]
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
examples = [[example] for example in examples]
d = Differ()  # kept for the (currently commented-out) hypothesis diff output
# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')
# ---------- Model ----------
logger.info('download model')
logger.info('model downloaded')
# Two Conformer ASR models from the Hugging Face hub, both trained on
# Librispeech clean-100h: one word-token, one BPE-token (downloaded on first run)
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
logger.info('model loaded')
def predict(wav_file):
    """Decode one wav file with both ASR models.

    Args:
        wav_file: path to a wav file (gradio Audio input with type='filepath').

    Returns:
        Tuple ``(word_decoded, bpe_decoded)`` — best-hypothesis transcripts
        from the word-token and BPE-token models respectively.
    """
    logger.info('wav file loaded')
    wav = wavio.read(wav_file)
    if wav.rate != 16000:
        # Models were trained on 16 kHz audio; no resampling is performed here,
        # so off-rate input will degrade the transcription.
        logger.warning('sample rate is {} Hz, expected 16000 Hz'.format(wav.rate))
    # wavio returns integer PCM with shape (n_samples, n_channels); ESPnet
    # expects a 1-D float waveform in [-1, 1] (as the replaced librosa.load
    # call used to provide), so take one channel and normalize by sample width.
    speech = wav.data.astype('float32')
    if speech.ndim > 1:
        speech = speech[:, 0]  # keep the first channel only
    speech = speech / float(2 ** (8 * wav.sampwidth - 1))
    # [0] selects the best hypothesis; its first field is the decoded text
    word_decoded = model_word(speech)[0][0]
    bpe_decoded = model_bpe(speech)[0][0]
    logger.info('predicted')
    return word_decoded, bpe_decoded
# ---------- UI ----------
# One microphone/audio input; one textbox per model's transcript.
audio_input = gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
word_output = gr.outputs.Textbox(label='Decoding result (word-token model)')
bpe_output = gr.outputs.Textbox(label='Decoding result (BPE-token model)')
iface = gr.Interface(
    predict,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    inputs=[audio_input],
    outputs=[word_output, bpe_output],
    examples=examples,
    examples_per_page=5,
)
if __name__ == '__main__':
    # launch() blocks until the server stops; always release the port on exit.
    try:
        iface.launch(debug=True, enable_queue=True)
    except KeyboardInterrupt as err:
        print(err)
    finally:
        iface.close()