'''Librispeech 100h English ASR demo

@ML2 --> @HuggingFace

2022-02-23 jkang first created
2022-02-28 jkang char added (bpe, word, char now all included)
'''
import os
from difflib import Differ
from glob import glob

from loguru import logger
import librosa
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"

MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
# examples = [[example] for example in examples]
d = Differ()

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
logger.info('downloading models')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
model_char = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_char")
logger.info('models loaded')


def predict(wav_file):
    logger.info('wav file loaded')
    # Load audio at 16 kHz (the sampling rate the models expect)
    speech, rate = librosa.load(wav_file, sr=16000)
    logger.info(f'--- speech.shape: {speech.shape}')
    logger.info(f'--- speech.dtype: {speech.dtype}')
    logger.info(f'--- speech rate: {rate}')

    # Run inference; each model returns n-best hypotheses, keep the best one
    W = model_word(speech)[0]
    B = model_bpe(speech)[0]
    C = model_char(speech)[0]
    word_decoded = W[0]
    bpe_decoded = B[0]
    char_decoded = C[0]
    # comparison = ''.join(list(d.compare([word_decoded + '\n'], [bpe_decoded + '\n'])))
    logger.info(f'--- word-based model decoded: {word_decoded}')
    logger.info(f'--- BPE-based model decoded: {bpe_decoded}')
    logger.info(f'--- char-based model decoded: {char_decoded}')
    logger.info('finished')
    # return word_decoded, bpe_decoded, comparison
    return bpe_decoded, char_decoded, word_decoded


iface = gr.Interface(
    predict,
    title='Comparison of word vs. BPE vs. char token units in ESPnet2 ASR models',
    description='Three models were trained on LibriSpeech (clean-100h)',
    inputs=[
        gr.inputs.Audio(label='wav file', source='upload', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (BPE-based model)'),
        gr.outputs.Textbox(label='Decoding result (char-based model)'),
        gr.outputs.Textbox(label='Decoding result (word-based model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    css='.examples-gallery {display: grid !important;}',
    # article='Model URL🤗',
)

if __name__ == '__main__':
    try:
        iface.launch(debug=True,
                     # server_name=SERVER_NAME,
                     # server_port=SERVER_PORT,
                     enable_queue=True,
                     )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        iface.close()
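
# Local sanity check (a minimal sketch, not part of the app itself): assumes this
# file is saved as app.py and that at least one example wav exists under ./examples.
#
#   from app import predict, examples
#   bpe_text, char_text, word_text = predict(examples[0])
#   print(bpe_text, char_text, word_text, sep='\n')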