'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace
2022-02-23 jkang first created
'''
import os
from difflib import Differ
from glob import glob

import gradio as gr
import librosa
import numpy as np
from loguru import logger

from espnet2.bin.asr_inference import Speech2Text

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
# Both models were trained on 16 kHz Librispeech audio.
TARGET_SR = 16000

examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))

# Kept for the (currently disabled) word-vs-BPE diff output.
d = Differ()

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
logger.info('download model')
logger.info('model downloaded')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
logger.info('model loaded')


def predict(wav_file):
    """Run both ASR models on one recording.

    Args:
        wav_file: Gradio ``type='numpy'`` audio input — a
            ``(sample_rate, np.ndarray)`` tuple. The array may be int PCM
            and/or stereo, at any sample rate.

    Returns:
        (word_decoded, bpe_decoded): best hypothesis text from the
        word-token model and the BPE-token model, respectively.
    """
    logger.info('wav file loaded')
    rate, speech = wav_file

    # Gradio microphone input is integer PCM; models expect float waveforms.
    if np.issubdtype(speech.dtype, np.integer):
        speech = speech.astype(np.float32) / np.iinfo(speech.dtype).max
    else:
        speech = speech.astype(np.float32)

    # Downmix stereo to mono before inference.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Fix: microphone audio is often 44.1/48 kHz — resample to the models'
    # training rate instead of feeding the raw rate straight through.
    if rate != TARGET_SR:
        speech = librosa.resample(speech, orig_sr=rate, target_sr=TARGET_SR)

    # Each model returns n-best results; [0][0] is the top hypothesis text.
    word_decoded = model_word(speech)[0][0]
    bpe_decoded = model_bpe(speech)[0][0]
    logger.info('predicted')
    return word_decoded, bpe_decoded


iface = gr.Interface(
    predict,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    inputs=[
        gr.inputs.Audio(label='wav file', source='microphone', type='numpy')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (word-token model)'),
        gr.outputs.Textbox(label='Decoding result (BPE-token model)'),
    ],
    examples=examples,
    examples_per_page=5,
    # NOTE(review): the original article HTML was garbled in extraction;
    # reconstructed as a simple link to the model hub pages — confirm URL.
    article=(
        '<p style="text-align: center;">'
        '<a href="https://huggingface.co/jkang" target="_blank">Model URL&#129303;</a>'
        '</p>'
    ),
)

if __name__ == '__main__':
    try:
        iface.launch(
            debug=True,
            enable_queue=True,
        )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        # Always release the server port, even after Ctrl-C.
        iface.close()