"""Minimal multilingual ASR demo: ESPnet model inference on an uploaded audio file."""
import time
import string

import torch
import soundfile
import librosa.display
import matplotlib.pyplot as plt
import gradio as gr
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text

# Configuration. These MUST be defined before the model is built:
# the original script referenced `tag` before assigning it -> NameError.
lang = 'multilingual'
fs = 16000  # expected input sampling rate (Hz)
tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'

# Download the pretrained model (network I/O) and build the CPU decoder.
d = ModelDownloader()
speech2text = Speech2Text(
    **d.download_and_unpack(tag),
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1,
)


def text_normalizer(text):
    """Uppercase *text* and strip all ASCII punctuation."""
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))


def inference(audio):
    """Transcribe an uploaded audio file and return the normalized hypothesis.

    `audio` is expected to expose a `.name` attribute pointing at a readable
    audio file (the shape Gradio's file/audio components provide) —
    TODO confirm against the gr.Interface input type actually used.
    """
    speech, rate = soundfile.read(audio.name)
    assert rate == fs, "mismatch in sampling rate"
    nbests = speech2text(speech)
    text, *_ = nbests[0]
    # The original called undefined notebook helpers (`display`, `Audio`) and
    # an undefined `file_name`; those are removed — they only exist in IPython.
    # `waveplot` was removed in librosa >= 0.10; `waveshow` is its replacement.
    librosa.display.waveshow(speech, sr=rate)
    plt.show()
    hypothesis = text_normalizer(text)
    print(f"ASR hypothesis: {hypothesis}")
    print("*" * 50)
    # Return the transcript so a Gradio callback actually produces output
    # (the original returned None).
    return hypothesis

# NOTE(review): `gradio` is imported but no gr.Interface(...).launch() appears
# in this chunk — presumably the UI wiring lives elsewhere; verify.