# espnet2_asr / app.py
# Author: Ahsen Khaliq
# Commit d6a2d06: "Update app.py"
# (Hugging Face file view: 1.06 kB, scan result: No virus)
import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text
import soundfile
import librosa.display
import matplotlib.pyplot as plt
import gradio as gr
# Model identifier handed to ModelDownloader.download_and_unpack.
# It has to be bound *before* the Speech2Text construction below: the
# module-level assignment of ``tag`` further down the file runs too late and
# left this code failing with ``NameError: name 'tag' is not defined``.
tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'

# Fetch and unpack the model files for ``tag``; the returned mapping is
# splatted into Speech2Text as keyword arguments.
d = ModelDownloader()
speech2text = Speech2Text(
    **d.download_and_unpack(tag),
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1,
)
def text_normalizer(text):
    """Return *text* upper-cased with all ``string.punctuation`` characters removed.

    Only the characters listed in ``string.punctuation`` are dropped; every
    other character (including whitespace) is kept.
    """
    # Mapping a character to None tells str.translate to delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    shouted = text.upper()
    return shouted.translate(drop_punctuation)
# Sampling rate that inference() requires of the audio it reads.
fs = 16_000

# Language label; assigned here but not read by any code visible in this file.
lang = "multilingual"

# Model identifier passed to ModelDownloader.download_and_unpack.
tag = "ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best"
def inference(audio):
    """Transcribe one audio file and return the normalized hypothesis.

    Args:
        audio: Either an object that exposes the file path as ``.name`` or
            the path itself.

    Returns:
        The first hypothesis produced by ``speech2text``, passed through
        ``text_normalizer``.

    Raises:
        AssertionError: If the file's sampling rate differs from ``fs``.
    """
    # Accept a plain path as well as an object carrying the path in ``.name``.
    audio_path = getattr(audio, "name", audio)
    speech, rate = soundfile.read(audio_path)

    # Explicit raise instead of ``assert`` so the check is not stripped under
    # ``python -O``; the exception type and message are unchanged.
    if rate != fs:
        raise AssertionError("mismatch in sampling rate")

    # soundfile returns a (frames, channels) array for multi-channel files.
    # Average the channels so the recognizer is given a single waveform.
    # NOTE(review): assumes speech2text expects 1-D input — confirm.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    nbests = speech2text(speech)
    text, *_ = nbests[0]
    hypothesis = text_normalizer(text)

    # The notebook-only calls (display/Audio, waveplot, plt.show) and the
    # undefined ``file_name`` were dropped: none of them resolve in a script.
    print(f"Input Speech: {audio_path}")
    print(f"ASR hypothesis: {hypothesis}")
    print("*" * 50)

    # Hand the transcript back to the caller instead of only printing it.
    return hypothesis